# Preprocessing

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem

In [2]:
df1 = pd.read_csv('../data/HDAC6_clean.csv', delimiter = '	')[['Ligand SMILES', 'pIC50']]
df1

Unnamed: 0,Ligand SMILES,pIC50
0,Cc1cccc(NC(=O)[C@H](CCCCCS)NC(=O)[C@@H]2CCCC(=...,-0.00000
1,ONC(=O)c1cc(F)c2CN(CCc2c1)C1CCC1,-0.00000
2,CN1CC2(CCCCC2)CC[C@H]1Cc1cc(F)cc(c1)C(=O)NO,-0.00000
3,SCCCCC[C@H](NC(=O)[C@@H]1CCCNC1=O)C(=O)Nc1ccccc1,-0.00000
4,Cn1c2CN(Cc3ccc(cc3F)C(=O)NO)CCc2c2ccccc12,-0.00000
...,...,...
5360,CC1(C)CCCCN1C(=O)[C@@H]1CCC(=O)N1c1ccc(-c2noc(...,-1.80618
5361,FC(F)(F)c1nc(no1)-c1ccc(CNC2CCCCNC2=O)nc1,-1.80618
5362,COc1cccc(NCc2ccc(cc2)-c2noc(n2)C(F)(F)F)c1,-1.80618
5363,CN1CCN(CC(=O)N(Cc2ccc(cc2)-c2noc(n2)C(F)(F)F)c...,-1.80618


In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5365 entries, 0 to 5364
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Ligand SMILES  5365 non-null   object 
 1   pIC50          5365 non-null   float64
dtypes: float64(1), object(1)
memory usage: 84.0+ KB


In [4]:
with pd.option_context(
        'display.max_columns', None,
        'display.max_rows', None,
        'display.width', 1000
    ):
    print(df1.describe(include='all'))

                                            Ligand SMILES        pIC50
count                                                5365  5365.000000
unique                                               5365          NaN
top     Cc1cccc(NC(=O)[C@H](CCCCCS)NC(=O)[C@@H]2CCCC(=...          NaN
freq                                                    1          NaN
mean                                                  NaN    -2.006313
std                                                   NaN     1.036727
min                                                   NaN    -5.999696
25%                                                   NaN    -2.611723
50%                                                   NaN    -1.838849
75%                                                   NaN    -1.322219
max                                                   NaN     2.698970


In [5]:
df2 = pd.read_csv('../data/HDAC6_clean_full.csv', delimiter = ',')
df2

Unnamed: 0,Ligand SMILES,pIC50
0,Nc1ccccc1NC(=O)c1ccc(CNC(=O)c2[nH]c(cc2-c2cccc...,5.301030
1,Nc1ccccc1NC(=O)\C=C\c1ccc(cc1)-c1ccc2ncnc(Nc3c...,4.494850
2,ONC(=O)c1ccc(NCc2nc(nn2-c2ccccc2)-c2ccccc2Cl)cc1,6.732828
3,COc1cc2COc3ccccc3NC(=O)[C@@H](CCCCCC(=O)NO)OC\...,8.000000
4,FC(F)(F)c1nc(no1)-c1ccc(cc1)N1[C@@H](CCC1=O)c1...,8.060481
...,...,...
7121,FC(F)c1nnc(o1)-c1cnc(CNc2cccnc2)s1,5.823909
7122,Nc1nc(-c2ccco2)c2cnn(CCc3ccc(cc3)C(=O)Nc3ccccc...,5.000000
7123,CCc1nnc(o1)[C@@H]1CCC(=O)N1c1cc(F)c(cc1F)-c1no...,8.000000
7124,CC(c1ccc(cc1)C(=O)Nc1ccc(F)cc1N)n1nc(ccc1=O)-c...,5.000000


In [6]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7126 entries, 0 to 7125
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Ligand SMILES  7126 non-null   object 
 1   pIC50          7126 non-null   float64
dtypes: float64(1), object(1)
memory usage: 111.5+ KB


In [7]:
with pd.option_context(
        'display.max_columns', None,
        'display.max_rows', None,
        'display.width', 1000
    ):
    print(df2.describe(include='all'))

                                            Ligand SMILES        pIC50
count                                                7126  7126.000000
unique                                               7126          NaN
top     Nc1ccccc1NC(=O)c1ccc(CNC(=O)c2[nH]c(cc2-c2cccc...          NaN
freq                                                    1          NaN
mean                                                  NaN     6.792664
std                                                   NaN     1.229701
min                                                   NaN     2.000000
25%                                                   NaN     6.004365
50%                                                   NaN     7.065502
75%                                                   NaN     7.662050
max                                                   NaN    11.698970


In [8]:
# Очистка и канонизация SMILES
merged = pd.concat([df1, df2], ignore_index=True)
merged["Ligand SMILES"] = merged["Ligand SMILES"].convert_dtypes().apply(
    lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x), canonical=True)
)
cleaned = merged.drop_duplicates(subset="Ligand SMILES", ignore_index=True)
cleaned

Unnamed: 0,Ligand SMILES,pIC50
0,Cc1cccc(NC(=O)[C@H](CCCCCS)NC(=O)[C@@H]2CCCC(=...,-0.000000
1,O=C(NO)c1cc(F)c2c(c1)CCN(C1CCC1)C2,-0.000000
2,CN1CC2(CCCCC2)CC[C@H]1Cc1cc(F)cc(C(=O)NO)c1,-0.000000
3,O=C1NCCC[C@H]1C(=O)N[C@@H](CCCCCS)C(=O)Nc1ccccc1,-0.000000
4,Cn1c2c(c3ccccc31)CCN(Cc1ccc(C(=O)NO)cc1F)C2,-0.000000
...,...,...
7253,O=C1OC[C@H](c2ccccc2)N1c1ncc(-c2nnc(C(F)F)o2)s1,5.047208
7254,FC(F)c1nnc(-c2cnc(CNc3cccnc3)s2)o1,5.823909
7255,Nc1nc(-c2ccco2)c2cnn(CCc3ccc(C(=O)Nc4ccccc4N)c...,5.000000
7256,CC(c1ccc(C(=O)Nc2ccc(F)cc2N)cc1)n1nc(-c2ccc(CN...,5.000000


In [None]:
# cleaned.to_csv('../data/HDAC6_cleaned.csv')

In [9]:
filtered = cleaned[cleaned["pIC50"] >= 8.7].sort_values(by="pIC50", ascending=False, ignore_index=True)
filtered

Unnamed: 0,Ligand SMILES,pIC50
0,CC(C)(C)OC(=O)Nc1ccc(-c2cc(C(=O)NCCCCCCC(=O)NO...,11.69897
1,CC(C)(C)OC(=O)Nc1cccc(-c2csc(NC(=O)CCCCCC(=O)N...,9.769551
2,O=C(NO)c1cnc(N[C@H](c2ccccc2)C2CC2)c(F)c1,9.560667
3,O=C(NO)c1ccc(CNc2cccc3cccnc23)cc1,9.537602
4,O=C(/C=C/c1cccc(S(=O)(=O)c2ccc3ccccc3n2)c1)NO,9.536107
5,O=C(NO)c1cnc(N[C@@H](c2ccccn2)C2CC2)c(F)c1,9.454693
6,O=C(CCCCCCC(=O)N/N=C/c1ccc(N(c2ccccc2)c2ccccc2...,9.39794
7,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,9.39794
8,O=C(NO)c1cnc(NC2(c3c(F)cccc3F)CC2)c(F)c1,9.371611
9,COc1cc2nc(Nc3ccc(NC(=O)CCCCCCC(=O)NO)cc3)nc(NC...,9.30103


In [10]:
# filtered.to_csv('../data/HDAC6_filtered.csv', sep=';')

In [11]:
filtered.iloc[0]['Ligand SMILES']

'CC(C)(C)OC(=O)Nc1ccc(-c2cc(C(=O)NCCCCCCC(=O)NO)no2)cc1'

In [12]:
df3 = pd.read_excel('../data/HDAC6 screening dataset using tau-based substrate in an enzymatic assay yields selective inhibitors and activators.xlsx')
df_ic50 = df3[df3['TYPE'] == 'IC50']
df_ic50

Unnamed: 0,DATA SET,INTERNAL NAME,TYPE,RELATION,VALUE,UNITS,NAME,SMILES
0,4,SPE_K19958299,IC50,=,0.001551,uM,TCS-2210,O=C(NO)C3=CC1=C(C=C3)NC(C(CCCC2=CC=CC=C2)=N1)=O
2532,4,SPE_K82928847,IC50,=,0.003710,uM,ACY-1215,ONC(=O)CCCCCCNC(=O)c1cnc(nc1)N(c1ccccc1)c1ccccc1
3323,4,SPE_K00627859,IC50,=,0.007441,uM,Tubastatin A hydrochloride,CN1CCc2c(C1)c1ccccc1n2Cc1ccc(cc1)C(=O)NO
3459,4,SPE_K22503835,IC50,=,0.008574,uM,scriptaid,O=C(CCCCCN1C(=O)c2cccc3cccc(c23)C1=O)NO
3680,4,SPE_K68202742,IC50,=,0.008734,uM,Trichostatin A,CN(C1=CC=C(C=C1)C([C@@H](/C=C(/C=C/C(NO)=O)C)C...
...,...,...,...,...,...,...,...,...
5715,4,SPE_K25311561,IC50,=,4.798000,uM,KU-55933,O=C1C=C(C3=CC=CC4=C3SC5=C(C=CC=C5)S4)OC(N2CCOC...
5716,4,SPE_K55127134,IC50,=,5.624000,uM,FLUPHENAZINE HYDROCHLORIDE,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(cc23)C(F)(F)F)CC1
5717,4,SPE_K94146583,IC50,=,6.159000,uM,Didox,O=C(NO)C1=CC=C(O)C(O)=C1
5718,4,SPE_K86687746,IC50,=,10.360000,uM,R306465,ONC(=O)c1cnc(nc1)N1CCN(CC1)S(=O)(=O)c1ccc2cccc...


In [13]:
def filter_by_pIC50(df, value_column="VALUE", threshold=8.7):
    """
    Фильтрует DataFrame по порогу pIC50.
    
    Аргументы:
        df — исходный pandas DataFrame
        value_column — имя колонки с IC50 в µM
        threshold — числовой порог (например, 8.7)

    Возвращает:
        DataFrame с pIC50 ≥ threshold
    """
    # Перевод из µM в M
    df["IC50_M"] = df[value_column] * 1e-6

    # Вычисление pIC50
    df["pIC50"] = -np.log10(df["IC50_M"])

    # Фильтрация по порогу
    filtered = df[df["pIC50"] >= threshold].copy()
    
    return filtered

df_filtered = filter_by_pIC50(df_ic50)
df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["IC50_M"] = df[value_column] * 1e-6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["pIC50"] = -np.log10(df["IC50_M"])


Unnamed: 0,DATA SET,INTERNAL NAME,TYPE,RELATION,VALUE,UNITS,NAME,SMILES,IC50_M,pIC50
0,4,SPE_K19958299,IC50,=,0.001551,uM,TCS-2210,O=C(NO)C3=CC1=C(C=C3)NC(C(CCCC2=CC=CC=C2)=N1)=O,1.551e-09,8.809388


In [14]:
# filtered["Ligand SMILES"].to_csv("../data/REINVENT4_mol2mol_hdac6_active.smi", index=False, header=False)

# REINVENT4 + AiZynthFinder + QSAR-pIC₅₀ в Reward

In [None]:
# Score
# │        ▄▄▄▄▄▄▄▄▄
# │      ▄         ▄
# │    ▄             ▄
# │  ▄                 ▄
# └───────────────────────────────▶ TPSA
#    20    50    70    90    140

$$\text{Reward} = \left( \prod_{i=1}^{N} S_i^{w_i} \right)^{\frac{1}{\sum_{i=1}^{N} w_i}}$$

где:  
- $S_i$ — скор компонента после трансформации  
- $w_i$ — вес компонента  
- $N$ — число активных компонентов (вес $> 0$ )  

**OR**

$$\text{Reward} = \left(\prod_{i=1}^{N} \bigl(T_{i}(x_{i})\bigr)^{w_{i}}\right)^{\frac{1}{\sum_{i=1}^{N}w_{i}}} \ =\  \exp\ \Biggl(\frac{\sum_{i=1}^{N}w_{i}\,\ln\bigl(T_{i}(x_{i})\bigr)}{\sum_{i=1}^{N}w_{i}}\Biggr)$$

где:  
- $x_{i}$ — сырое значение $i$-го дескриптора,  
- $T_{i}(x_{i})$ — трансформация ($\text{sigmoid}$, $\text{step}$, $\text{value mapping}$ и т. д.) нормирующая $x_{i}$ в $(0,1]$,  
- $w_i$ — вес компонента,  
- $N$ — число активных компонентов (вес $> 0$ )  

## retrosynth

In [None]:
from aizynthfinder.aizynthfinder import AiZynthFinder

# загрузка сгенерированных SMILES
df = pd.read_csv("../modules/REINVENT4/sampled_smiles.csv")
top_smiles = df.sort_values("score", ascending=False).smiles.head(500)

# настройка планировщика
finder = AiZynthFinder(configfile="config.yml")
finder.load_tree_search_model()
finder.load_stock("zinc_stock")

# собираем статистику
results = []
for smi in top_smiles:
    finder.target_smiles = smi
    finder.run()
    routes = finder.routes
    min_steps = min([len(r.reactants) for r in routes]) if routes else None
    results.append({"smiles": smi, "min_steps": min_steps})

pd.DataFrame(results).to_csv("../data/retrosynth_summary.csv", index=False)

# pipeline «HDAC6 + синтезируемость + конформации в кармане»

1. Генерация декого набора SMILES под HDAC6 (REINVENT4 или TransferLearning).

2. Для каждого SMILES:

	a) EquiBind → сразу 3D-поза в HDAC6-крышке.  
	b) (опционально) RDKit ETKDG × Smina + MMFF94 локальная оптимизация

3. Расчёт pIC₅₀-reward (PredictiveModelComponent) + SAscore.

4. RL-цикл или отбор по combined_score.

5. Дополнительная верификация синтеза – AiZynthFinder на top-100 связок.

6. Парето-фронт: активность VS число шагов синтеза (или SAS).

In [None]:
# check conflicts
import torch, torchvision
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda)
print("Torchvision:", torchvision.__version__)
print("NMS check:", type(torchvision.ops.nms))


In [None]:
# reinvent -l sampling.log sampling.toml
# reinvent -l scoring.log scoring.toml