In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem

In [2]:
logP_data = pd.read_csv('../data/3_final_data/logp.csv')
pharma_data = pd.read_csv('../data/1_filtering/new/pharma.csv', index_col=0)
zinc_data = pd.read_csv('../data/3_final_data/split_data/zinc_dataset_test.csv', index_col=0)

# selecting 10 not measured pharma useful molecules 

In [3]:
pharma_data = pharma_data[(~pharma_data.smiles.isin(logP_data.smiles)) & (pharma_data.smiles.isin(zinc_data.smiles))]

In [4]:
pharma_data

Unnamed: 0,smiles
33910,CCCCC(=O)N[C@@H](C(=O)O)C(C)(C)SN=O
57917,CCCCCCCCCCCC(=O)N[C@@H](CO)C(=O)O
60018,CCCCC(=O)N[C@H](C(=O)O)C(C)(C)SN=O
66224,CN(C)[Si](N(C)C)N(C)C
69114,COC(=O)/C=C/C(F)(F)F
69226,CC/C=C/CO
69474,CCCCCCC/C=C\C(=O)O
70428,CCCCCCCC/C=C\C/C=C\C/C=C\CCCC(=O)O
71106,CCCCC/C=C\C/C=C\CCCCCCCC(=O)NCCO
71276,CCCCCCCCCCC/C=C/C(=O)O


In [5]:
pharma_data = pharma_data.reset_index().drop(columns=['index'])

In [6]:
pharma_data.shape

(38, 1)

In [7]:
np.random.seed = 10
indices = pd.Series(np.random.default_rng().choice(38, size=10, replace=False))

In [8]:
indices

0    20
1     3
2    13
3    34
4    14
5     0
6    19
7    12
8     6
9     7
dtype: int64

In [9]:
pharma_data = pharma_data.loc[indices]

In [10]:
pharma_data

Unnamed: 0,smiles
20,CCCCC(=O)OCC(COC(=O)CCCC)OC(=O)CCCC
3,CN(C)[Si](N(C)C)N(C)C
13,CCCCCCCCCCCC(=O)O[C@H](CC(=O)O)C[N+](C)(C)C
34,CN(C)CCCSC(=N)N
14,CCCCS(=O)(=O)N1CCCCCC1
0,CCCCC(=O)N[C@@H](C(=O)O)C(C)(C)SN=O
19,CCCCNC(=O)NCCCC
12,CCCCCCCCCCCC#CCCCCC(=O)O
6,CCCCCCC/C=C\C(=O)O
7,CCCCCCCC/C=C\C/C=C\C/C=C\CCCC(=O)O


# selecting 10 not measured similar molecules 

In [11]:
logp_mol = [Chem.MolFromSmiles(smi) for smi in logP_data.smiles]

In [12]:
zinc_mol = [Chem.MolFromSmiles(smi) for smi in zinc_data.smiles]

In [13]:
logp_length = pd.Series(mol.GetNumAtoms() for mol in logp_mol)

In [14]:
logp_length.describe()

count    14111.000000
mean        17.574587
std          7.765619
min          5.000000
25%         12.000000
50%         16.000000
75%         21.000000
max         85.000000
dtype: float64

In [15]:
zinc_is_similar = pd.Series(17.574587 - 7.765619 <= e.GetNumAtoms() and e.GetNumAtoms() <= 17.574587 + 7.765619 for e in zinc_mol)

In [16]:
zinc_similar_data = zinc_data.loc[zinc_is_similar]

In [17]:
zinc_similar_data.shape

(136439, 1)

In [18]:
indices = pd.Series(np.random.randint(136439, size=10))

In [19]:
zinc_similar_data = zinc_similar_data.reset_index().drop(columns=['index'])

In [20]:
zinc_similar_data = zinc_similar_data.loc[indices]

In [21]:
zinc_similar_data

Unnamed: 0,smiles
55717,COc1cccc2c1ccn2CC(=O)Nc1n[nH]c2cccc(F)c12
30502,O=C1C[C@]2(CCN=N2)C(=O)N1c1ccc(Cl)cc1
29230,CS(=O)(=O)N(CC(=O)N1CCCCC1)c1ccccc1Cl
80077,O=C1Nc2ccccc2Oc2ccc(/N=C/c3ccccc3O)cc21
7729,C[C@@H]1C[C@H](Nc2cccc(CN3CCC[C@H]3C(N)=O)c2)[...
64920,CCOC(=O)c1ccccc1NC(=O)c1cncc(Br)c1
2718,N=C(N)c1cc(NC2CC2)ccn1
94858,CCCCOc1cc(F)cc(C(=O)O)c1
131079,OCCN1CCN(C2CCN(Cc3ccccn3)CC2)CC1
87437,O=S(=O)(NCc1ccccc1)Nc1ccccc1-n1cccc1


# selecting 10 not measured not similar molecules

In [22]:
zinc_is_not_similar = pd.Series(28 <= e.GetNumAtoms() for e in zinc_mol)

In [23]:
zinc_not_similar_data = zinc_data.loc[zinc_is_not_similar]

In [24]:
zinc_not_similar_data.shape

(123458, 1)

In [25]:
indices = pd.Series(np.random.randint(123458, size=10))

In [26]:
zinc_not_similar_data = zinc_not_similar_data.reset_index().drop(columns=['index'])

In [27]:
zinc_not_similar_data = zinc_not_similar_data.loc[indices]

In [28]:
zinc_not_similar_data

Unnamed: 0,smiles
11679,CCOc1cc(/C=C\C(=O)OCC(=O)N(CC)[C@H]2CCS(=O)(=O...
21833,Cc1cc2c(c(O)c1-c1c(C)cc3c(c1O)/C(=C/Nc1ccccc1C...
30468,O=C(CSc1nnc(-c2ccc(Cl)cc2)o1)N(N=Cc1ccccc1)c1c...
61729,CCc1ccccc1NC(=O)c1sc2nc3c(c(-c4ccco4)c2c1N)CCCC3
57198,CC1CCN(c2ccc(NC(=O)C3CCN(c4ncccn4)CC3)cc2C(N)=...
61136,Cc1ccc(COc2ccc(/C=N/N3C(=O)[C@H]4[C@H]5C=C[C@@...
16824,CCOC(=O)c1c(CN(CC)CC)n(C)c2c1cc(OC(C)=O)c1c2c(...
10811,Cc1cc(C)c2[nH]c(=O)c(CN(C[C@@H]3CCCO3)C(=S)N[C...
8603,CCOc1ccc(N(CC(=O)Nc2c(C)cc(C)cc2C)S(=O)(=O)c2c...
14866,CCOc1c(Br)cc(C=Nn2c([C@@H](C)CC)nc3ccc(Br)cc3c...


# concatenating and dropping

In [29]:
measure_molecules = [pharma_data, zinc_similar_data, zinc_not_similar_data]

In [30]:
measure_molecules = pd.concat(measure_molecules)

In [31]:
measure_molecules

Unnamed: 0,smiles
20,CCCCC(=O)OCC(COC(=O)CCCC)OC(=O)CCCC
3,CN(C)[Si](N(C)C)N(C)C
13,CCCCCCCCCCCC(=O)O[C@H](CC(=O)O)C[N+](C)(C)C
34,CN(C)CCCSC(=N)N
14,CCCCS(=O)(=O)N1CCCCCC1
0,CCCCC(=O)N[C@@H](C(=O)O)C(C)(C)SN=O
19,CCCCNC(=O)NCCCC
12,CCCCCCCCCCCC#CCCCCC(=O)O
6,CCCCCCC/C=C\C(=O)O
7,CCCCCCCC/C=C\C/C=C\C/C=C\CCCC(=O)O


In [32]:
measure_molecules = measure_molecules.reset_index().drop(columns=['index'])

In [33]:
measure_molecules.to_csv('../data/3_final_data/biocad_measurements.csv')

In [34]:
zinc_data = zinc_data[~zinc_data.smiles.isin(measure_molecules.smiles)]

In [35]:
zinc_data.shape

(299950, 1)

In [36]:
zinc_data.to_csv('../data/3_final_data/split_data/zinc_dataset_test.csv')