# Importing required packages

In [24]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
import numpy as np

# Loading the datasets

In [25]:
# Read the raw training and test tables from CSV files in the working
# directory, then show the training table's (rows, columns) shape as the
# cell output.
train = pd.read_csv(filepath_or_buffer="train_II.csv")
test = pd.read_csv(filepath_or_buffer="test_II.csv")
train.shape

(75383, 2)

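Splitting the compound identifier column (`Id` in the train set, `x` in the test set) on `;` into Id1 (the SMILES string before the `;`) and Id2 (the value after it)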

In [None]:
# Split each raw identifier on ";" and keep the first piece as Id1 and the
# second piece as Id2. The raw column is called "Id" in the training frame
# and "x" in the test frame.
for frame, raw_column in ((train, "Id"), (test, "x")):
    pieces = frame[raw_column].str.split(";")
    frame["Id1"] = pieces.str[0]
    frame["Id2"] = pieces.str[1]

Parsing the SMILES strings of the chemicals into RDKit molecule objects (stored in the `Smiles` column)

In [26]:
# Parse every SMILES string in Id1 into an RDKit Mol object. Despite its name,
# the "Smiles" column holds Mol objects, not text. MolFromSmiles returns None
# for a string it cannot parse (RDKit logs the reason), so the column may
# contain None entries after this cell.
# No empty-string placeholder column is created first: the assignment below
# creates the column directly, so a placeholder would be overwritten at once.
for frame in (train, test):
    frame["Smiles"] = [Chem.MolFromSmiles(smiles) for smiles in frame["Id1"]]

[01:08:33] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:08:35] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:08:37] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:08:37] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:08:39] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:08:40] Explicit valence for atom # 1 Si, 8, is greater than permitted


<class 'pandas.core.frame.DataFrame'>
Int64Index: 75377 entries, 0 to 75382
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Id        75377 non-null  object
 1   Expected  75377 non-null  int64 
 2   Id1       75377 non-null  object
 3   Id2       75377 non-null  object
 4   Smiles    75377 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.5+ MB


Dropping the rows with null values and resetting the index

In [None]:
# Drop every row that contains a missing value from both frames and renumber
# BOTH indexes from 0. The test frame needs the reset as much as the training
# frame: the descriptor tables built later carry a fresh 0..n-1 index, and
# copying columns across by index label would misalign rows if dropna() left
# gaps behind.
# reset_index(drop=True) discards the old index instead of materialising it
# as an "index" column that then has to be dropped again.
train = train.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)
train.info()

Function for computing the RDKit descriptor values and returning them together with the descriptor names

In [27]:
def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each molecule.

    Parameters
    ----------
    smiles : iterable
        The molecules to describe. Despite the parameter name, each item is
        handed directly to ``CalcDescriptors``, so callers are expected to
        pass already-parsed RDKit molecule objects rather than SMILES text.

    Returns
    -------
    Mol_descriptors : list
        One entry per input molecule, in input order; each entry is whatever
        ``CalcDescriptors`` returned for that molecule.
    desc_names
        The descriptor names reported by ``GetDescriptorNames()``, in the
        same order as the values inside each ``Mol_descriptors`` entry.
    """
    # The calculator depends only on the descriptor list, not on the molecule,
    # so build it once instead of once per molecule.
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [x[0] for x in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    Mol_descriptors = [calc.CalcDescriptors(mol) for mol in smiles]
    return Mol_descriptors, desc_names



Calling the descriptors function for train and test files

In [28]:
# Run the descriptor calculation over the parsed molecules of each split;
# every call yields the per-molecule descriptor values and the descriptor names.
desc_train, desc_names_train = RDkit_descriptors(smiles=train["Smiles"])
desc_test, desc_names_test = RDkit_descriptors(smiles=test["Smiles"])

Creating new dataframe for descriptor values and adding the smiles and id values to that dataframe

In [29]:
# Build one descriptor table per split and append the identifier columns.
# The columns are copied by position (.to_numpy()) rather than by index label:
# the descriptor tables always get a fresh 0..n-1 index, whereas the source
# frames can have gaps in their index after rows were dropped, and a
# label-aligned assignment would then shift values or fill them with NaN.
# A length mismatch now raises instead of silently misaligning.
test1 = pd.DataFrame(desc_test, columns=desc_names_test)
for column in ("Id2", "Id1", "Smiles", "x"):
    test1[column] = test[column].to_numpy()

train1 = pd.DataFrame(desc_train, columns=desc_names_train)
for column in ("Id2", "Id1", "Smiles", "Id", "Expected"):
    train1[column] = train[column].to_numpy()


Exporting the datasets for loading them into the main file

In [31]:
# Write both descriptor tables to CSV without the index column, then display
# the training table as the cell output.
# NOTE(review): the "Smiles" column holds RDKit Mol objects, so it is written
# as object reprs ("<rdkit.Chem.rdchem.Mol object at 0x...>") rather than
# SMILES text -- confirm that the consumer of these files ignores that column.
for table, csv_path in ((train1, "train_I.csv"), (test1, "test_I.csv")):
    table.to_csv(csv_path, index=False)
train1

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Id2,Id1,Smiles,Id,Expected
0,9.316200,-1.533785,9.316200,0.150485,0.794714,317.599,306.511,315.982463,100,0,...,0,0,0,0,0,1644,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,<rdkit.Chem.rdchem.Mol object at 0x7fe02dbc1eb0>,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2
1,10.532611,0.333788,10.532611,0.333788,0.516641,156.269,136.109,156.151415,66,0,...,0,0,0,4,0,2451,CCCCCCCCC(=O)C,<rdkit.Chem.rdchem.Mol object at 0x7fe02dbc1c10>,CCCCCCCCC(=O)C;2451,2
2,2.433032,0.000000,2.433032,0.000000,0.251327,362.086,313.702,361.347528,148,0,...,0,0,0,12,0,1384,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],<rdkit.Chem.rdchem.Mol object at 0x7fe02dbc1a50>,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1384,2
3,10.355080,-0.613825,10.355080,0.282361,0.487998,255.665,245.585,255.052302,90,0,...,0,0,0,0,0,16,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,<rdkit.Chem.rdchem.Mol object at 0x7fe02dbc1c80>,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;16,2
4,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,0,...,0,0,0,0,0,1856,[Na+].[I-],<rdkit.Chem.rdchem.Mol object at 0x7fe02dbc1d60>,[Na+].[I-];1856,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,11.460021,-3.868472,11.460021,0.053611,0.712426,230.245,220.165,230.036128,82,0,...,0,0,0,0,0,33,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N,<rdkit.Chem.rdchem.Mol object at 0x7fe09b2c1510>,COC(=O)NS(=O)(=O)C1=CC=C(C=C1)N;33,2
75373,5.928972,-2.841623,5.928972,0.082346,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,1632,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C,<rdkit.Chem.rdchem.Mol object at 0x7fe09b2c1580>,CCOP(=S)(OCC)OC1=NN(C(=N1)Cl)C(C)C;1632,1
75374,4.975926,0.848333,4.975926,0.848333,0.596343,167.258,162.218,166.986341,50,0,...,1,0,0,0,0,1373,C1=CC=C2C(=C1)NC(=S)S2,<rdkit.Chem.rdchem.Mol object at 0x7fe09b2c15f0>,C1=CC=C2C(=C1)NC(=S)S2;1373,1
75375,10.241948,0.324028,10.241948,0.324028,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,2,CCCCC(CC)C=O,<rdkit.Chem.rdchem.Mol object at 0x7fe09b2c1660>,CCCCC(CC)C=O;2,2
