## Import Dependencies

### General
* Pandas
* Numpy
* Seaborn

### Datasets
* TDC Tox

### RDKit Modules
* AllChem
* rdMolDescriptors
* IPythonConsole
* Draw
* DataStructs
* Butina

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
#---------------------- Therapeutic Drug Commons (TDC data) from https://tdcommons.ai/single_pred_tasks/tox/#dili-drug-induced-liver-injury
from tdc.single_pred import Tox
#---------------------- RDKit packages
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
#-
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import time
from sklearn.model_selection import ShuffleSplit, cross_validate,train_test_split


## Data cleaning

### Reading, converting to pandas

Read TDC Tox DILI Dataset & convert to Pandas dataframe.\
Rename columns to be more human-readable.

In [2]:
# Make this a function that works for multiple datasets, concatenates them

tox_data = Tox(name = 'DILI')

tox_df = tox_data.get_data()

tox_df.columns = ["X", "SMILES", "DILI?"]

Found local copy...
Loading...
Done!


### Append skeleton structures

Generate RDKit molecular structure and append skeleton diagram as a column to the dataset.

In [3]:

# Get RDKit molecular structure
from rdkit.Chem import PandasTools
PandasTools.AddMoleculeColumnToFrame(tox_df, 'SMILES', 'Structure')

# Display RDKit molecule skeleton - Use Ipython to fix broken pandas functionality (doesn't display skeleton in HTML table)
from IPython.display import HTML
print(tox_df.head)

<bound method NDFrame.head of               X                                             SMILES  DILI?  \
0         187.0                               CC(=O)OCC[N+](C)(C)C    0.0   
1         247.0                              C[N+](C)(C)CC(=O)[O-]    0.0   
2         298.0       O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl    0.0   
3         338.0                                    O=C(O)c1ccccc1O    0.0   
4         444.0                     CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1    0.0   
..          ...                                                ...    ...   
470  23663956.0           CCCC(CCC)C(=O)O.CCCC(CCC)C(=O)[O-].[Na+]    1.0   
471  23673837.0  CCCCC(CC)COC(=O)CC(C(=O)OCC(CC)CCCC)S(=O)(=O)[...    0.0   
472  54675785.0  C=C1c2cccc(O)c2C(O)=C2C(=O)C3(O)C(O)=C(C(N)=O)...    1.0   
473  54678501.0                             O=C1OC(C(O)CO)C(O)=C1O    0.0   
474  54680690.0  CN(C)C1C(=O)C(C(N)=O)=C(O)C2(O)C(=O)C3=C(O)c4c...    1.0   

                                             

# Get Fingerprints

Define function 'generate_fingerprints'
Initialise empty list of Morgan fingerprints
for molecules in a given dataframe, generate their morgan fingerprints and append them to the dataframe
Reutrn appended dataframe as numpy array to analyse using 'shape'

Run generate_fingerprints on each molecule in the dataframe

Use shape to confirm success - First number should equal dataframe length


In [4]:
from rdkit import DataStructs

def computeMorganFP(mol, depth = 2, nBits = 2048):
    a = np.zeros(nBits, int)
    DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(mol,depth,nBits),a)
    return a


tox_df["m3fp"] = tox_df["Structure"].map(computeMorganFP)

In [5]:
morgan_df = tox_df["m3fp"].apply(pd.Series)

morgan_df.insert(2048, "DILI?", tox_df["DILI?"].astype(int))


print(morgan_df.describe)

<bound method NDFrame.describe of      0  1  2  3  4  5  6  7  8  9  ...  2039  2040  2041  2042  2043  2044  \
0    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2    0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4    0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
..  .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
470  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
471  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
472  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
473  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
474  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

     2045  2046  2047  DILI? 

In [6]:
# Instantiate a model

model_rf = RandomForestRegressor(random_state=20)

In [23]:
cross_validation = ShuffleSplit(n_splits=5, test_size=0.3, random_state=20) #5-fold precedended in AbdulHameed, Random state is like a seed?

scoring = ['r2', 'neg_root_mean_squared_error']

morgan_df.columns = morgan_df.columns.astype(str)

scores = cross_validate(model_rf, morgan_df.iloc[:, 0:len(morgan_df.columns)-1], morgan_df["DILI?"], scoring=scoring, cv=cross_validation)

print(morgan_df.describe)

#print(scores)

scores["test_r2"]
-scores["test_neg_root_mean_squared_error"]

print("R^2 :", round(np.mean(scores['test_r2']), 2))
print("MAE :", round(np.mean(-scores["test_neg_root_mean_squared_error"]), 2))

<bound method NDFrame.describe of      0  1  2  3  4  5  6  7  8  9  ...  2039  2040  2041  2042  2043  2044  \
0    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2    0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3    0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4    0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
..  .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
470  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
471  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
472  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
473  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
474  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

     2045  2046  2047  DILI? 

## Thanks To

https://www.youtube.com/watch?v=-oHqQBUyrQ0