In [25]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

# Importing rdkit to extract chemical features of the drugs
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski


In [26]:
sml = pd.read_csv('smiles.csv')
sml.head()

Unnamed: 0,drug_name,smiles
0,Bardoxolone methyl,CC1(CCC2(CCC3(C(C2C1)C(=O)C=C4C3(CCC5C4(C=C(C(...
1,Carfilzomib,CC(C)CC(C(=O)C1(CO1)C)NC(=O)C(CC2=CC=CC=C2)NC(...
2,LLL-12,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCC=CC...
3,NCGC00090797-11,CC1C(C(CC(O1)OC2C(OC(CC2O)OC3C(OC(CC3O)OC4CCC5...
4,Navitoclax,CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(...


In [27]:
NCAT=pd.read_csv('NCAT.csv')
NCAT.head()

Unnamed: 0,block_id,drug_row,drug_col,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,...,S_mean,S_max,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss,drug_row_clinical_phase,drug_col_clinical_phase,drug_row_target_name,drug_col_target_name
0,457862,NCGC00090797-11,Carfilzomib,PANC-1,NCATS_2D_3D,pancreas,uM,uM,0.5,0.003405,...,-0.0005,-0.001,-1.960323,-0.000388,-0.000895,-1.960796,0,4,\N,Proteasome subunit beta type-8; 26S proteasome...
1,457863,Bardoxolone methyl,Carfilzomib,PANC-1,NCATS_2D_3D,pancreas,uM,uM,20.0,0.003405,...,-0.0005,-0.001,-1.960323,-0.000327,-0.000895,-1.960796,3,4,Nuclear factor erythroid 2-related factor 2; P...,Proteasome subunit beta type-8; 26S proteasome...
2,457864,LLL-12,Carfilzomib,PANC-1,NCATS_2D_3D,pancreas,uM,uM,50.0,0.003405,...,-0.0005,-0.001,-1.960323,-0.000314,-0.000895,-1.960796,0,4,\N,Proteasome subunit beta type-8; 26S proteasome...
3,457865,Navitoclax,Carfilzomib,PANC-1,NCATS_2D_3D,pancreas,uM,uM,20.0,0.003405,...,-0.0005,-0.001,-1.960323,-0.000327,-0.000895,-1.960796,2,4,Apoptosis regulator Bcl-X; Bcl-xL/Bcl-2-like p...,Proteasome subunit beta type-8; 26S proteasome...
4,457866,Bardoxolone methyl,NCGC00090797-11,PANC-1,NCATS_2D_3D,pancreas,uM,uM,20.0,0.003405,...,-0.0005,-0.001,-1.960323,-0.000327,-0.000895,-1.960796,3,0,Nuclear factor erythroid 2-related factor 2; P...,\N


In [28]:
NCAT.columns

Index(['block_id', 'drug_row', 'drug_col', 'cell_line_name', 'study_name',
       'tissue_name', 'conc_row_unit', 'conc_col_unit', 'ic50_row', 'ic50_col',
       'ri_row', 'ri_col', 'css_row', 'css_col', 'css_ri', 'S_sum', 'S_mean',
       'S_max', 'synergy_zip', 'synergy_loewe', 'synergy_hsa', 'synergy_bliss',
       'drug_row_clinical_phase', 'drug_col_clinical_phase',
       'drug_row_target_name', 'drug_col_target_name'],
      dtype='object')

In [29]:
NCAT.drop('block_id',inplace=True,axis=1)
NCAT.drop_duplicates(inplace=True)

In [30]:
#defining and calling lipinski function to extract the molecular feautures of the smiles dataset
def lipinski(sml, verbose=False):

    mdata= []
    for elem in sml:
        mol=Chem.MolFromSmiles(elem) 
        mdata.append(mol)
       
    finalData= np.arange(1,1)
    i=0  
    for mol in mdata:        
        MolWt = Descriptors.MolWt(mol)
        MolLogP = Descriptors.MolLogP(mol)
        MolFinger=Chem.RDKFingerprint(mol)
        NumHDonors = Lipinski.NumHDonors(mol)
        NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([MolWt,
                        MolLogP,
                        MolFinger,
                        NumHDonors,
                        NumHAcceptors])   
    
        if(i==0):
            finalData=row
        else:
            finalData=np.vstack([finalData, row])
        i=i+1      
    
    columnNames=["MW","LogP","MolFinger","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=finalData,columns=columnNames)
    
    return descriptors

df_lipinski = lipinski(sml.smiles) # storing features in a dataframe

df_lipinski.head(2)

Unnamed: 0,MW,LogP,MolFinger,NumHDonors,NumHAcceptors
0,505.699,6.37898,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,5
1,719.924,2.5835,"[0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...",4,8


In [31]:
#Combining individual drugs and their molecular features
sml1 = pd.concat([sml, df_lipinski], axis=1) 
sml1

Unnamed: 0,drug_name,smiles,MW,LogP,MolFinger,NumHDonors,NumHAcceptors
0,Bardoxolone methyl,CC1(CCC2(CCC3(C(C2C1)C(=O)C=C4C3(CCC5C4(C=C(C(...,505.699,6.37898,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,5
1,Carfilzomib,CC(C)CC(C(=O)C1(CO1)C)NC(=O)C(CC2=CC=CC=C2)NC(...,719.924,2.5835,"[0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, ...",4,8
2,LLL-12,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCC=CC...,879.405,17.4251,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0,6
3,NCGC00090797-11,CC1C(C(CC(O1)OC2C(OC(CC2O)OC3C(OC(CC3O)OC4CCC5...,780.949,2.2181,"[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, ...",6,14
4,Navitoclax,CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(...,974.634,8.8332,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, ...",2,11
5,5-Fluorouracil,C1=C(C(=O)NC(=O)N1)F,130.078,-0.7977,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",2,2
6,Veliparib,CC1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N,244.298,1.2604,"[1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...",3,3
7,Decitabine,C1C(C(OC1N2C=NC(=NC2=O)N)CO)O,228.208,-2.1388,"[1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, ...",3,8
8,Selumetinib,CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...,457.687,3.5256,"[1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, ...",3,6
9,homoharringtonine,CC12C(C(=CC13CCCN3CCC4=CC5=C(C=C24)OCO5)OC)OC(...,545.629,2.6664,"[1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, ...",3,9


In [32]:
sml1.to_csv('smiles_with_mol.csv',index=False) # saving the output in a csv file for later use

sml1 = pd.read_csv('smiles_with_mol.csv')

In [33]:
#Calculating similarity scores based on the fingerprints extracted for each molecule
from rdkit import DataStructs
similarity=[]
for i in range(0,len(sml1.drug_name)):
    for j in range(0,len(sml1.drug_name)):
        if i==j:
            continue
        x1= Chem.MolFromSmiles(sml1.smiles[i])
        y1= Chem.MolFromSmiles(sml1.smiles[j])
    #print(df_final5.smiles_row[i],df_final5.smiles_col[i])
        x=Chem.RDKFingerprint(x1)
        y=Chem.RDKFingerprint(y1)
        #print(x,y)
        similarity.append(round(DataStructs.FingerprintSimilarity(x,y),3))
print(similarity)

[0.272, 0.207, 0.374, 0.291, 0.088, 0.201, 0.204, 0.252, 0.31, 0.229, 0.202, 0.253, 0.256, 0.309, 0.272, 0.133, 0.27, 0.383, 0.091, 0.267, 0.231, 0.332, 0.418, 0.336, 0.303, 0.327, 0.306, 0.404, 0.207, 0.133, 0.222, 0.108, 0.048, 0.1, 0.142, 0.102, 0.133, 0.097, 0.08, 0.098, 0.091, 0.11, 0.374, 0.27, 0.222, 0.317, 0.068, 0.25, 0.245, 0.307, 0.378, 0.26, 0.242, 0.268, 0.294, 0.32, 0.291, 0.383, 0.108, 0.317, 0.094, 0.362, 0.265, 0.488, 0.586, 0.441, 0.421, 0.461, 0.466, 0.701, 0.088, 0.091, 0.048, 0.068, 0.094, 0.094, 0.098, 0.108, 0.092, 0.105, 0.093, 0.109, 0.099, 0.096, 0.201, 0.267, 0.1, 0.25, 0.362, 0.094, 0.212, 0.393, 0.372, 0.327, 0.346, 0.326, 0.355, 0.378, 0.204, 0.231, 0.142, 0.245, 0.265, 0.098, 0.212, 0.245, 0.286, 0.244, 0.22, 0.255, 0.238, 0.273, 0.252, 0.332, 0.102, 0.307, 0.488, 0.108, 0.393, 0.245, 0.512, 0.418, 0.4, 0.431, 0.462, 0.507, 0.31, 0.418, 0.133, 0.378, 0.586, 0.092, 0.372, 0.286, 0.512, 0.48, 0.428, 0.472, 0.513, 0.632, 0.229, 0.336, 0.097, 0.26, 0.441, 0.1

In [34]:
#Creating a  matrix containing all drug combinations 
Matrix1=[]
s1=sml1.values.tolist()
for i in range(0,len(sml1.drug_name)):
    for j in range(0,len(sml1.drug_name)):
        if (i==j):
            continue
        Matrix1.append([sml1.drug_name[i],sml1.drug_name[j]])
print(Matrix1)

[['Bardoxolone methyl', 'Carfilzomib'], ['Bardoxolone methyl', 'LLL-12'], ['Bardoxolone methyl', 'NCGC00090797-11'], ['Bardoxolone methyl', 'Navitoclax'], ['Bardoxolone methyl', '5-Fluorouracil'], ['Bardoxolone methyl', 'Veliparib'], ['Bardoxolone methyl', 'Decitabine'], ['Bardoxolone methyl', 'Selumetinib'], ['Bardoxolone methyl', 'homoharringtonine'], ['Bardoxolone methyl', 'Barasertib'], ['Bardoxolone methyl', 'Lenvatinib'], ['Bardoxolone methyl', 'Pazopanib'], ['Bardoxolone methyl', 'Vemurafenib'], ['Bardoxolone methyl', 'Venetoclax'], ['Carfilzomib', 'Bardoxolone methyl'], ['Carfilzomib', 'LLL-12'], ['Carfilzomib', 'NCGC00090797-11'], ['Carfilzomib', 'Navitoclax'], ['Carfilzomib', '5-Fluorouracil'], ['Carfilzomib', 'Veliparib'], ['Carfilzomib', 'Decitabine'], ['Carfilzomib', 'Selumetinib'], ['Carfilzomib', 'homoharringtonine'], ['Carfilzomib', 'Barasertib'], ['Carfilzomib', 'Lenvatinib'], ['Carfilzomib', 'Pazopanib'], ['Carfilzomib', 'Vemurafenib'], ['Carfilzomib', 'Venetoclax'], 

In [35]:
df_intial=pd.DataFrame(Matrix1,columns=['drug_row','drug_col'])
Matrix1

[['Bardoxolone methyl', 'Carfilzomib'],
 ['Bardoxolone methyl', 'LLL-12'],
 ['Bardoxolone methyl', 'NCGC00090797-11'],
 ['Bardoxolone methyl', 'Navitoclax'],
 ['Bardoxolone methyl', '5-Fluorouracil'],
 ['Bardoxolone methyl', 'Veliparib'],
 ['Bardoxolone methyl', 'Decitabine'],
 ['Bardoxolone methyl', 'Selumetinib'],
 ['Bardoxolone methyl', 'homoharringtonine'],
 ['Bardoxolone methyl', 'Barasertib'],
 ['Bardoxolone methyl', 'Lenvatinib'],
 ['Bardoxolone methyl', 'Pazopanib'],
 ['Bardoxolone methyl', 'Vemurafenib'],
 ['Bardoxolone methyl', 'Venetoclax'],
 ['Carfilzomib', 'Bardoxolone methyl'],
 ['Carfilzomib', 'LLL-12'],
 ['Carfilzomib', 'NCGC00090797-11'],
 ['Carfilzomib', 'Navitoclax'],
 ['Carfilzomib', '5-Fluorouracil'],
 ['Carfilzomib', 'Veliparib'],
 ['Carfilzomib', 'Decitabine'],
 ['Carfilzomib', 'Selumetinib'],
 ['Carfilzomib', 'homoharringtonine'],
 ['Carfilzomib', 'Barasertib'],
 ['Carfilzomib', 'Lenvatinib'],
 ['Carfilzomib', 'Pazopanib'],
 ['Carfilzomib', 'Vemurafenib'],
 ['Ca

In [36]:
df_intial.drop_duplicates(inplace=True)

In [37]:
#Combining all the feautures of drug1 to final dataset
df_intial1 =  pd.merge(df_intial , sml1 ,how="left",left_on=['drug_row'],right_on=['drug_name'])

In [38]:
#Combining all the feautures of drug2 to final dataset
df_intial2 =  pd.merge(df_intial1 , sml1 ,how="left",left_on=['drug_col'],right_on=['drug_name'])

In [39]:
fingerprint=pd.DataFrame(similarity,columns=['similarity'])

df_final=pd.concat([df_intial2,fingerprint], axis=1)

In [40]:
labels = ['drug_row','drug_col']

In [41]:
df_fin =  pd.merge(NCAT , df_final ,how="left",left_on=['drug_row','drug_col'],right_on=['drug_row','drug_col'])

In [42]:
df_fin.dropna(inplace=True)

In [43]:
df_fin.columns

Index(['drug_row', 'drug_col', 'cell_line_name', 'study_name', 'tissue_name',
       'conc_row_unit', 'conc_col_unit', 'ic50_row', 'ic50_col', 'ri_row',
       'ri_col', 'css_row', 'css_col', 'css_ri', 'S_sum', 'S_mean', 'S_max',
       'synergy_zip', 'synergy_loewe', 'synergy_hsa', 'synergy_bliss',
       'drug_row_clinical_phase', 'drug_col_clinical_phase',
       'drug_row_target_name', 'drug_col_target_name', 'drug_name_x',
       'smiles_x', 'MW_x', 'LogP_x', 'MolFinger_x', 'NumHDonors_x',
       'NumHAcceptors_x', 'drug_name_y', 'smiles_y', 'MW_y', 'LogP_y',
       'MolFinger_y', 'NumHDonors_y', 'NumHAcceptors_y', 'similarity'],
      dtype='object')

In [44]:
columns_to_drop = ['tissue_name','study_name','conc_row_unit', 'conc_col_unit','drug_row_target_name', 'drug_col_target_name','drug_name_x',
                  'drug_name_y','smiles_x','smiles_y','synergy_zip','synergy_hsa', 'synergy_bliss', 'ri_row',
       'ri_col', 'css_row', 'css_col', 'css_ri', 'S_sum', 'S_mean', 'S_max']

df_fin = df_fin.drop(labels=columns_to_drop, axis=1)

df_fin.head(5)


Unnamed: 0,drug_row,drug_col,cell_line_name,ic50_row,ic50_col,synergy_loewe,drug_row_clinical_phase,drug_col_clinical_phase,MW_x,LogP_x,MolFinger_x,NumHDonors_x,NumHAcceptors_x,MW_y,LogP_y,MolFinger_y,NumHDonors_y,NumHAcceptors_y,similarity
0,NCGC00090797-11,Carfilzomib,PANC-1,0.5,0.003405,-0.000388,0,4,780.949,2.2181,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,6.0,14.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.27
1,Bardoxolone methyl,Carfilzomib,PANC-1,20.0,0.003405,-0.000327,3,4,505.699,6.37898,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,5.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.272
2,LLL-12,Carfilzomib,PANC-1,50.0,0.003405,-0.000314,0,4,879.405,17.4251,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,6.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.133
3,Navitoclax,Carfilzomib,PANC-1,20.0,0.003405,-0.000327,2,4,974.634,8.8332,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,2.0,11.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.383
4,Bardoxolone methyl,NCGC00090797-11,PANC-1,20.0,0.003405,-0.000327,3,0,505.699,6.37898,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,5.0,780.949,2.2181,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,6.0,14.0,0.374


In [45]:
#Applying label encoding to drugs to convert into strings
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
df_fin[['drug_row','drug_col','cell_line_name']] = df_fin[['drug_row','drug_col','cell_line_name']].apply(LabelEncoder().fit_transform)

In [46]:
df_fin.to_csv('main_data.csv', index=False)

In [47]:
df_fin.head(2)

Unnamed: 0,drug_row,drug_col,cell_line_name,ic50_row,ic50_col,synergy_loewe,drug_row_clinical_phase,drug_col_clinical_phase,MW_x,LogP_x,MolFinger_x,NumHDonors_x,NumHAcceptors_x,MW_y,LogP_y,MolFinger_y,NumHDonors_y,NumHAcceptors_y,similarity
0,2,1,0,0.5,0.003405,-0.000388,0,4,780.949,2.2181,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,6.0,14.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.27
1,0,1,0,20.0,0.003405,-0.000327,3,4,505.699,6.37898,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,5.0,719.924,2.5835,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4.0,8.0,0.272
