In [None]:
import pandas as pd
import numpy as np
import warnings
# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the class
# is available from `numpy.exceptions` on NumPy 1.25 and later. Resolve it from
# whichever location the installed NumPy provides so this cell runs on both.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:  # older NumPy without the `numpy.exceptions` module
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

# Importing rdkit to extract chemical features of the drugs
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski


In [None]:
# Load the drug table; later cells read its `drug_name` and `smiles` columns.
sml = pd.read_csv("smiles.csv")
sml.head(n=5)

In [None]:
# Load the NCAT table; it is merged on `drug_row` / `drug_col` further down.
NCAT = pd.read_csv("NCAT.csv")
NCAT.head(n=5)

In [None]:
# Show the column names of the NCAT table.
NCAT.columns

In [None]:
# Discard the `block_id` column first, then remove rows that are exact duplicates.
NCAT = NCAT.drop(columns='block_id').drop_duplicates()

In [None]:
# Define the lipinski function, which extracts molecular features from SMILES strings; it is called on the smiles dataset right after the definition
def lipinski(sml, verbose=False):
    """Compute molecular descriptors for a collection of SMILES strings.

    Parameters
    ----------
    sml : iterable of str
        SMILES strings (for example a pandas Series).
    verbose : bool, optional
        Accepted for backward compatibility; it is not used.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with the columns ``MW``,
        ``LogP``, ``MolFinger`` (the object returned by
        ``Chem.RDKFingerprint``), ``NumHDonors`` and ``NumHAcceptors``.
        Entries that are not strings, or that RDKit cannot parse, produce a
        row of missing values so the result stays positionally aligned with
        the input. An empty input yields an empty frame with the same columns.
    """
    columnNames = ["MW", "LogP", "MolFinger", "NumHDonors", "NumHAcceptors"]

    rows = []
    for elem in sml:
        # MolFromSmiles returns None for a SMILES it cannot parse; non-string
        # entries (e.g. NaN from a missing cell) are treated the same way.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None
        if mol is None:
            rows.append([np.nan, np.nan, None, np.nan, np.nan])
            continue

        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Chem.RDKFingerprint(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    # Build the frame once from plain Python rows. Packing the values into a
    # NumPy array first would make NumPy treat the sequence-like fingerprint as
    # a nested dimension (a ragged array), and stacking row by row is quadratic.
    descriptors = pd.DataFrame(rows, columns=columnNames)

    return descriptors

# Keep the per-drug descriptors in their own dataframe.
df_lipinski = lipinski(sml["smiles"])

df_lipinski.head(n=2)

In [None]:
# Attach the molecular descriptors to the drug table side by side
# (rows are matched on their index).
sml1 = pd.concat(objs=[sml, df_lipinski], axis="columns")
sml1

In [None]:
# Write the drug table with its descriptors to disk, then reload it so the
# following cells work on the CSV-serialized form of the data.
# NOTE(review): the MolFinger column holds RDKit fingerprint objects; to_csv
# presumably stores their string form, so the reloaded column is text rather
# than usable fingerprints — confirm this is acceptable downstream.
sml1.to_csv('smiles_with_mol.csv',index=False) # saving the output in a csv file for later use

sml1 = pd.read_csv('smiles_with_mol.csv')

In [None]:
# Calculate pairwise similarity scores from the RDKit fingerprint of each molecule
from rdkit import DataStructs

# Parse and fingerprint every drug once. Doing this inside the pair loop would
# repeat the same work for each of the n*(n-1) ordered pairs.
drug_fps = [Chem.RDKFingerprint(Chem.MolFromSmiles(smi)) for smi in sml1.smiles]

# Ordered pairs (i, j) with i != j, outer index first. The cell that builds the
# drug-pair matrix enumerates pairs in this same order, so the scores line up
# with those pairs position by position.
similarity = [
    round(DataStructs.FingerprintSimilarity(fp_row, fp_col), 3)
    for i, fp_row in enumerate(drug_fps)
    for j, fp_col in enumerate(drug_fps)
    if i != j
]

# Show a bounded preview instead of dumping every score into the output.
print(f"{len(similarity)} pairwise similarity scores computed")
similarity[:10]

In [None]:
# Create a matrix containing all ordered drug combinations (self-pairs excluded)
s1 = sml1.values.tolist()  # not read by the cells visible here; kept in case other cells use it

# Pull the names into a plain list once rather than indexing the Series for
# every pair.
drug_names = sml1.drug_name.tolist()
Matrix1 = [
    [row_drug, col_drug]
    for i, row_drug in enumerate(drug_names)
    for j, col_drug in enumerate(drug_names)
    if i != j
]

# Show a bounded preview instead of printing every pair.
Matrix1[:5]

In [None]:
# Tabulate the ordered drug pairs as a two-column frame.
df_intial = pd.DataFrame(Matrix1, columns=['drug_row', 'drug_col'])

# Preview the frame just built rather than re-displaying the whole raw list.
df_intial.head()

In [None]:
# Keep a single row for each distinct (drug_row, drug_col) pair.
df_intial = df_intial.drop_duplicates()

In [None]:
# Attach the features of the first drug of each pair (matched on drug_row).
df_intial1 = df_intial.merge(
    sml1,
    how="left",
    left_on="drug_row",
    right_on="drug_name",
)

In [None]:
# Attach the features of the second drug of each pair (matched on drug_col).
# Columns already present from the first merge receive pandas' default
# `_x` / `_y` suffixes (e.g. drug_name_x / drug_name_y).
df_intial2 = df_intial1.merge(
    sml1,
    how="left",
    left_on="drug_col",
    right_on="drug_name",
)

In [None]:
fingerprint = pd.DataFrame(similarity, columns=['similarity'])

# The similarity scores are attached to the drug pairs purely by row position.
# If the two tables differ in length (for example because duplicate drug names
# changed the row count during de-duplication or merging), concat would
# silently pad with NaN and pair scores with the wrong drugs — fail loudly
# instead.
if len(fingerprint) != len(df_intial2):
    raise ValueError(
        "Cannot align similarity scores with drug pairs: "
        f"{len(fingerprint)} scores vs {len(df_intial2)} pair rows."
    )

df_final = pd.concat([df_intial2, fingerprint], axis=1)

In [None]:
# Names of the two drug-pair key columns.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
labels = ['drug_row','drug_col']

In [None]:
# Bring the pair-level drug features into the NCAT table; NCAT rows whose
# (drug_row, drug_col) pair has no match keep NaN in the added columns.
df_fin = NCAT.merge(
    df_final,
    how="left",
    left_on=['drug_row', 'drug_col'],
    right_on=['drug_row', 'drug_col'],
)

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): this also removes rows whose only missing values sit in columns
# that a later cell discards — confirm that is intended.
df_fin = df_fin.dropna()

In [None]:
# Show the column names of the merged table.
df_fin.columns

In [None]:
# Columns that are removed from the merged table before it is saved.
columns_to_drop = [
    'tissue_name', 'study_name',
    'conc_row_unit', 'conc_col_unit',
    'drug_row_target_name', 'drug_col_target_name',
    'drug_name_x', 'drug_name_y',
    'smiles_x', 'smiles_y',
    'synergy_zip', 'synergy_hsa', 'synergy_bliss',
    'ri_row', 'ri_col',
    'css_row', 'css_col', 'css_ri',
    'S_sum', 'S_mean', 'S_max',
]

# Raises KeyError if any listed column is absent.
df_fin = df_fin.drop(columns=columns_to_drop)

df_fin.head(n=5)


In [None]:
# Label-encode the drug and cell-line columns, replacing each value with an
# integer code.
# NOTE(review): fit_transform runs once per column, so each column gets its own
# code table; the same drug can receive different codes in drug_row and
# drug_col, and the fitted mappings are not kept for decoding — confirm.
from sklearn.preprocessing import LabelEncoder

encoded_cols = ['drug_row', 'drug_col', 'cell_line_name']
df_fin[encoded_cols] = df_fin[encoded_cols].apply(LabelEncoder().fit_transform)

In [None]:
# Save the final table to main_data.csv without writing the index column.
df_fin.to_csv('main_data.csv', index=False)

In [None]:
# Preview the first two rows of the final table.
df_fin.head(2)