In [1]:
import pandas as pd
import math
import numpy as np
import os
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
%matplotlib inline

In [2]:
# Load the crossed MATADOR drug-target interaction table from the datasets folder.
data_matador_df = pd.read_csv(
    filepath_or_buffer='../../datasets/matador_crossed.csv'
)

In [3]:
# Preview the first five rows of the loaded interaction table.
data_matador_df.head(n=5)

Unnamed: 0,complexity,drug_name,drug_pubchem_id,exact_mass,hba,hbd,heavy_atoms,interaction_type,molecular_formula,mw_freebase,mw_monoisotopic,protein_chembl_id,protein_name,protein_sequence,protein_string_id,protein_uniprot_id,sequence_length,smiles,target_type,tpsa
0,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL4929;,APCS,MNKPLLWISVLTSLLEAFAHTDLSGKVFVFPRESVTDHVNLITPLE...,9606.ENSP00000255040,P02743,223,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
1,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,9606.ENSP00000273550,P02794,183,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
2,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL2364709;,FGG,MSWSLHPRNLILYFYALLFLSSTCVAYVATRDNCCILDERFGSYCP...,9606.ENSP00000336829,P02679,453,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
3,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,SERPINA1,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,9606.ENSP00000348068,P01009,418,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
4,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,LCN2,MPLGLLWLGLALLGALHAQAQDSTSDLIPAPPLSKVPLQQNFQDNQ...,9606.ENSP00000277480,P80188,198,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3


In [4]:
# Interaction table keyed by the protein STRING identifier.
dti = data_matador_df.set_index(['protein_string_id'])

# One row per distinct protein (first occurrence kept), with its sequence.
proteins_df = data_matador_df[['protein_string_id', 'protein_sequence']].drop_duplicates(
    subset=['protein_string_id']
)
proteins_df_indexed = proteins_df.set_index(['protein_string_id'])

# One row per distinct drug (first occurrence kept), with its SMILES string.
drugs_df = data_matador_df[['drug_pubchem_id', 'smiles']].drop_duplicates(
    subset=['drug_pubchem_id']
)
drugs_df_indexed = drugs_df.set_index(['drug_pubchem_id'])

# Distinct identifiers, in order of first appearance in the data.
proteins = proteins_df['protein_string_id'].unique()
drugs = drugs_df['drug_pubchem_id'].unique()

num_drugs = len(drugs)
num_proteins = len(proteins)

# Bidirectional maps between an identifier and its row/column position in the
# similarity matrices built further down.
drug2Index = {drug_id: position for position, drug_id in enumerate(drugs)}
index2Drug = dict(enumerate(drugs))

protein2Index = {protein_id: position for position, protein_id in enumerate(proteins)}
index2Protein = dict(enumerate(proteins))

## Calculation of the protein-protein alignment score (global alignment via `pairwise2.align.globalxx`; note that this is not a Smith-Waterman local alignment)

In [6]:
# Raw pairwise alignment scores: entry [i, j] is the score of aligning protein i
# with protein j, where i and j are the positions given by protein2Index.
# pairwise2.align.globalxx performs a *global* alignment in which a match
# scores 1 and mismatches/gaps cost nothing.
sim_protein_protein = np.zeros((num_proteins, num_proteins))

# Read every identifier and sequence once instead of doing a .loc lookup per pair.
protein_ids = list(proteins_df_indexed.index)
protein_sequences = proteins_df_indexed['protein_sequence'].tolist()

for position1, (p1, seq1) in enumerate(zip(protein_ids, protein_sequences)):
    index1 = protein2Index[p1]
    # Each unordered pair is aligned once (upper triangle, diagonal included)
    # and the score is mirrored into the lower triangle.
    for p2, seq2 in zip(protein_ids[position1:], protein_sequences[position1:]):
        index2 = protein2Index[p2]
        # score_only=True returns just the best score, so the alignments
        # themselves are never built or stored.
        simMin = pairwise2.align.globalxx(seq1, seq2, score_only=True)
        sim_protein_protein[index1][index2] = simMin
        sim_protein_protein[index2][index1] = simMin
    print("Protein: {0} with other proteins".format(index1))

Protein: 0 with other proteins
Protein: 1 with other proteins
Protein: 2 with other proteins
Protein: 3 with other proteins
Protein: 4 with other proteins
Protein: 5 with other proteins
Protein: 6 with other proteins
Protein: 7 with other proteins
Protein: 8 with other proteins
Protein: 9 with other proteins
Protein: 10 with other proteins
Protein: 11 with other proteins
Protein: 12 with other proteins
Protein: 13 with other proteins
Protein: 14 with other proteins
Protein: 15 with other proteins
Protein: 16 with other proteins
Protein: 17 with other proteins
Protein: 18 with other proteins
Protein: 19 with other proteins
Protein: 20 with other proteins
Protein: 21 with other proteins
Protein: 22 with other proteins
Protein: 23 with other proteins
Protein: 24 with other proteins
Protein: 25 with other proteins
Protein: 26 with other proteins
Protein: 27 with other proteins
Protein: 28 with other proteins
Protein: 29 with other proteins
Protein: 30 with other proteins
Protein: 31 with o

Protein: 252 with other proteins
Protein: 253 with other proteins
Protein: 254 with other proteins
Protein: 255 with other proteins
Protein: 256 with other proteins
Protein: 257 with other proteins
Protein: 258 with other proteins
Protein: 259 with other proteins
Protein: 260 with other proteins
Protein: 261 with other proteins
Protein: 262 with other proteins
Protein: 263 with other proteins
Protein: 264 with other proteins
Protein: 265 with other proteins
Protein: 266 with other proteins
Protein: 267 with other proteins
Protein: 268 with other proteins
Protein: 269 with other proteins
Protein: 270 with other proteins
Protein: 271 with other proteins
Protein: 272 with other proteins
Protein: 273 with other proteins
Protein: 274 with other proteins
Protein: 275 with other proteins
Protein: 276 with other proteins
Protein: 277 with other proteins
Protein: 278 with other proteins
Protein: 279 with other proteins
Protein: 280 with other proteins
Protein: 281 with other proteins
Protein: 2

Protein: 501 with other proteins
Protein: 502 with other proteins
Protein: 503 with other proteins
Protein: 504 with other proteins
Protein: 505 with other proteins
Protein: 506 with other proteins
Protein: 507 with other proteins
Protein: 508 with other proteins
Protein: 509 with other proteins
Protein: 510 with other proteins
Protein: 511 with other proteins
Protein: 512 with other proteins
Protein: 513 with other proteins
Protein: 514 with other proteins
Protein: 515 with other proteins
Protein: 516 with other proteins
Protein: 517 with other proteins
Protein: 518 with other proteins
Protein: 519 with other proteins
Protein: 520 with other proteins
Protein: 521 with other proteins
Protein: 522 with other proteins
Protein: 523 with other proteins
Protein: 524 with other proteins
Protein: 525 with other proteins
Protein: 526 with other proteins
Protein: 527 with other proteins
Protein: 528 with other proteins
Protein: 529 with other proteins
Protein: 530 with other proteins
Protein: 5

Protein: 750 with other proteins
Protein: 751 with other proteins
Protein: 752 with other proteins
Protein: 753 with other proteins
Protein: 754 with other proteins
Protein: 755 with other proteins
Protein: 756 with other proteins
Protein: 757 with other proteins
Protein: 758 with other proteins
Protein: 759 with other proteins
Protein: 760 with other proteins
Protein: 761 with other proteins
Protein: 762 with other proteins
Protein: 763 with other proteins
Protein: 764 with other proteins
Protein: 765 with other proteins
Protein: 766 with other proteins
Protein: 767 with other proteins
Protein: 768 with other proteins
Protein: 769 with other proteins
Protein: 770 with other proteins
Protein: 771 with other proteins
Protein: 772 with other proteins
Protein: 773 with other proteins
Protein: 774 with other proteins
Protein: 775 with other proteins
Protein: 776 with other proteins
Protein: 777 with other proteins
Protein: 778 with other proteins
Protein: 779 with other proteins
Protein: 7

Protein: 999 with other proteins
Protein: 1000 with other proteins
Protein: 1001 with other proteins
Protein: 1002 with other proteins
Protein: 1003 with other proteins
Protein: 1004 with other proteins
Protein: 1005 with other proteins
Protein: 1006 with other proteins
Protein: 1007 with other proteins
Protein: 1008 with other proteins
Protein: 1009 with other proteins
Protein: 1010 with other proteins
Protein: 1011 with other proteins
Protein: 1012 with other proteins
Protein: 1013 with other proteins
Protein: 1014 with other proteins
Protein: 1015 with other proteins
Protein: 1016 with other proteins
Protein: 1017 with other proteins
Protein: 1018 with other proteins
Protein: 1019 with other proteins
Protein: 1020 with other proteins
Protein: 1021 with other proteins
Protein: 1022 with other proteins
Protein: 1023 with other proteins
Protein: 1024 with other proteins
Protein: 1025 with other proteins
Protein: 1026 with other proteins
Protein: 1027 with other proteins
Protein: 1028 w

Protein: 1240 with other proteins
Protein: 1241 with other proteins
Protein: 1242 with other proteins
Protein: 1243 with other proteins
Protein: 1244 with other proteins
Protein: 1245 with other proteins
Protein: 1246 with other proteins
Protein: 1247 with other proteins
Protein: 1248 with other proteins
Protein: 1249 with other proteins
Protein: 1250 with other proteins
Protein: 1251 with other proteins
Protein: 1252 with other proteins
Protein: 1253 with other proteins
Protein: 1254 with other proteins
Protein: 1255 with other proteins
Protein: 1256 with other proteins
Protein: 1257 with other proteins
Protein: 1258 with other proteins
Protein: 1259 with other proteins
Protein: 1260 with other proteins
Protein: 1261 with other proteins
Protein: 1262 with other proteins
Protein: 1263 with other proteins
Protein: 1264 with other proteins
Protein: 1265 with other proteins
Protein: 1266 with other proteins
Protein: 1267 with other proteins
Protein: 1268 with other proteins
Protein: 1269 

## Normalization of the alignment score (each row is divided by that protein's self-alignment score)

In [7]:
# Divide every row by that protein's self-alignment score (its diagonal entry),
# in place, so that each diagonal value becomes 1.
# NOTE(review): scaling row i by score(i, i) only makes the matrix asymmetric
# whenever two proteins have different self-scores - confirm this is intended.
self_scores = np.diag(sim_protein_protein).copy()  # copy: the matrix is modified below
sim_protein_protein /= self_scores[:, np.newaxis]

## Save the normalized protein-protein similarity matrix to a file

In [8]:
# Persist the normalized protein-protein matrix in NumPy's binary .npy format
# (written to the current working directory).
np.save(file='sim_target_target_normalized.npy', arr=sim_protein_protein)

In [9]:
# Show the protein-protein similarity matrix as the cell output.
sim_protein_protein

array([[ 1.        ,  0.31390135,  0.51121076, ...,  0.57399103,
         0.56502242,  0.33632287],
       [ 0.38251366,  1.        ,  0.54098361, ...,  0.59016393,
         0.59016393,  0.39344262],
       [ 0.25165563,  0.21854305,  1.        , ...,  0.39735099,
         0.40838852,  0.24503311],
       ..., 
       [ 0.21548822,  0.18181818,  0.3030303 , ...,  1.        ,
         0.36195286,  0.2037037 ],
       [ 0.20930233,  0.17940199,  0.30730897, ...,  0.35714286,
         1.        ,  0.18604651],
       [ 0.36945813,  0.3546798 ,  0.54679803, ...,  0.59605911,
         0.55172414,  1.        ]])

## Save the drug-drug similarity as a NumPy array

In [10]:
# Load the drug-drug SMILES similarity table; the column names are supplied
# here rather than read from the file.
drug_drug_sim_df = pd.read_csv(
    '../../datasets/drug_drug_smiles_similarity.csv',
    names=['drug1', 'drug1_smiles', 'drug2', 'drug2_smiles', 'similarity'],
)

In [11]:
def fix_id(identifier):
    """Return ``identifier`` converted to its string representation.

    Parameters
    ----------
    identifier : object
        Any value; it is passed through ``str()``.

    Returns
    -------
    str
        The string form of ``identifier``.
    """
    as_text = str(identifier)
    return as_text

In [12]:
# Convert both drug id columns to strings so they can be matched against
# string keys later on.
for id_column in ('drug1', 'drug2'):
    drug_drug_sim_df[id_column] = drug_drug_sim_df[id_column].apply(fix_id)

In [13]:
# Preview the first five rows of the drug-drug similarity table.
drug_drug_sim_df.head(n=5)

Unnamed: 0,drug1,drug1_smiles,drug2,drug2_smiles,similarity
0,11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],1.0
1,11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],9574101,CCOC(=O)CNC(C1CCCCC1)C(=O)N2CCC2C(=O)NCC3=CC=C...,0.088083
2,11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],9567831,C1=CC(=C(C(=C1)Cl)C=NN=C(N)NO)Cl,0.081081
3,11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],6335486,C(C(=O)O)C(CC(=O)[O-])(C(=O)O)O.C(C(=O)O)C(CC(...,0.459459
4,11954225,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],5473386,C1=NC(=S)C2C(=NC=N2)N1,0.058333


In [14]:
# Square float matrix with one row and one column per drug, filled with zeros.
sim_drug_drug = np.zeros(shape=(num_drugs, num_drugs), dtype=float)

In [15]:
# Index the similarity table by the (drug1, drug2) pair so a pair can be looked up directly.
drug_drug_sim_df_indexed = drug_drug_sim_df.set_index(keys=['drug1', 'drug2'])

In [16]:
# Fill sim_drug_drug so that entry [d1, d2] is the similarity of the drugs at
# matrix positions d1 and d2 (positions come from index2Drug).

# The lookup keys are the string form of the ids stored in index2Drug;
# converting them once here avoids repeating str() for every pair.
drug_ids = [str(index2Drug[position]) for position in range(num_drugs)]

# Fail fast on repeated pairs: a dict would silently keep only the last value.
if not drug_drug_sim_df_indexed.index.is_unique:
    raise ValueError("duplicate (drug1, drug2) pairs in the similarity table")

# A single dict keyed by (drug1, drug2) tuples replaces one chained
# .loc[...]['similarity'] lookup per pair; a missing pair still raises KeyError.
pair_similarity = drug_drug_sim_df_indexed['similarity'].to_dict()

for d1, drug1ID in enumerate(drug_ids):
    for d2, drug2ID in enumerate(drug_ids):
        sim_drug_drug[d1][d2] = pair_similarity[(drug1ID, drug2ID)]

In [17]:
# Persist the drug-drug similarity matrix in NumPy's binary .npy format
# (written to the current working directory).
np.save(file='sim_drug_drug_normalized.npy', arr=sim_drug_drug)

In [18]:
# Show the drug-drug similarity matrix as the cell output.
sim_drug_drug

array([[ 1.        ,  0.0880829 ,  0.08108108, ...,  0.10869565,
         0.27272728,  0.08547009],
       [ 0.0880829 ,  1.        ,  0.11737089, ...,  0.07731958,
         0.08695652,  0.22619048],
       [ 0.08108108,  0.11737089,  1.        , ...,  0.14492753,
         0.09375   ,  0.11023622],
       ..., 
       [ 0.10869565,  0.07731958,  0.14492753, ...,  1.        ,
         0.17142858,  0.08583691],
       [ 0.27272728,  0.08695652,  0.09375   , ...,  0.17142858,
         1.        ,  0.06086956],
       [ 0.08547009,  0.22619048,  0.11023622, ...,  0.08583691,
         0.06086956,  1.        ]])