In [1]:
import pandas as pd
import sqlite3
import math
from sklearn.model_selection import train_test_split
import numpy as np
import requests

# Utils: Convert ID from int to String

In [2]:
def id_to_string(int_id):
    if not math.isnan(int_id):
        return str(int(int_id))
    else:
        return int_id

# We load the Matador Dataset (direct and indirect interactions)

In [4]:
matador_data_df = pd.read_csv("../../datasets/matador_direct_indirect.csv")

In [5]:
matador_data_df.head()

Unnamed: 0,chemical_id,chemical_name,atc,protein_id,protein_name,mesh_id,uniprot_id,protein_score,protein_annotation,mesh_score,mesh_annotation,matador_score,matador_annotation
0,11954269,everolimus,L04AA18,9606.ENSP00000354587,FRAP1,,Q9Y4I3 Q96QW8_HUMAN Q96QG3 Q6LE87 Q5TER3_HUMAN...,950,DIRECT,0,,950,DIRECT
1,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000255040,APCS,D000209,P02743,0,,207,INDIRECT,207,INDIRECT
2,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000273550,FTH1,D000209,Q3SWW1 P02794 Q6NZ44_HUMAN,0,,207,INDIRECT,207,INDIRECT
3,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000336829,FGG,D000209,P04470 P04469 P02679 Q9UC63_HUMAN Q9UC62_HUMAN...,0,,207,INDIRECT,207,INDIRECT
4,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000348068,SERPINA1,D000209,P01009 Q9UCM3_HUMAN Q9UCE6_HUMAN Q9P1P0 Q96ES1...,0,,207,INDIRECT,207,INDIRECT


## We convert the chemical_id into STRING

In [7]:
matador_data_df['chemical_id'] = matador_data_df['chemical_id'].apply(id_to_string)

## We get the drugs list

In [8]:
drugs_list = matador_data_df['chemical_id'].unique()

## We define the list of columns of interest we want to get from PubChem

In [9]:
pubchem_data =  {'drug_pubchem_id': [], 
                 'molecular_formula': [],
                 'mw_freebase': [],
                 'smiles': [],
                 'exact_mass': [],
                 'mw_monoisotopic': [],
                 'tpsa': [],
                 'complexity': [],
                 'hba': [],
                 'hbd': [],
                 'heavy_atoms': []
                 }

In [10]:
i = 1
for drug_id in drugs_list:
    try:
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + str(drug_id) + "/property/CanonicalSMILES,MolecularFormula,MolecularWeight,XLogP,ExactMass,MonoisotopicMass,TPSA,Complexity,HBondDonorCount,HBondAcceptorCount,HeavyAtomCount/JSON"
        resp = requests.get(url=url)
        if(resp.status_code != 200):
            raise Exception("We didn't find the drug with ID: {0}".format(drug_id))
        data = resp.json()['PropertyTable']['Properties'][0]
        pubchem_data['drug_pubchem_id'].append(drug_id)
        pubchem_data['molecular_formula'].append(data['MolecularFormula'])
        pubchem_data['mw_freebase'].append(data['MolecularWeight'])
        pubchem_data['smiles'].append(data['CanonicalSMILES'])
        pubchem_data['exact_mass'].append(data['ExactMass'])
        pubchem_data['mw_monoisotopic'].append(data['MonoisotopicMass'])
        pubchem_data['tpsa'].append(data['TPSA'])
        pubchem_data['complexity'].append(data['Complexity'])
        pubchem_data['hba'].append(data['HBondDonorCount'])
        pubchem_data['hbd'].append(data['HBondAcceptorCount'])
        pubchem_data['heavy_atoms'].append(data['HeavyAtomCount'])

        print('Row: {0}'.format(i))

        i += 1
    except Exception as error:
        print(error)

Row: 1
Row: 2
Row: 3
Row: 4
Row: 5
Row: 6
Row: 7
Row: 8
Row: 9
Row: 10
Row: 11
Row: 12
Row: 13
Row: 14
Row: 15
Row: 16
Row: 17
Row: 18
Row: 19
Row: 20
Row: 21
Row: 22
Row: 23
Row: 24
Row: 25
Row: 26
Row: 27
Row: 28
Row: 29
Row: 30
Row: 31
Row: 32
Row: 33
Row: 34
Row: 35
Row: 36
Row: 37
Row: 38
Row: 39
Row: 40
Row: 41
Row: 42
Row: 43
Row: 44
Row: 45
Row: 46
Row: 47
Row: 48
Row: 49
Row: 50
Row: 51
Row: 52
Row: 53
Row: 54
Row: 55
Row: 56
Row: 57
Row: 58
Row: 59
Row: 60
Row: 61
Row: 62
Row: 63
Row: 64
Row: 65
Row: 66
Row: 67
Row: 68
Row: 69
Row: 70
Row: 71
Row: 72
Row: 73
Row: 74
Row: 75
Row: 76
Row: 77
Row: 78
Row: 79
Row: 80
Row: 81
Row: 82
Row: 83
Row: 84
Row: 85
Row: 86
Row: 87
Row: 88
Row: 89
Row: 90
Row: 91
Row: 92
Row: 93
Row: 94
Row: 95
Row: 96
Row: 97
Row: 98
Row: 99
Row: 100
Row: 101
Row: 102
Row: 103
Row: 104
Row: 105
Row: 106
Row: 107
Row: 108
Row: 109
Row: 110
Row: 111
Row: 112
Row: 113
Row: 114
Row: 115
Row: 116
Row: 117
Row: 118
Row: 119
Row: 120
Row: 121
Row: 122
Row: 123
R

In [12]:
# We save the PubChem data in a file
pubchem_df = pd.DataFrame(data=pubchem_data)
pubchem_df.to_csv('../../datasets/pubchem_data.csv', index=False)

In [23]:
uniprot_proteome_df = pd.read_csv("../../datasets/uniprot-proteome-UP000005640.tab", sep='\t')
uniprot_proteome_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Cross-reference (EMBL),Cross-reference (ChEMBL),Cross-reference (STRING),Sequence
0,L0R819,ASURF_HUMAN,reviewed,ASNSD1 upstream open reading frame protein (AS...,ASDURF,Homo sapiens (Human),96,HF547970;,,,MPSRGTRPEDSSVLIPTDNSTPHKEDLSSKIKEQKIVVDELSNLKK...
1,P98196,AT11A_HUMAN,reviewed,Probable phospholipid-transporting ATPase IH (...,ATP11A ATPIH ATPIS KIAA1021,Homo sapiens (Human),1134,AB028944;,,9606.ENSP00000283558;,MDCSLVRTLVHRYCAGEENWVDSRTIYVGHREPPPGAEAYIPQRYP...
2,P50993,AT1A2_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit a...,ATP1A2 KIAA0778,Homo sapiens (Human),1020,J05096;AB018321;CH471121;CH471121;BC052271;M16...,CHEMBL2095186;,9606.ENSP00000354490;,MGRGAGREYSPAATTAENGGGKKKQKEKELDELKKEVAMDDHKLSL...
3,Q13733,AT1A4_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit a...,ATP1A4 ATP1AL2,Homo sapiens (Human),1029,AF506797;AF421887;AF310646;AF430843;AF390027;A...,CHEMBL2095186;,9606.ENSP00000357060;,MGLWGKKGTVAPHDQSPRRRPKKGLIKKKMVKREKQKRNMEELKKE...
4,P05026,AT1B1_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit b...,ATP1B1 ATP1B,Homo sapiens (Human),303,X03747;M25160;M25161;U16799;BT009787;AL031726;...,CHEMBL2095186;,9606.ENSP00000356789;,MARGKAKEEGSWKKFIWNSEKKEFLGRTGGSWFKILLFYVIFYGCL...


### Function to fix the IDs

In [24]:
def id_fix(id_to_fix):
    if (type(id_to_fix) is str):
        return id_to_fix[:-1]
    return id_to_fix

In [25]:
uniprot_proteome_df['Cross-reference (STRING)'] = uniprot_proteome_df['Cross-reference (STRING)'].apply(id_fix)
uniprot_proteome_df.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Cross-reference (EMBL),Cross-reference (ChEMBL),Cross-reference (STRING),Sequence
0,L0R819,ASURF_HUMAN,reviewed,ASNSD1 upstream open reading frame protein (AS...,ASDURF,Homo sapiens (Human),96,HF547970;,,,MPSRGTRPEDSSVLIPTDNSTPHKEDLSSKIKEQKIVVDELSNLKK...
1,P98196,AT11A_HUMAN,reviewed,Probable phospholipid-transporting ATPase IH (...,ATP11A ATPIH ATPIS KIAA1021,Homo sapiens (Human),1134,AB028944;,,9606.ENSP00000283558,MDCSLVRTLVHRYCAGEENWVDSRTIYVGHREPPPGAEAYIPQRYP...
2,P50993,AT1A2_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit a...,ATP1A2 KIAA0778,Homo sapiens (Human),1020,J05096;AB018321;CH471121;CH471121;BC052271;M16...,CHEMBL2095186;,9606.ENSP00000354490,MGRGAGREYSPAATTAENGGGKKKQKEKELDELKKEVAMDDHKLSL...
3,Q13733,AT1A4_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit a...,ATP1A4 ATP1AL2,Homo sapiens (Human),1029,AF506797;AF421887;AF310646;AF430843;AF390027;A...,CHEMBL2095186;,9606.ENSP00000357060,MGLWGKKGTVAPHDQSPRRRPKKGLIKKKMVKREKQKRNMEELKKE...
4,P05026,AT1B1_HUMAN,reviewed,Sodium/potassium-transporting ATPase subunit b...,ATP1B1 ATP1B,Homo sapiens (Human),303,X03747;M25160;M25161;U16799;BT009787;AL031726;...,CHEMBL2095186;,9606.ENSP00000356789,MARGKAKEEGSWKKFIWNSEKKEFLGRTGGSWFKILLFYVIFYGCL...


### We build a dataframe with all the columns including drugs chemical structures and proteins sequences

In [26]:
data_formated = {'drug_pubchem_id': [],
                 'drug_name': [],
                 'molecular_formula': [],
                 'smiles': [],
                 'hba': [],
                 'hbd': [],
                 'heavy_atoms': [],
                 'mw_freebase': [],
                 'mw_monoisotopic': [],
                 'complexity': [],
                 'tpsa': [],
                 'exact_mass': [],
                 'target_type': [],
                 'protein_sequence': [],
                 'sequence_length': [],
                 'protein_string_id': [],
                 'protein_chembl_id': [],
                 'protein_uniprot_id': [],
                 'protein_name': [],
                 'interaction_type': []
                 }

In [27]:
proteome_sequences_filtered = uniprot_proteome_df.loc[uniprot_proteome_df['Cross-reference (STRING)'].notnull()]
proteome_sequences_filtered_indexed = proteome_sequences_filtered.set_index(["Cross-reference (STRING)", "Entry", "Cross-reference (ChEMBL)"])

pubchem_ids_indexed = pubchem_df.set_index(['drug_pubchem_id'])

for row_index in range(matador_data_df.shape[0]):
    drug_pubchem_id = matador_data_df.iloc[row_index]['chemical_id']
    drug_name = matador_data_df.iloc[row_index]['chemical_name']
    
    if(drug_pubchem_id not in pubchem_ids_indexed.index):
        continue
    
    
    protein_string_id = matador_data_df.iloc[row_index]['protein_id']
    
    if(protein_string_id not in proteome_sequences_filtered_indexed.index.get_level_values('Cross-reference (STRING)')):
        continue
    
    protein_uniprot_id, protein_chembl_id = proteome_sequences_filtered_indexed.loc[protein_string_id].index.values[0]
    
    protein_sequence = proteome_sequences_filtered_indexed.loc[protein_string_id, protein_uniprot_id, protein_chembl_id]['Sequence']
    sequence_length = proteome_sequences_filtered_indexed.loc[protein_string_id, protein_uniprot_id, protein_chembl_id]['Length']
    
    interaction_type = matador_data_df.iloc[row_index]['matador_annotation']
    protein_name = matador_data_df.iloc[row_index]['protein_name']
    
    mw_freebase = pubchem_ids_indexed.loc[drug_pubchem_id, 'mw_freebase']
    hba = pubchem_ids_indexed.loc[drug_pubchem_id, 'hba']
    hbd = pubchem_ids_indexed.loc[drug_pubchem_id, 'hbd']
    tpsa = pubchem_ids_indexed.loc[drug_pubchem_id, 'tpsa']
    heavy_atoms = pubchem_ids_indexed.loc[drug_pubchem_id, 'heavy_atoms']
    mw_monoisotopic = pubchem_ids_indexed.loc[drug_pubchem_id, 'mw_monoisotopic']
    exact_mass = pubchem_ids_indexed.loc[drug_pubchem_id, 'exact_mass']
    complexity = pubchem_ids_indexed.loc[drug_pubchem_id, 'complexity']
    molecular_formula = pubchem_ids_indexed.loc[drug_pubchem_id, 'molecular_formula']
    smiles = pubchem_ids_indexed.loc[drug_pubchem_id, 'smiles']

    data_formated['drug_pubchem_id'].append(drug_pubchem_id)
    data_formated['drug_name'].append(drug_name)
    data_formated['smiles'].append(smiles)
    data_formated['mw_freebase'].append(mw_freebase)
    data_formated['hba'].append(hba)
    data_formated['hbd'].append(hbd)
    data_formated['tpsa'].append(tpsa)
    data_formated['heavy_atoms'].append(heavy_atoms)
    data_formated['mw_monoisotopic'].append(mw_monoisotopic)
    data_formated['exact_mass'].append(exact_mass)
    data_formated['target_type'].append('XXXXX')
    data_formated['protein_sequence'].append(protein_sequence)
    data_formated['sequence_length'].append(sequence_length)
    data_formated['protein_string_id'].append(protein_string_id)
    data_formated['protein_chembl_id'].append(protein_chembl_id)
    data_formated['protein_uniprot_id'].append(protein_uniprot_id)
    data_formated['protein_name'].append(protein_name)
    data_formated['interaction_type'].append(interaction_type)
    data_formated['complexity'].append(complexity)
    data_formated['molecular_formula'].append(molecular_formula)

In [28]:
data_all_cols = pd.DataFrame(data=data_formated)
data_all_cols.head()

Unnamed: 0,complexity,drug_name,drug_pubchem_id,exact_mass,hba,hbd,heavy_atoms,interaction_type,molecular_formula,mw_freebase,mw_monoisotopic,protein_chembl_id,protein_name,protein_sequence,protein_string_id,protein_uniprot_id,sequence_length,smiles,target_type,tpsa
0,126,gold sodium thiomalate,11954225,367.939,1,5,11,INDIRECT,C4H4AuNaO4S,368.088,367.939,CHEMBL4929;,APCS,MNKPLLWISVLTSLLEAFAHTDLSGKVFVFPRESVTDHVNLITPLE...,9606.ENSP00000255040,P02743,223,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
1,126,gold sodium thiomalate,11954225,367.939,1,5,11,INDIRECT,C4H4AuNaO4S,368.088,367.939,,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,9606.ENSP00000273550,P02794,183,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
2,126,gold sodium thiomalate,11954225,367.939,1,5,11,INDIRECT,C4H4AuNaO4S,368.088,367.939,CHEMBL2364709;,FGG,MSWSLHPRNLILYFYALLFLSSTCVAYVATRDNCCILDERFGSYCP...,9606.ENSP00000336829,P02679,453,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
3,126,gold sodium thiomalate,11954225,367.939,1,5,11,INDIRECT,C4H4AuNaO4S,368.088,367.939,,SERPINA1,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,9606.ENSP00000348068,P01009,418,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
4,126,gold sodium thiomalate,11954225,367.939,1,5,11,INDIRECT,C4H4AuNaO4S,368.088,367.939,,LCN2,MPLGLLWLGLALLGALHAQAQDSTSDLIPAPPLSKVPLQQNFQDNQ...,9606.ENSP00000277480,P80188,198,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3


In [29]:
def fix_interaction(interaction):
    return 1 if interaction == 'DIRECT' else 2

In [30]:
data_all_cols['interaction_type'] = data_all_cols['interaction_type'].apply(fix_interaction)

In [33]:
data_all_cols.describe()

Unnamed: 0,complexity,exact_mass,hba,hbd,heavy_atoms,interaction_type,mw_freebase,mw_monoisotopic,sequence_length,tpsa
count,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0
mean,486.947213,341.025707,1.615939,4.874855,23.898159,1.415604,341.382443,341.022621,652.964594,78.489211
std,317.939326,150.608786,1.68683,2.810945,10.835984,0.492858,150.681281,150.598932,644.777257,51.97025
min,8.0,75.032,0.0,0.0,4.0,1.0,75.067,75.032,30.0,0.0
25%,280.5,238.132,1.0,3.0,17.0,1.0,238.287,238.132,379.0,45.15
50%,432.0,309.119,1.0,4.0,22.0,1.0,309.332,309.119,494.0,66.6
75%,608.0,405.186,2.0,6.0,29.0,2.0,405.966,405.186,669.0,96.2
max,3740.0,1579.666,16.0,27.0,109.0,2.0,1580.616,1579.666,5202.0,568.0


### Now we save the resulting dataset

In [34]:
data_all_cols.to_csv('../../datasets/matador_crossed.csv', index=False)