In [142]:
import pandas as pd
import os
from zipfile import ZipFile 
import re
import collections
import numpy as np
import xmltodict
import gzip
import pubchempy as pcp

In [143]:
ENSP_geneid_path = os.path.join('../data/raw', 'ENSP_geneid.csv')
ENSP_geneid = pd.read_csv(ENSP_geneid_path)

In [144]:
gene_mapping = {ensp:geneid for (ensp, geneid) in zip(ENSP_geneid['initial_alias'], (ENSP_geneid['name']))}

In [145]:
db_vocab = pd.read_csv('../data/raw/drugbank_vocabulary.csv')
db_vocab['Common name'] = db_vocab['Common name'].str.lower()
db_vocab['Synonyms'] = db_vocab['Synonyms'].str.lower()
db_vocab

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
0,DB00001,BIOD00024 | BTD00024,lepirudin,138068-37-8,Y43GF64R34,hirudin variant-1 | lepirudin recombinant,
1,DB00002,BIOD00071 | BTD00071,cetuximab,205923-56-4,PQX0D8J21J,cetuximab | cétuximab | cetuximabum,
2,DB00003,BIOD00001 | BTD00001,dornase alfa,143831-71-4,953A26OA1Y,deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,denileukin diftitox,173146-27-5,25E79B5CTM,denileukin | interleukin-2/diptheria toxin fus...,
4,DB00005,BIOD00052 | BTD00052,etanercept,185243-69-0,OP401G7OJC,etanercept | etanercept-szzs | etanercept-ykro...,
...,...,...,...,...,...,...,...
13558,DB15672,,rilematovir,1383450-81-4,NQ99E8OH3P,rilematovir,GTQTUABHRCWVLL-UHFFFAOYSA-N
13559,DB15673,,lenacapavir,2189684-44-2,A9A0O6FB4H,lenacapavir,BRYXUCLEHAUSDY-WEWMWRJBSA-N
13560,DB15674,,sisunatovir,1903763-82-5,KE63TTO7WK,sisunatovir,JOPCJJSYRPUEDS-UHFFFAOYSA-N
13561,DB15675,,baloxavir,1985605-59-1,4G86Y4JT3F,baloxavir,FIDLLEYNNRGVFR-CTNGQTDRSA-N


In [146]:
sider_names = pd.read_csv('../data/raw/sider_drug_names.tsv', sep='\t', header=None)
sider_names[0] = sider_names[0].str.split('CID1',expand=True)[1]
sider_names_dict = {drug_id:drug_name for (drug_id, drug_name) in zip(sider_names[0], sider_names[1])}

### get possible genes, disease drugs from other file

In [92]:
all_genes = pd.read_csv('../data/clean/genes.nodes',header=None)

In [96]:
all_genes[0]

0             1
1            12
2            17
3            1A
4             4
          ...  
17317      ZXDC
17318    ZYG11A
17319    ZYG11B
17320       ZYX
17321     ZZEF1
Name: 0, Length: 17322, dtype: object

In [7]:
all_disease = pd.read_csv('../data/clean/diseases.nodes',header=None)
all_disease.head()

Unnamed: 0,0
0,C0000737
1,C0000810
2,C0000880
3,C0001126
4,C0001144


In [8]:
all_drugs = pd.read_csv('../data/clean/drugs.nodes',header=None)
all_drugs

Unnamed: 0,0
0,1
1,137
2,174
3,176
4,204
...,...
2518,139031013
2519,139266768
2520,139595263
2521,145712321


### Disgenet

In [147]:
disgenet_path = os.path.join('../data/raw', 'curated_gene_disease_associations.tsv.gz')
with gzip.open(disgenet_path) as file:
    disgenet = pd.read_csv(file, sep='\t', engine='python')

In [148]:
disgenet = disgenet[['geneSymbol', 'diseaseId', 'diseaseSemanticType']]

In [149]:
update_node_file(disgenet['geneSymbol'].tolist(), node_file='../data/clean/genes.nodes')
update_node_file(disgenet['diseaseId'].tolist(), node_file='../data/clean/diseases.nodes')

In [150]:
# # filter genes
# disgenet = disgenet[(disgenet['geneSymbol'].isin(all_genes[0]))]

# # filter disease
# disgenet = disgenet[disgenet['diseaseId'].isin(all_disease[0])]

In [151]:
disgenet

Unnamed: 0,geneSymbol,diseaseId,diseaseSemanticType
0,A1BG,C0019209,Finding
1,A1BG,C0036341,Mental or Behavioral Dysfunction
2,A2M,C0002395,Disease or Syndrome
3,A2M,C0007102,Neoplastic Process
4,A2M,C0009375,Neoplastic Process
...,...,...,...
81741,HBB-LCR,C0002875,Disease or Syndrome
81742,HBB-LCR,C0005283,Disease or Syndrome
81743,HBB-LCR,C0019025,Disease or Syndrome
81744,HBB-LCR,C0085578,Disease or Syndrome


In [152]:
disgenet[['geneSymbol', 'diseaseId']].to_csv('../data/clean/gene-disease-association.edges',header=None,index=False)

After filtering, only 57526 overlap with STITCH and drugcentral

### Use Side effect data from Decagon

In [153]:
decagon_path = os.path.join('../data/raw', 'bio-decagon-combo.tar.gz')
with gzip.open(decagon_path) as file:
    decagon = pd.read_csv(file)
decagon.columns = ['drug1','drug2','se_id', 'se']

In [154]:
decagon = decagon.dropna()
duplicate = decagon[['drug1', 'drug2']]
decagon = duplicate.drop_duplicates()

In [155]:
decagon['drug1'] = decagon['drug1'].str.split('CID0',expand=True)[1]
decagon['drug2'] = decagon['drug2'].str.split('CID0',expand=True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [298]:
# decagon['drug1'] = decagon['drug1'].map(sider_names_dict)
# decagon['drug2'] = decagon['drug2'].map(sider_names_dict)

In [156]:
decagon['drug1'] = decagon['drug1'].map(lambda x: int(x))
decagon['drug2'] = decagon['drug2'].map(lambda x: int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [157]:
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pd.read_table(url)

In [158]:
drugbank_vocab_dict = {pubid:drugbank for (pubid,drugbank) in 
                       zip(drugbank_map_df['pubchem_id'], drugbank_map_df['drugbank_id'])}

map to Drugbank id

In [159]:
decagon['drug1'] = decagon['drug1'].map(drugbank_vocab_dict)
decagon['drug2'] = decagon['drug2'].map(drugbank_vocab_dict)
# decagon_final = decagon.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [160]:
decagon = decagon.dropna()

In [161]:
decagon_final = decagon.drop_duplicates()

In [163]:
decagon_final

Unnamed: 0,drug1,drug2
0,DB00415,DB00813
56,DB01236,DB01223
68,DB00601,DB00303
92,DB08298,DB00811
280,DB01173,DB00675
...,...,...
4649065,DB00175,DB00225
4649085,DB00343,DB01039
4649252,DB00490,DB00796
4649335,DB00762,DB01020


In [164]:
update_node_file(decagon_final['drug1'].tolist(), node_file='../data/clean/drugs.nodes')
update_node_file(decagon_final['drug2'].tolist(), node_file='../data/clean/drugs.nodes')

In [162]:
decagon_final.to_csv('../data/clean/drug-drug-side_effects.edges',header=None, index=False)

### StringDB

In [165]:
stringdb_path = os.path.join('../data/raw', '9606.protein.actions.v11.0.txt.gz')
with gzip.open(stringdb_path) as file:
    stringdb_actions = pd.read_csv(file, sep='\s', engine='python')

In [166]:
stringdb_actions

Unnamed: 0,item_id_a,item_id_b,mode,action,is_directional,a_is_acting,score
0,9606.ENSP00000000233,9606.ENSP00000216366,binding,,f,f,165
1,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,f,f,165
2,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,f,165
3,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,t,165
4,9606.ENSP00000000233,9606.ENSP00000222547,binding,,f,f,913
...,...,...,...,...,...,...,...
3470901,9606.ENSP00000485678,9606.ENSP00000409581,inhibition,inhibition,f,f,600
3470902,9606.ENSP00000485678,9606.ENSP00000409581,ptmod,,f,f,600
3470903,9606.ENSP00000485678,9606.ENSP00000438346,activation,activation,t,f,900
3470904,9606.ENSP00000485678,9606.ENSP00000481878,activation,activation,f,f,600


In [167]:
pd.unique(stringdb_actions['mode'])

array(['binding', 'reaction', 'catalysis', 'activation', 'inhibition',
       'ptmod', 'expression'], dtype=object)

In [168]:
stringdb_actions = stringdb_actions[['item_id_a', 'item_id_b', 'mode']]
stringdb_actions.columns = ['gene1','gene2','mode']

In [169]:
stringdb_actions['gene1'] = stringdb_actions['gene1'].str.split('.',expand=True)[1]
stringdb_actions['gene2'] = stringdb_actions['gene2'].str.split('.',expand=True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [170]:
stringdb_actions['gene1'] = stringdb_actions['gene1'].map(gene_mapping)
stringdb_actions['gene2'] = stringdb_actions['gene2'].map(gene_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [171]:
stringdb_actions = stringdb_actions[~(stringdb_actions['gene1'] == 'None')]
stringdb_actions = stringdb_actions[~(stringdb_actions['gene2'] == 'None')]

In [172]:
temp = stringdb_actions[['gene1', 'gene2']]
index = temp.drop_duplicates().index

In [173]:
stringdb_actions = stringdb_actions.loc[index]

In [174]:
## add new nodes to genes.nodes
update_node_file(stringdb_actions['gene1'].tolist(), node_file='../data/clean/genes.nodes')
update_node_file(stringdb_actions['gene2'].tolist(), node_file='../data/clean/genes.nodes')

In [175]:
stringdb_actions.to_csv('../data/clean/gene-gene-association.edges',header=None, index=False)


### check gene nodes and add them if neccessary

In [1]:
def update_node_file(new_nodes, node_file='../data/clean/diseases.nodes'):
    # grab the current existing nodes from the file
    if os.path.getsize(node_file) > 0:
        curr_nodes = pd.read_table(node_file, header=None).iloc[:,0].tolist()
    else:
        curr_nodes = []
    # add the new nodes, remove duplicates, and sort
    curr_nodes = curr_nodes + new_nodes
    curr_nodes = list(set(curr_nodes))
    curr_nodes.sort()
    # update the file
    curr_nodes = pd.DataFrame({ 'nodes': curr_nodes })
    curr_nodes.to_csv(node_file, header=False, index=False)

['ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 'ARF5',
 