In [15]:
import pandas as pd
import os
from zipfile import ZipFile 
import re
import collections
import numpy as np
import xmltodict
import gzip

In [16]:
ENSP_geneid_path = os.path.join('../data/raw', 'ENSP_geneid.csv')
ENSP_geneid = pd.read_csv(ENSP_geneid_path)

In [17]:
gene_mapping = {ensp:geneid for (ensp, geneid) in zip(ENSP_geneid['initial_alias'], (ENSP_geneid['name']))}

### get possible genes, disease drugs from other file

In [5]:
all_genes = pd.read_csv('../data/clean/genes.nodes',header=None)

In [6]:
all_genes.head()

Unnamed: 0,0
0,A1BG
1,A1CF
2,AACS
3,AADAC
4,AADACL2


In [7]:
all_disease = pd.read_csv('../data/clean/diseases.nodes',header=None)
all_disease.head()

Unnamed: 0,0
0,C0000737
1,C0000810
2,C0000880
3,C0001126
4,C0001144


In [8]:
all_drugs = pd.read_csv('../data/clean/drugs.nodes',header=None)
all_drugs

Unnamed: 0,0
0,1
1,137
2,174
3,176
4,204
...,...
2518,139031013
2519,139266768
2520,139595263
2521,145712321


### Disgenet

In [9]:
disgenet_path = os.path.join('../data/raw', 'curated_gene_disease_associations.tsv.gz')
with gzip.open(disgenet_path) as file:
    disgenet = pd.read_csv(file, sep='\t', engine='python')

In [10]:
disgenet = disgenet[['geneSymbol', 'diseaseId', 'diseaseSemanticType']]

In [11]:
# filter genes
disgenet = disgenet[(disgenet['geneSymbol'].isin(all_genes[0]))]

# filter disease
disgenet = disgenet[disgenet['diseaseId'].isin(all_disease[0])]

In [12]:
disgenet

Unnamed: 0,geneSymbol,diseaseId,diseaseSemanticType
1,A1BG,C0036341,Mental or Behavioral Dysfunction
28,NAT1,C0001973,Mental or Behavioral Dysfunction
30,NAT1,C0005684,Neoplastic Process
37,NAT1,C0033578,Neoplastic Process
41,NAT1,C0376358,Neoplastic Process
...,...,...,...
81564,OCLN,C0006142,Neoplastic Process
81567,OCLN,C0007820,Disease or Syndrome
81568,OCLN,C0019156,Disease or Syndrome
81576,OCLN,C0678222,Neoplastic Process


In [13]:
disgenet[['geneSymbol', 'diseaseId']].to_csv('../data/clean/gene-disease-association.edges',header=None,index=False)

After filtering, only 57526 overlap with STITCH and drugcentral

### SIDER

In [347]:
sider_path = os.path.join('../data/raw', 'meddra_all_se.tsv.gz')
with gzip.open(sider_path) as file:
    sider = pd.read_csv(file, sep='\t', engine='python',header=None)

In [348]:
sider = sider[[0, 1, 5]]
sider.columns = ['drug1', 'drug2', 'side_effect']

In [349]:
sider['drug1'] = pd.to_numeric(sider['drug1'].map(lambda x: x[4:]))
sider['drug2'] = pd.to_numeric(sider['drug2'].map(lambda x: x[4:]))

In [354]:
# drop duplicates
sider =sider.iloc[sider[['drug1','drug2']].drop_duplicates().index]

In [355]:
sider

Unnamed: 0,drug1,drug2,side_effect
0,85,10917,Abdominal cramps
141,119,119,Anaphylactic shock
151,137,137,Anaemia
322,143,143,Alopecia
382,143,6006,Alopecia
...,...,...,...
308303,56603655,56603655,Anaphylactic shock
308570,56842239,56842239,Anaphylactic shock
308597,70683024,70683024,Angioedema
308668,70695640,70695640,Abdominal pain


In [344]:
sider = sider[(sider['drug1'].isin(all_drugs[0]))]
sider = sider[(sider['drug2'].isin(all_drugs[0]))]

In [258]:
sider['drug1'] = sider['drug1'].apply(lambda x:int(x))
# sider['drug2'] = sider['drug2'].apply(lambda x:int(x))

In [248]:
(sider['drug1'] == sider['drug2']).sum() 

130135

In [356]:
(all_drugs[0] == 119).sum()

0

### StringDB

In [18]:
stringdb_path = os.path.join('../data/raw', '9606.protein.actions.v11.0.txt.gz')
with gzip.open(stringdb_path) as file:
    stringdb_actions = pd.read_csv(file, sep='\s', engine='python')

In [19]:
stringdb_actions

Unnamed: 0,item_id_a,item_id_b,mode,action,is_directional,a_is_acting,score
0,9606.ENSP00000000233,9606.ENSP00000216366,binding,,f,f,165
1,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,f,f,165
2,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,f,165
3,9606.ENSP00000000233,9606.ENSP00000216366,reaction,,t,t,165
4,9606.ENSP00000000233,9606.ENSP00000222547,binding,,f,f,913
...,...,...,...,...,...,...,...
3470901,9606.ENSP00000485678,9606.ENSP00000409581,inhibition,inhibition,f,f,600
3470902,9606.ENSP00000485678,9606.ENSP00000409581,ptmod,,f,f,600
3470903,9606.ENSP00000485678,9606.ENSP00000438346,activation,activation,t,f,900
3470904,9606.ENSP00000485678,9606.ENSP00000481878,activation,activation,f,f,600


In [20]:
pd.unique(stringdb_actions['mode'])

array(['binding', 'reaction', 'catalysis', 'activation', 'inhibition',
       'ptmod', 'expression'], dtype=object)

In [21]:
stringdb_actions = stringdb_actions[['item_id_a', 'item_id_b', 'mode']]
stringdb_actions.columns = ['gene1','gene2','mode']

In [22]:
stringdb_actions['gene1'] = stringdb_actions['gene1'].str.split('.',expand=True)[1]
stringdb_actions['gene2'] = stringdb_actions['gene2'].str.split('.',expand=True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
stringdb_actions['gene1'] = stringdb_actions['gene1'].map(gene_mapping)
stringdb_actions['gene2'] = stringdb_actions['gene2'].map(gene_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
# filter genes 1
stringdb_actions = stringdb_actions[(stringdb_actions['gene1'].isin(all_genes[0]))]

# filter disease
stringdb_actions = stringdb_actions[(stringdb_actions['gene2'].isin(all_genes[0]))]

In [25]:
stringdb_actions = stringdb_actions.drop_duplicates()

In [26]:
stringdb_actions.to_csv('../data/clean/gene-gene-association.edges',header=None, index=False)