In [1]:
import pandas as pd
import numpy as np

In [51]:
# import data 
sider = pd.read_csv("~/Desktop/Harvard/Thesis/Data/SIDER/meddra_all_se.tsv", sep='\t', header= None, 
                           names = ['stitch_id', 'stitch_id_stereochem', 'umls_id_label', 'meddra_concept_type', 
                                    'umls_id_meddra', 'side_effect'])

# make new column with pubmed ID 
# stitchID with CID0 has stereochemistry, use that to determine pubchem_id
sider['pubchem_id'] = sider['stitch_id_stereochem'].str.split('ID0').str.get(1)

In [52]:
sider.head()

Unnamed: 0,stitch_id,stitch_id_stereochem,umls_id_label,meddra_concept_type,umls_id_meddra,side_effect,pubchem_id
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,10917
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,10917
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,10917
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,10917
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,10917


# Output file with drug-se pairs from SIDER

In [46]:
sider = sider.drop(['stitch_id', 'stitch_id_stereochem', 'umls_id_label', 'meddra_concept_type', 
                                    'umls_id_meddra'], axis=1)

In [47]:
sider.head()

Unnamed: 0,side_effect,pubchem_id
0,Abdominal cramps,10917
1,Abdominal pain,10917
2,Abdominal pain,10917
3,Gastrointestinal pain,10917
4,Abdominal pain,10917


In [48]:
pubchemID = [int(pc) for pc in sider.pubchem_id]
sider['pubchem_ID'] = pubchemID
sider = sider.drop(['pubchem_id'], axis = 1)

In [49]:
sider.head()

Unnamed: 0,side_effect,pubchem_ID
0,Abdominal cramps,10917
1,Abdominal pain,10917
2,Abdominal pain,10917
3,Gastrointestinal pain,10917
4,Abdominal pain,10917


In [50]:
sider = sider.drop_duplicates()

In [51]:
sider.head(10)

Unnamed: 0,side_effect,pubchem_ID
0,Abdominal cramps,10917
1,Abdominal pain,10917
3,Gastrointestinal pain,10917
5,Amblyopia,10917
7,Anaemia,10917
9,Anorexia,10917
10,Decreased appetite,10917
11,Anxiety,10917
13,Arrhythmia,10917
15,Asthenia,10917


In [52]:
sider.columns.tolist()

['side_effect', 'pubchem_ID']

In [53]:
sider = sider[['pubchem_ID','side_effect' ]]

In [54]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
0,10917,Abdominal cramps
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
5,10917,Amblyopia
7,10917,Anaemia


In [55]:
#sider.to_csv("~/Desktop/Harvard/Thesis/Data/SIDER/pubchem_se_sider.txt", sep = '\t', index = False)

# Output file with drug-se pairs from SIDER using Preferred-Term Pairs only 

In [53]:
sider = sider.drop(['stitch_id', 'stitch_id_stereochem', 'umls_id_label', 
                                    'umls_id_meddra'], axis=1)

In [54]:
pubchemID = [int(pc) for pc in sider.pubchem_id]
sider['pubchem_ID'] = pubchemID
sider = sider.drop(['pubchem_id'], axis = 1)

In [55]:
sider.head()

Unnamed: 0,meddra_concept_type,side_effect,pubchem_ID
0,LLT,Abdominal cramps,10917
1,PT,Abdominal pain,10917
2,LLT,Abdominal pain,10917
3,PT,Gastrointestinal pain,10917
4,PT,Abdominal pain,10917


In [56]:
len(sider)

309849

In [57]:
## remove all rows for which the concept type is not equal to PT 
sider = sider[sider['meddra_concept_type'] =='PT']

In [58]:
sider.head()

Unnamed: 0,meddra_concept_type,side_effect,pubchem_ID
1,PT,Abdominal pain,10917
3,PT,Gastrointestinal pain,10917
4,PT,Abdominal pain,10917
6,PT,Amblyopia,10917
8,PT,Anaemia,10917


In [59]:
len(sider)

163206

In [60]:
sider = sider.drop(['meddra_concept_type'], axis = 1)

In [61]:
sider = sider[['pubchem_ID','side_effect' ]]

In [62]:
sider = sider.drop_duplicates()

In [63]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
6,10917,Amblyopia
8,10917,Anaemia
10,10917,Decreased appetite


In [68]:
#sider.to_csv("~/Desktop/Harvard/Thesis/Data/SIDER/pubchem_sePT_sider.txt", sep = '\t', index = False)

# Output file with drug-se pairs SIDER using DrugBank overlap terms only

In [35]:
# import data 
sider_db_overlap = pd.read_csv("~/Desktop/Harvard/Thesis/Data/SIDER/smiles/sider_drugbank_smiles_overlap.txt", 
                    sep='\t')

In [71]:
sider = sider.drop(['stitch_id', 'stitch_id_stereochem', 'umls_id_label', 'meddra_concept_type', 
                                    'umls_id_meddra'], axis=1)

In [17]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
6,10917,Amblyopia
8,10917,Anaemia
10,10917,Decreased appetite


In [36]:
sider_db_overlap.head()

Unnamed: 0,pubchem_id,SMILES[1],SMILES[2],Canonical_Smiles,DrugID,Name,Status,Type
0,10917,C[N+](C)(C)C[C@@H](CC(=O)[O-])O,C[N+](C)(C)C[C@H](O)CC([O-])=O,C[N+](C)(C)C[C@H](O)CC(=O)[O-],DB00583,L-Carnitine,approved,small molecule
1,119,C(CC(=O)O)CN,NCCCC(O)=O,NCCCC(=O)O,DB02530,Gamma(Amino)-Butyric Acid,experimental,small molecule
2,137,C(CC(=O)O)C(=O)CN,NCC(=O)CCC(O)=O,NCC(=O)CCC(=O)O,DB00855,Aminolevulinic acid,approved,small molecule
3,444412,CN1[C@H](CNC2=C1C(=O)N=C(N2)N)CNC3=CC=C(C=C3)C...,CN1[C@@H](CNC2=CC=C(C=C2)C(=O)N[C@@H](CCC(O)=O...,CN1[C@@H](CNc2ccc(cc2)C(=O)N[C@@H](CCC(=O)O)C(...,DB04789,5-methyltetrahydrofolate,experimental,small molecule
4,5280360,CCCCC[C@@H](/C=C/[C@H]1[C@@H](CC(=O)[C@@H]1C/C...,CCCCC[C@H](O)\C=C\[C@H]1[C@H](O)CC(=O)[C@@H]1C...,CCCCC[C@H](O)\C=C\[C@H]1[C@H](O)CC(=O)[C@@H]1C...,DB00917,Dinoprostone,approved,small molecule


In [74]:
pubchemID = [int(pc) for pc in sider.pubchem_id]
sider['pubchem_ID'] = pubchemID
sider = sider.drop(['pubchem_id'], axis = 1)

In [37]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
6,10917,Amblyopia
8,10917,Anaemia
10,10917,Decreased appetite


In [38]:
type(sider_db_overlap.pubchem_id[3])

numpy.int64

In [39]:
type(sider.pubchem_ID[3])

numpy.int64

In [40]:
overlap = list(sider_db_overlap.pubchem_id)

In [44]:
sider = sider[sider['pubchem_ID'].isin(overlap) == True]

In [80]:
sider = sider[['pubchem_ID','side_effect' ]]

In [42]:
sider = sider.drop_duplicates()

In [43]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
6,10917,Amblyopia
8,10917,Anaemia
10,10917,Decreased appetite


In [45]:
len(sider)

92649

In [46]:
10917 in overlap

True

In [85]:
#sider.to_csv("~/Desktop/Harvard/Thesis/Data/SIDER/pubchem_se_dbOverlap_sider.txt", sep = '\t', index = False)

In [47]:
len(sider.pubchem_ID.unique())

821

# Output file with drug-se pairs SIDER using DrugBank PT overlap terms only

In [64]:
sider.head()

Unnamed: 0,pubchem_ID,side_effect
1,10917,Abdominal pain
3,10917,Gastrointestinal pain
6,10917,Amblyopia
8,10917,Anaemia
10,10917,Decreased appetite


In [2]:
dt = pd.read_csv("~/Desktop/Harvard/Thesis/Data/SIDER/smiles/drug_target_pairings.txt", sep='\t')

In [66]:
dt.head()

Unnamed: 0,pubchem_id,uniprot_target
0,10917,Q9H015
1,10917,O76082
2,10917,P43155
3,10917,Q8N8R3
4,10917,O43772


In [67]:
sider = sider[sider['pubchem_ID'].isin(list(dt.pubchem_id.unique())) == True]

In [68]:
len(sider.pubchem_ID.unique())

755

In [4]:
sider = pd.read_csv("~/Desktop/Harvard/Thesis/Data/SIDER/pubchem_se_siderPT_DB.txt", sep = '\t')

In [6]:
len(sider.pubchem_ID.unique())

755

In [7]:
len(dt.pubchem_id.unique())

755

In [3]:
#sider.to_csv("~/Desktop/Harvard/Thesis/Data/SIDER/pubchem_se_siderPT_DB.txt", sep = '\t', index = False)

NameError: name 'sider' is not defined