In [1]:
# Import libraries

import pandas as pd
import numpy as np

In [2]:
# Let wide frames display (up to) 1000 columns instead of being truncated.
# One call is enough: an earlier `None` setting for the same option was
# overwritten immediately by this one, so it had no effect.
pd.set_option('display.max_columns', 1000)

In [3]:
# Root folder for every input/output file of this notebook. It ends with a
# single separator so that `data_dir + 'sider/...'` gives a clean relative path.
data_dir = 'data/'


In [4]:
# Load the SIDER drug / side-effect pairs.
# The TSV has no header row, so the six column names are supplied here.
meddra_all_se = pd.read_csv(
    data_dir + 'sider/meddra_all_se.tsv',
    sep='\t',
    header=None,
    names=[
        'STITCH_FLAT',
        'STITCH_STEREO',
        'UMLS_CONCEPT_ID_LABEL',
        'MEDDRA_CONCEPT_TYPE',
        'UMLS_CONCEPT_ID_MEDDDRA',
        'SE',
    ],
)

In [5]:
# Preview the first rows of the side-effect table.
meddra_all_se.head()

Unnamed: 0,STITCH_FLAT,STITCH_STEREO,UMLS_CONCEPT_ID_LABEL,MEDDRA_CONCEPT_TYPE,UMLS_CONCEPT_ID_MEDDDRA,SE
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [6]:
# (rows, columns) of the side-effect table.
meddra_all_se.shape

(309849, 6)

In [7]:
# Number of distinct values in each column.
meddra_all_se.nunique()


STITCH_FLAT                1430
STITCH_STEREO              1556
UMLS_CONCEPT_ID_LABEL      5868
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    6060
SE                         6123
dtype: int64

In [8]:
# top side effects - PT
# Keep the rows whose MEDDRA_CONCEPT_TYPE is 'PT', then count how many rows
# carry each side-effect name and rank the names from most to least frequent.
all_se_pt = meddra_all_se.loc[meddra_all_se['MEDDRA_CONCEPT_TYPE'].eq('PT')]

top_se_pt = (
    all_se_pt
    .groupby('SE')['SE']
    .count()
    .sort_values(ascending=False)
)

In [9]:
# Five most frequent side-effect names among the PT rows.
top_se_pt.head()

SE
Dizziness     1758
Asthenia      1513
Dermatitis    1433
Nausea        1302
Rash          1250
Name: SE, dtype: int64

In [10]:
# top side effects - LLT
# Rows whose MEDDRA_CONCEPT_TYPE is 'LLT', ranked by how often each
# side-effect name occurs (most frequent first).
is_llt = meddra_all_se['MEDDRA_CONCEPT_TYPE'] == 'LLT'
all_se_llt = meddra_all_se[is_llt]

se_counts_llt = all_se_llt.groupby('SE')['SE'].count()
top_se_llt = se_counts_llt.sort_values(ascending=False)

In [11]:
# Five most frequent side-effect names among the LLT rows.
top_se_llt.head()

SE
Nausea       1299
Headache     1197
Vomiting     1170
Rash         1153
Dizziness    1068
Name: SE, dtype: int64

In [12]:
# Import drug names and ATC

In [13]:
# Drug names and ATC codes, both keyed by CID.
# Column names are passed through `names`, so pandas treats the first line of
# each file as data rather than as a header.
drug_names = pd.read_csv(
    data_dir + 'sider/drug_names.tsv',
    sep='\t',
    names=['CID', 'DRUG'],
)
atc = pd.read_csv(
    data_dir + 'sider/drug_atc.tsv',
    sep='\t',
    names=['CID', 'ATC'],
)

In [14]:
# Preview the CID -> drug name table.
drug_names.head()

Unnamed: 0,CID,DRUG
0,CID100000085,carnitine
1,CID100000119,gamma-aminobutyric
2,CID100000137,5-aminolevulinic
3,CID100000143,leucovorin
4,CID100000146,5-methyltetrahydrofolate


In [15]:
# (rows, columns) of the drug name table.
drug_names.shape

(1430, 2)

In [16]:
# Preview the CID -> ATC code table (a CID can appear on several rows).
atc.head()

Unnamed: 0,CID,ATC
0,CID100000085,A16AA01
1,CID100000119,L03AA03
2,CID100000119,N03AG03
3,CID100000137,L01XD04
4,CID100000143,V03AF03


In [17]:
# (rows, columns) of the ATC table.
atc.shape

(1560, 2)

In [18]:
# Attach ATC codes to each drug name.
# The join key is named explicitly so the merge cannot silently change if the
# two frames ever come to share another column name.
# A left join keeps drugs without any ATC code (their ATC is NaN); a drug with
# several ATC codes produces one row per code.
drug_atc = drug_names.merge(atc, how='left', on='CID')
drug_atc.head()

Unnamed: 0,CID,DRUG,ATC
0,CID100000085,carnitine,A16AA01
1,CID100000119,gamma-aminobutyric,L03AA03
2,CID100000119,gamma-aminobutyric,N03AG03
3,CID100000137,5-aminolevulinic,L01XD04
4,CID100000143,leucovorin,V03AF03


In [19]:
# Distinct CIDs, drug names and ATC codes after the merge.
drug_atc.nunique()

CID     1430
DRUG    1347
ATC     1560
dtype: int64

In [20]:
# Number of ATC codes per drug name, largest first. Every drug is listed,
# including those without any ATC code (count 0), not only drugs with several.
# NOTE(review): the grouping is by DRUG name, not CID; drug_atc has more
# distinct CIDs than names, so a count may pool several compounds - confirm
# this is intended.

drug_atc.groupby('DRUG')['ATC'].count().sort_values(ascending=False)

DRUG
dexamethasone    22
Insulin          13
glucose          12
neomycin         12
sodium           11
                 ..
edrophonium       0
spinosad          0
pemirolast        0
ecallantide       0
phenylbutyric     0
Name: ATC, Length: 1347, dtype: int64

In [21]:
# create df with drug, ATC, and side effects (reactions)

In [22]:
# Join drug name / ATC onto every side-effect record, matching drug_atc.CID
# against the flat STITCH id.
# how='right' keeps every row of meddra_all_se. A drug with several ATC codes
# repeats each of its side-effect rows once per code, so the result has more
# rows than meddra_all_se.
sider = pd.merge(
    drug_atc,
    meddra_all_se,
    how='right',
    left_on='CID',
    right_on='STITCH_FLAT',
)

In [23]:
# Preview the merged drug / ATC / side-effect table.
sider.head()

Unnamed: 0,CID,DRUG,ATC,STITCH_FLAT,STITCH_STEREO,UMLS_CONCEPT_ID_LABEL,MEDDRA_CONCEPT_TYPE,UMLS_CONCEPT_ID_MEDDDRA,SE
0,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [24]:
# (rows, columns) after the merge.
sider.shape

(426780, 9)

In [25]:
# Shape of the side-effect table before the merge, for comparison with sider.shape.
meddra_all_se.shape

(309849, 6)

In [26]:
# Distinct values per column in the merged table.
sider.nunique()

CID                        1430
DRUG                       1347
ATC                        1560
STITCH_FLAT                1430
STITCH_STEREO              1556
UMLS_CONCEPT_ID_LABEL      5868
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    6060
SE                         6123
dtype: int64

In [27]:
# Save the merged table under data_dir. `index` is left at its default, so the
# row index is written as the first, unnamed column of the CSV.
sider.to_csv(data_dir + 'sider.csv')

In [28]:
# check which meddra term is used in CVADR
# The reactions file is '$'-delimited with no header row, so the column names
# are given here; the two id columns are read as strings.
# The path is built from data_dir, like every other input of this notebook,
# instead of repeating the folder name as a literal.
# NOTE(review): the recorded output of this cell ends with the tail of a
# warning; if it is a mixed-dtype warning, consider declaring more dtypes.
CVADR_reactions = pd.read_csv(
    data_dir + 'CVADR/reactions.txt',
    delimiter='$',
    header=None,
    names=[
        'REACTION_ID',
        'REPORT_ID',
        'DURATION',
        'DURATION_UNIT_ENG',
        'DURATION_UNIT_FR',
        'PT_NAME_ENG',
        'PT_NAME_FR',
        'SOC_NAME_ENG',
        'SOC_NAME_FR',
        'MEDDRA_VERSION',
    ],
    dtype={'REACTION_ID': 'object', 'REPORT_ID': 'object'},
)


  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
# Number of distinct English reaction term names in CVADR.
CVADR_reactions.PT_NAME_ENG.nunique()

11993

In [30]:
# Preview the first three CVADR reaction rows.
CVADR_reactions.head(3)

Unnamed: 0,REACTION_ID,REPORT_ID,DURATION,DURATION_UNIT_ENG,DURATION_UNIT_FR,PT_NAME_ENG,PT_NAME_FR,SOC_NAME_ENG,SOC_NAME_FR,MEDDRA_VERSION
0,13501,135,,,,Dysuria,Dysurie,Renal and urinary disorders,Affections du rein et des voies urinaires,v.22.1
1,13502,135,,,,Nocturia,Nycturie,Renal and urinary disorders,Affections du rein et des voies urinaires,v.22.1
2,13601,136,,,,Nausea,Nausée,Gastrointestinal disorders,Affections gastro-intestinales,v.22.1


In [31]:

# Count distinct CVADR reaction names (PT_NAME_ENG) by whether the identical
# string occurs among the SIDER side-effect names: first against the PT rows
# only, then against all SIDER rows.
cvadr_terms = CVADR_reactions.PT_NAME_ENG
in_sider_pt = cvadr_terms.isin(all_se_pt.SE)
in_sider_any = cvadr_terms.isin(meddra_all_se.SE)

print('CVADR reactions not in SIDER PT: ' + str(cvadr_terms[~in_sider_pt].nunique()))
print('CVADR reactions not in SIDER PT or LLT: ' + str(cvadr_terms[~in_sider_any].nunique()))
print('CVADR reactions in SIDER PT or LLT: ' + str(cvadr_terms[in_sider_any].nunique()))

CVADR reactions not in SIDER PT: 8126
CVADR reactions not in SIDER PT or LLT: 8089
CVADR reactions in SIDER PT or LLT: 3904


In [32]:
# Distinct CVADR reaction names that do not occur among the SIDER side-effect names.
CVADR_reactions.PT_NAME_ENG[~CVADR_reactions.PT_NAME_ENG.isin(meddra_all_se.SE)].unique()

array(['Pyrexia', 'Normocytic anaemia', 'Lacrimation disorder', ...,
       'Sexual inhibition', 'Congenital heart valve incompetence',
       'Ultrasound foetal abnormal'], dtype=object)

In [33]:
# filter reaction by SIDER SEs

In [34]:
# Keep only the CVADR reactions whose English term name also appears as a
# SIDER side-effect name (exact string match).
CVADR_se_fltrd = CVADR_reactions.loc[
    CVADR_reactions['PT_NAME_ENG'].isin(meddra_all_se['SE'])
]

In [35]:
# Preview the filtered CVADR reactions.
CVADR_se_fltrd.head()

Unnamed: 0,REACTION_ID,REPORT_ID,DURATION,DURATION_UNIT_ENG,DURATION_UNIT_FR,PT_NAME_ENG,PT_NAME_FR,SOC_NAME_ENG,SOC_NAME_FR,MEDDRA_VERSION
0,13501,135,,,,Dysuria,Dysurie,Renal and urinary disorders,Affections du rein et des voies urinaires,v.22.1
1,13502,135,,,,Nocturia,Nycturie,Renal and urinary disorders,Affections du rein et des voies urinaires,v.22.1
2,13601,136,,,,Nausea,Nausée,Gastrointestinal disorders,Affections gastro-intestinales,v.22.1
3,13701,137,,,,Headache,Céphalée,Nervous system disorders,Affections du système nerveux,v.22.1
4,13801,138,,,,Pruritus,Prurit,Skin and subcutaneous tissue disorders,Affections de la peau et du tissu sous-cutané,v.22.1


In [36]:
# Per column: distinct values left after the SIDER filter ('fltrd'), distinct
# values in the full CVADR table ('all'), and the ratio of the two.
reactions_matched = pd.DataFrame({
    'fltrd': CVADR_se_fltrd.nunique(),
    'all': CVADR_reactions.nunique(),
})
reactions_matched['pct_fltrd'] = reactions_matched['fltrd'].div(reactions_matched['all'])

In [37]:
# Show the before/after comparison; pct_fltrd is a fraction (1.0 = nothing lost).
reactions_matched

Unnamed: 0,fltrd,all,pct_fltrd
REACTION_ID,2254455,2606282,0.865008
REPORT_ID,713703,750583,0.950865
DURATION,1431,1593,0.898305
DURATION_UNIT_ENG,9,9,1.0
DURATION_UNIT_FR,9,9,1.0
PT_NAME_ENG,3904,11993,0.325523
PT_NAME_FR,3904,11993,0.325523
SOC_NAME_ENG,27,27,1.0
SOC_NAME_FR,27,27,1.0
MEDDRA_VERSION,1,1,1.0


In [38]:
# About 95% of reports and 87% of reactions are still present (see pct_fltrd for REPORT_ID and REACTION_ID above). OK

In [39]:
# check how many SIDER drugs are in DrugBank

In [40]:
# DrugBank tables used for name matching below: drugs.csv (its `name` column)
# and drug_syn.csv (its `synonym` column).
db_products = pd.read_csv(data_dir + 'drugbank/db/drugs.csv')
db_syn = pd.read_csv(data_dir + 'drugbank/db/drug_syn.csv')

In [41]:
# First row of the DrugBank drugs table, to see the available columns.
db_products.head(1)

Unnamed: 0,primary_key,other_keys,type,created,updated,name,description,cas_number,unii,average_mass,monoisotopic_mass,state,groups_count,articles_count,books_count,links_count,synthesis_reference,indication,pharmacodynamics,mechanism_of_action,metabolism,absorption,half_life,protein_binding,route_of_elimination,volume_of_distribution,clearance,international_brands,pdb_entries,fda_label,msds,food_interactions,drug_interactions_count,toxicity
0,DB00001,"BTD00024,BIOD00024",biotech,2005-06-13,2020-01-02,Lepirudin,Lepirudin is identical to natural hirudin exce...,138068-37-8,Y43GF64R34,,,liquid,1,3,0,1,,For the treatment of heparin-induced thrombocy...,Lepirudin is used to break up clots and to red...,Lepirudin forms a stable non-covalent complex ...,Lepirudin is thought to be metabolized by rele...,Bioavailability is 100% following injection.,Approximately 1.3 hours,,Lepirudin is thought to be metabolized by rele...,"* 12.2 L [Healthy young subjects (n = 18, age ...",* 164 ml/min [Healthy 18-60 yrs]\r\n* 139 ml/m...,,0,//s3-us-west-2.amazonaws.com/drugbank/fda_labe...,//s3-us-west-2.amazonaws.com/drugbank/msds/DB0...,0,638,"In case of overdose (eg, suggested by excessiv..."


In [42]:
# First row of the DrugBank synonym table.
db_syn.head(1)

Unnamed: 0,parent_key,synonym,language,coder
0,DB00001,Hirudin variant-1,english,


In [43]:
# SIDER rows whose drug name equals a DrugBank drug name or synonym, compared
# case-insensitively; summarised as distinct values per column.
sider_drug_upper = sider['DRUG'].str.upper()
name_hit = sider_drug_upper.isin(db_products['name'].str.upper())
synonym_hit = sider_drug_upper.isin(db_syn['synonym'].str.upper())
sider[name_hit | synonym_hit].nunique()

CID                        1189
DRUG                       1132
ATC                        1376
STITCH_FLAT                1189
STITCH_STEREO              1301
UMLS_CONCEPT_ID_LABEL      5651
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    5842
SE                         5903
dtype: int64

In [44]:
# Number of SIDER rows that are matched to DrugBank by name or synonym
# (case-insensitive) and whose MEDDRA_CONCEPT_TYPE is 'PT'.
# NOTE(review): this counts rows, not distinct drug/side-effect pairs. sider
# repeats a pair once per ATC code, and sider.head() shows the same
# drug / PT concept on two rows that differ only in UMLS_CONCEPT_ID_LABEL -
# confirm whether duplicates should be dropped before quoting this number.
drug_key = sider['DRUG'].str.upper()
matched_in_drugbank = (
    drug_key.isin(db_products['name'].str.upper())
    | drug_key.isin(db_syn['synonym'].str.upper())
)
is_pt_row = sider['MEDDRA_CONCEPT_TYPE'] == 'PT'
sider[matched_in_drugbank & is_pt_row].count()

CID                        195491
DRUG                       195491
ATC                        181012
STITCH_FLAT                195491
STITCH_STEREO              195491
UMLS_CONCEPT_ID_LABEL      195491
MEDDRA_CONCEPT_TYPE        195491
UMLS_CONCEPT_ID_MEDDDRA    195491
SE                         195491
dtype: int64

In [45]:
# Distinct values per column in the full sider table, for comparison with the matched subset.
sider.nunique()

CID                        1430
DRUG                       1347
ATC                        1560
STITCH_FLAT                1430
STITCH_STEREO              1556
UMLS_CONCEPT_ID_LABEL      5868
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    6060
SE                         6123
dtype: int64

In [46]:
# Non-null values per column among all PT rows (matched or not); ATC is lower
# because rows for drugs without an ATC code have NaN there.
sider[sider['MEDDRA_CONCEPT_TYPE']=='PT'].count()

CID                        224733
DRUG                       224733
ATC                        202736
STITCH_FLAT                224733
STITCH_STEREO              224733
UMLS_CONCEPT_ID_LABEL      224733
MEDDRA_CONCEPT_TYPE        224733
UMLS_CONCEPT_ID_MEDDDRA    224733
SE                         224733
dtype: int64

In [47]:
# DrugBank external identifiers (resource, identifier, parent_key); show the first row.
db_ex_id = pd.read_csv(data_dir + 'drugbank/db/drug_external_identifiers.csv')
db_ex_id.head(1)

Unnamed: 0,resource,identifier,parent_key
0,Drugs Product Database (DPD),11916,DB00001


In [48]:
# Column dtypes; `identifier` is object, so ids are compared as strings below.
db_ex_id.dtypes

resource      object
identifier    object
parent_key    object
dtype: object

In [49]:
# Restrict the external identifiers to the rows whose resource is 'PubChem Compound'.
db_pubchem = db_ex_id.loc[db_ex_id['resource'].eq('PubChem Compound')]


In [50]:
# Spot check: look up the number taken from a SIDER flat id (CID171306834)
# among the PubChem Compound identifiers. The recorded result is empty.
# NOTE(review): presumably the leading '1' of a flat STITCH id is a prefix and
# not part of the PubChem id - confirm against the SIDER/STITCH documentation.
db_pubchem[db_pubchem['identifier']=='171306834']

Unnamed: 0,resource,identifier,parent_key


In [51]:
# Remove the 'CID' prefix from the flat id and round-trip through int so that
# any leading zeros are dropped, giving a plain numeric string.
sider['CID'].str.replace("CID","").astype('int').astype('str')

0         100000085
1         100000085
2         100000085
3         100000085
4         100000085
            ...    
426775    171306834
426776    171306834
426777    171306834
426778    171306834
426779    171306834
Name: CID, Length: 426780, dtype: object

In [52]:
# Match on the stereo STITCH id instead: strip the 'CID' prefix, drop leading
# zeros through an int round-trip, and keep the rows whose resulting string is
# a DrugBank PubChem Compound identifier. Summarised as distinct values per column.
stereo_ids = sider['STITCH_STEREO'].str.replace("CID","").astype('int').astype('str')
has_pubchem_match = stereo_ids.isin(db_pubchem['identifier'])
sider[has_pubchem_match].nunique()

CID                         980
DRUG                        963
ATC                        1172
STITCH_FLAT                 980
STITCH_STEREO               998
UMLS_CONCEPT_ID_LABEL      5285
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    5474
SE                         5526
dtype: int64

In [53]:
# For now, use name matching between SIDER and DrugBank:
# 1132 of 1347 drug names are matched (84%).
# The match rate can be improved later.

## Generate the testing datasets

For A side effects in D drugs, total number of side effect-drug pairs is A x D
Total pairs for drugs matched by name to DrugBank:
1132 x 5903 = 6,682,196


Pairs in positive dataset: 195,491


Positive/negative datasets should look like:
drug | cid | atc | se_name | se_meddra | related

Negative dataset:
For each drug, combine it with every side effect.
Randomly sample 200,000 pairs from the combined set.
Do this three times.

In [54]:
# Reminder of the sider columns before building the datasets.
sider.head()

Unnamed: 0,CID,DRUG,ATC,STITCH_FLAT,STITCH_STEREO,UMLS_CONCEPT_ID_LABEL,MEDDRA_CONCEPT_TYPE,UMLS_CONCEPT_ID_MEDDDRA,SE
0,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [55]:
# positive dataset
# Export drug name, ATC code, side-effect name and MedDRA concept id for every
# row of sider (the row index is written as the first CSV column).
# NOTE(review): no filter is applied here, so all concept types and all drugs
# are exported, and no cid / related columns are included - confirm this
# against the dataset layout described in the notes of this notebook.
positive_columns = ['DRUG', 'ATC', 'SE', 'UMLS_CONCEPT_ID_MEDDDRA']
positives = sider[positive_columns]
positives.to_csv(data_dir + 'sider_positives.csv')

In [56]:
# negative dataset

# distinct drug ids and distinct side-effect concept ids present in sider
drug_list = sider['CID'].unique()
se_list = sider['UMLS_CONCEPT_ID_MEDDDRA'].unique()

# one row per side effect plus a constant column named 'index' (all 1s),
# intended as the shared key for combining a drug with all side effects
df1 = pd.DataFrame({'se_meddra': se_list, 'index': 1})


# just do cross product on the index.

In [57]:
# Display the side-effect frame (pandas truncates the middle rows).
df1

Unnamed: 0,se_meddra,index
0,C0000729,1
1,C0000737,1
2,C0687713,1
3,C0002418,1
4,C0002871,1
...,...,...
6056,C0235802,1
6057,C0240846,1
6058,C0271036,1
6059,C0856117,1
