# SIDER Processing 

**Author:** Laetitia Tam

**Last Modified**: 2020-03-06

In this notebook, files from the SIDER dataset are imported and merged. Preliminary data cleaning is performed. 

In [11]:
# Import libraries

import pandas as pd
import numpy as np

In [12]:
# Let pandas render up to 1000 columns so wide tables are not truncated on display.
pd.options.display.max_columns = 1000

In [13]:
# Root folder of the input/output data, relative to this notebook.
# Kept as a string ending in a single '/' because the cells below build paths
# by string concatenation (data_dir + 'sider/...').
data_dir = '../data/'

In [14]:
# Load every drug / side-effect pair from SIDER.
# The file is tab-separated and is read without a header row, so the column
# names are supplied here.

meddra_all_se = pd.read_csv(
    data_dir + 'sider/meddra_all_se.tsv',
    sep='\t',
    header=None,
    names=[
        'STITCH_FLAT',
        'STITCH_STEREO',
        'UMLS_CONCEPT_ID_LABEL',
        'MEDDRA_CONCEPT_TYPE',
        'UMLS_CONCEPT_ID_MEDDDRA',
        'SE',
    ],
)

In [15]:
# Preview the first five rows of the side-effect table.
meddra_all_se.head()

Unnamed: 0,STITCH_FLAT,STITCH_STEREO,UMLS_CONCEPT_ID_LABEL,MEDDRA_CONCEPT_TYPE,UMLS_CONCEPT_ID_MEDDDRA,SE
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [16]:
# Size of the side-effect table as (rows, columns).
meddra_all_se.shape

(309849, 6)

In [17]:
# Number of distinct values in each column of the side-effect table.
meddra_all_se.nunique()

STITCH_FLAT                1430
STITCH_STEREO              1556
UMLS_CONCEPT_ID_LABEL      5868
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    6060
SE                         6123
dtype: int64

In [18]:
# Rank side effects by how many rows mention them, restricted to the rows
# typed as PT (MedDRA Preferred Terms).

all_se_pt = meddra_all_se.loc[meddra_all_se['MEDDRA_CONCEPT_TYPE'].eq('PT')]

top_se_pt = (
    all_se_pt
    .groupby('SE')['SE']
    .count()
    .sort_values(ascending=False)
)

top_se_pt.head()

SE
Dizziness     1758
Asthenia      1513
Dermatitis    1433
Nausea        1302
Rash          1250
Name: SE, dtype: int64

In [19]:
# Same ranking for the rows typed as LLT (MedDRA Lowest Level Terms), for comparison.

all_se_llt = meddra_all_se.loc[meddra_all_se['MEDDRA_CONCEPT_TYPE'].eq('LLT')]
top_se_llt = (
    all_se_llt
    .groupby('SE')['SE']
    .count()
    .sort_values(ascending=False)
)
top_se_llt.head()

SE
Nausea       1299
Headache     1197
Vomiting     1170
Rash         1153
Dizziness    1068
Name: SE, dtype: int64

In [20]:
# Load the drug-name and ATC-code lookup tables so they can be linked to the
# side-effect table through the CID column.
# Both files are tab-separated. header=None is stated explicitly (as in the
# side-effect loader) so that it is clear the first line is data, not a header;
# pandas already behaves this way whenever `names` is passed.

drug_names = pd.read_csv(data_dir + 'sider/drug_names.tsv', sep='\t', header=None, names=['CID', 'DRUG'])
atc = pd.read_csv(data_dir + 'sider/drug_atc.tsv', sep='\t', header=None, names=['CID', 'ATC'])

In [21]:
# Preview the drug-name lookup table (columns CID and DRUG).
drug_names.head()

Unnamed: 0,CID,DRUG
0,CID100000085,carnitine
1,CID100000119,gamma-aminobutyric
2,CID100000137,5-aminolevulinic
3,CID100000143,leucovorin
4,CID100000146,5-methyltetrahydrofolate


In [22]:
# Size of the drug-name table as (rows, columns).
drug_names.shape

(1430, 2)

In [23]:
# Preview the ATC lookup table; the same CID can appear on several rows, one per ATC code.
atc.head()

Unnamed: 0,CID,ATC
0,CID100000085,A16AA01
1,CID100000119,L03AA03
2,CID100000119,N03AG03
3,CID100000137,L01XD04
4,CID100000143,V03AF03


In [24]:
# Size of the ATC table as (rows, columns).
atc.shape

(1560, 2)

In [25]:
# Attach ATC codes to every drug.
# The left join keeps drugs that have no ATC code (their ATC is NaN) and yields
# one row per (drug, ATC code) pair, so a drug with several codes is repeated.
# The join key is spelled out so the merge cannot silently pick up any other
# column the two tables might come to share.
drug_atc = drug_names.merge(atc, how='left', on='CID')
drug_atc.head()

Unnamed: 0,CID,DRUG,ATC
0,CID100000085,carnitine,A16AA01
1,CID100000119,gamma-aminobutyric,L03AA03
2,CID100000119,gamma-aminobutyric,N03AG03
3,CID100000137,5-aminolevulinic,L01XD04
4,CID100000143,leucovorin,V03AF03


In [26]:
# Distinct values per column after attaching the ATC codes.
# In the recorded output there are fewer distinct DRUG names than CIDs, i.e. some
# names are shared by more than one CID.
drug_atc.nunique()

CID     1430
DRUG    1347
ATC     1560
dtype: int64

In [27]:
# Number of ATC codes attached to each drug name, highest first.
# count() skips missing ATC values, so drugs without any ATC code are listed with 0.

drug_atc.groupby('DRUG')['ATC'].count().sort_values(ascending=False)

DRUG
dexamethasone    22
Insulin          13
glucose          12
neomycin         12
sodium           11
                 ..
edrophonium       0
spinosad          0
pemirolast        0
ecallantide       0
phenylbutyric     0
Name: ATC, Length: 1347, dtype: int64

In [28]:
# Combine drug names, ATC codes and side effects into a single table.
# The right join keeps every side-effect row; CID (drug table) is matched
# against STITCH_FLAT (side-effect table).

sider = pd.merge(
    drug_atc,
    meddra_all_se,
    how='right',
    left_on='CID',
    right_on='STITCH_FLAT',
)

In [29]:
# Preview the merged drug / ATC / side-effect table.
sider.head()

Unnamed: 0,CID,DRUG,ATC,STITCH_FLAT,STITCH_STEREO,UMLS_CONCEPT_ID_LABEL,MEDDRA_CONCEPT_TYPE,UMLS_CONCEPT_ID_MEDDDRA,SE
0,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,carnitine,A16AA01,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [30]:
# Size of the merged table. It has more rows than meddra_all_se because drug_atc
# holds one row per (CID, ATC) pair, so each side-effect row is repeated once for
# every ATC code of its drug.
sider.shape

(426780, 9)

In [31]:
# Size of the side-effect table before merging, for comparison with sider.shape.
meddra_all_se.shape

(309849, 6)

In [32]:
# Distinct values per column in the merged table.
sider.nunique()

CID                        1430
DRUG                       1347
ATC                        1560
STITCH_FLAT                1430
STITCH_STEREO              1556
UMLS_CONCEPT_ID_LABEL      5868
MEDDRA_CONCEPT_TYPE           2
UMLS_CONCEPT_ID_MEDDDRA    6060
SE                         6123
dtype: int64

## Match to CVADR Reactions

In [33]:
# SIDER and CVADR both use MedDRA Preferred Terms (PTs) for side effects, so the
# two sources can be linked on the term name.
# Load the CVADR reactions table; it is then filtered down to the reactions whose
# PT name also occurs in SIDER.
# The file is '$'-separated and read without a header row; the two ID columns
# are kept as strings (dtype 'object').

CVADR_reactions = pd.read_csv(
    data_dir + 'CVADR/reactions.txt',
    sep='$',
    header=None,
    names=[
        'REACTION_ID',
        'REPORT_ID',
        'DURATION',
        'DURATION_UNIT_ENG',
        'DURATION_UNIT_FR',
        'PT_NAME_ENG',
        'PT_NAME_FR',
        'SOC_NAME_ENG',
        'SOC_NAME_FR',
        'MEDDRA_VERSION',
    ],
    dtype={'REACTION_ID': 'object', 'REPORT_ID': 'object'},
)


In [34]:
# Keep only the CVADR reactions whose English PT name occurs, as an exact string
# match, among the SIDER side-effect names.
# NOTE(review): meddra_all_se['SE'] contains the names from both the LLT and the PT
# rows, so a CVADR PT can match a SIDER LLT name - confirm this is intended.
CVADR_se_fltrd = CVADR_reactions.loc[
    CVADR_reactions['PT_NAME_ENG'].isin(meddra_all_se['SE'])
]

In [35]:
# Distinct-value counts per CVADR column, after ('fltrd') and before ('all') filtering.
reactions_matched = pd.DataFrame({'fltrd': CVADR_se_fltrd.nunique()})
reactions_matched['all'] = CVADR_reactions.nunique()

# Share of distinct values that survive the filter, as a fraction between 0 and 1.
reactions_matched['pct_fltrd'] = reactions_matched['fltrd'].div(reactions_matched['all'])

In [36]:
# About 67% of the distinct PTs in CVADR do not appear in SIDER (only ~33% match).
# This is unexpected because CVADR and SIDER claim to use MedDRA PTs version 15 and 16.1, respectively, 
# so they should be reasonably consistent. 
# Most likely, the CVADR data is not particularly clean. 
# Still, about 95% of reports and 86.5% of reactions are present after filtering
# (pct_fltrd is a fraction between 0 and 1, not a percentage).

reactions_matched

Unnamed: 0,fltrd,all,pct_fltrd
REACTION_ID,2254455,2606282,0.865008
REPORT_ID,713703,750583,0.950865
DURATION,1431,1593,0.898305
DURATION_UNIT_ENG,9,9,1.0
DURATION_UNIT_FR,9,9,1.0
PT_NAME_ENG,3904,11993,0.325523
PT_NAME_FR,3904,11993,0.325523
SOC_NAME_ENG,27,27,1.0
SOC_NAME_FR,27,27,1.0
MEDDRA_VERSION,1,1,1.0


In [37]:
# Export the drug / ATC / side-effect columns for further processing.
# NOTE(review): the selection keeps neither MEDDRA_CONCEPT_TYPE nor
# UMLS_CONCEPT_ID_LABEL, so rows that differed only in those columns become exact
# duplicates in the exported file - confirm whether downstream steps expect that.
# NOTE(review): to_csv also writes the row index as an unnamed first column.

sider_positives = sider.loc[:, ['DRUG', 'ATC', 'SE', 'UMLS_CONCEPT_ID_MEDDDRA']]
sider_positives.to_csv(data_dir + 'sider_positives.csv')