In [1]:
# SIDER
# Want to select as best as possible, all the SEs that are from clinical trials (as opposed to post-marketing)
# Cannot simply exclude all SEs marked as pm, because e.g. nausea (epi/doxorubicin) is observed in both clinical trials and pm, and would then be wrongly excluded
# 1. If the label has no pm SEs at all > assume all are clinical trial
# 2. If the label has some pm > check which ones are 'unique' to pm, i.e. exclude those pairs which also have a frequency
# because if they have a frequency this is only derived from clinical trials

# use mappings to molregno/chembl_id I did earlier
# save file with clinical trial pairs > use for doing target - adverse event analysis 
# save file with postmarketing pairs > use for doing target -adverse event analysis 


In [1]:
import os

import pandas as pd

In [2]:
# Root directory of the SIDER working copy (expects 'data' and 'results' sub-directories).
# Can be overridden with the SIDER_DIR environment variable so the notebook also runs
# on machines where the original absolute path does not exist.
sider_dir = os.environ.get('SIDER_DIR', '/scratch/ias41/ae_code/sider')

In [3]:
# Read the complete table of reported side effects (the file has no header row)
all_se_path = sider_dir + '/data/meddra_all_se.tsv'
all_se = pd.read_csv(all_se_path, sep='\t', header=None)
all_se.columns = [
    'stitch_flat',
    'stitch_stereo',
    'umls_concept_label',
    'meddra_concept_type',
    'umls_concept',
    'side_effect_name',
]

# Keep only the rows whose MedDRA concept type is 'PT'
all_se_is_pt = all_se['meddra_concept_type'] == 'PT'
all_se_pt = all_se[all_se_is_pt]

In [4]:
# First, need to exclude some rows from the frequency file which have been reported as wrong (meddra_freq_to_exclude.tsv) from GitHub page
# https://github.com/mkuhn/sider

In [5]:
# Rows of the frequency file that were reported as wrong; same column layout as the
# frequency file and no header row
to_exclude_path = sider_dir + '/data/meddra_freq_to_exclude_tab.tsv'
to_exclude = pd.read_csv(to_exclude_path, sep='\t', header=None)
to_exclude.columns = [
    'stitch_flat',
    'stitch_stereo',
    'umls_concept_label',
    'placebo',
    'frequency_description',
    'lower_bound',
    'upper_bound',
    'meddra_concept_type',
    'umls_concept',
    'side_effect_name',
]

In [6]:
# Preview only the first rows instead of rendering the whole exclusion table
to_exclude.head(10)

Unnamed: 0,stitch_flat,stitch_stereo,umls_concept_label,placebo,frequency_description,lower_bound,upper_bound,meddra_concept_type,umls_concept,side_effect_name
0,CID100004594,CID009579578,C0014869,,44%,0.44,0.44,LLT,C0014869,Reflux esophagitis
1,CID100004594,CID009579578,C0017168,,56%,0.56,0.56,LLT,C0017168,Gastrooesophageal reflux disease
2,CID100004594,CID009579578,C0017168,,56%,0.56,0.56,PT,C0017168,Gastrooesophageal reflux disease
3,CID100002786,CID000029029,C0702166,,100%,1.0,1.0,LLT,C0702166,Acne
4,CID100002786,CID000029029,C0702166,,100%,1.0,1.0,PT,C0702166,Acne
5,CID100002786,CID000029029,C0702166,,34%,0.34,0.34,LLT,C0702166,Acne
6,CID100002786,CID000029029,C0702166,,34%,0.34,0.34,PT,C0702166,Acne
7,CID100002786,CID000029029,C0702166,,38%,0.38,0.38,LLT,C0702166,Acne
8,CID100002786,CID000029029,C0702166,,38%,0.38,0.38,PT,C0702166,Acne
9,CID100002786,CID000029029,C0702166,,87%,0.87,0.87,LLT,C0702166,Acne


In [7]:
# Read the side effects that carry a frequency or a postmarketing label
# (the file has no header row)
freq_se_path = sider_dir + '/data/meddra_freq.tsv'
freq_se = pd.read_csv(freq_se_path, sep='\t', header=None)
freq_se.columns = [
    'stitch_flat',
    'stitch_stereo',
    'umls_concept_label',
    'placebo',
    'frequency_description',
    'lower_bound',
    'upper_bound',
    'meddra_concept_type',
    'umls_concept',
    'side_effect_name',
]

# Keep only the rows whose MedDRA concept type is 'PT'
freq_se_is_pt = freq_se['meddra_concept_type'] == 'PT'
freq_se_pt = freq_se[freq_se_is_pt]

In [8]:
# Remove from the frequency table every row that is listed in the 'to exclude' table.
# Rows are matched on all columns by stacking both tables and flagging rows that occur
# more than once. Both inputs are de-duplicated first, so that
#   * a row repeated inside freq_se is kept once (a plain drop_duplicates(keep=False)
#     on the stacked table would silently drop every copy of it), and
#   * a flagged frequency row is guaranteed to have a counterpart in to_exclude.
# Only rows originating from freq_se are returned, so an exclusion row that has no
# counterpart in freq_se cannot leak into the corrected table.
freq_se_unique = freq_se.drop_duplicates()
to_exclude_concat = pd.concat([freq_se_unique, to_exclude.drop_duplicates()])

# The first len(freq_se_unique) flags belong to the frequency rows
listed_for_exclusion = to_exclude_concat.duplicated(keep=False).values[:len(freq_se_unique)]
freq_se_corrected = freq_se_unique.loc[~listed_for_exclusion, :]

In [9]:
# Keep only the rows of the corrected frequency table whose MedDRA concept type is 'PT'
corrected_is_pt = freq_se_corrected['meddra_concept_type'] == 'PT'
freq_se_corrected_pt = freq_se_corrected[corrected_is_pt]

In [10]:
# Split the corrected PT rows on their frequency label:
# 'postmarketing' rows go to pm, every other row goes to freq_rest
is_postmarketing = freq_se_corrected_pt['frequency_description'] == 'postmarketing'
pm = freq_se_corrected_pt[is_postmarketing]
freq_rest = freq_se_corrected_pt[~is_postmarketing]

In [11]:
# Number of distinct postmarketing rows
pm.drop_duplicates().shape[0]

21985

In [12]:
# All of SIDER: number of distinct drug-SE pairs among the PT rows
all_se_pt.drop_duplicates(subset=['stitch_stereo', 'side_effect_name']).shape[0]

152759

In [13]:
# Postmarketing: number of distinct drug-SE pairs (numerator of the % post-marketing)
pm.drop_duplicates(subset=['stitch_stereo', 'side_effect_name']).shape[0]

21209

In [14]:
# Fraction of all PT drug-SE pairs that carry a postmarketing label, computed from
# the frames themselves rather than from counts copied out of earlier outputs
pair_columns = ['stitch_stereo', 'side_effect_name']
n_pm_pairs = len(pm[pair_columns].drop_duplicates())
n_all_pairs = len(all_se_pt[pair_columns].drop_duplicates())
n_pm_pairs / n_all_pairs

0.1388396101048056

In [15]:
# Number of distinct drugs (stitch_stereo identifiers) among the PT rows
all_se_pt.drop_duplicates(subset=['stitch_stereo']).shape[0]

1556

In [21]:
# Next 
# 1. If the label has no pm SEs at all > assume all are clinical trial

In [22]:
# Drugs that have at least one postmarketing event
pm_drugs = {drug for drug in pm['stitch_stereo']}

# Rows of drugs without any postmarketing event: all of their drug-SE pairs
# are accepted as clinical-trial findings
has_pm_event = all_se_pt['stitch_stereo'].isin(pm_drugs)
no_pm_selection = all_se_pt[~has_pm_event]

In [23]:
# Sanity check: number of distinct drugs with at least one postmarketing event
len(pm_drugs)

628

In [24]:
# Number of distinct drugs without any postmarketing event
no_pm_drugs = set(no_pm_selection['stitch_stereo'])
len(no_pm_drugs)

928

In [27]:
# Drugs with postmarketing events plus drugs without: should add up to all drugs
n_pm_drugs = len(pm_drugs)
n_no_pm_drugs = len(set(no_pm_selection['stitch_stereo']))
n_pm_drugs + n_no_pm_drugs

1556

In [28]:
# Total number of distinct drugs among the PT rows, for comparison with the sum above
all_pt_drugs = set(all_se_pt['stitch_stereo'])
len(all_pt_drugs)

1556

928 drugs (out of 1556 in total in the SIDER all_se table) did not have any postmarketing effects, so all of their side effects were assumed to come from clinical trials.

In [29]:
# 2. Find post-marketing events, but exclude those which also have frequencies

In [30]:
# Compound-SE pairs that have a (non-postmarketing) frequency entry.
# Built by zipping the two columns, which avoids the slow row-by-row iterrows() pass.
freq_pairs = set(zip(freq_rest['stitch_stereo'], freq_rest['side_effect_name']))

In [31]:
# Divide the rows of pm into those that are 'unique' to PM and those whose
# drug-SE pair also has a frequency.
# One boolean flag per pm row replaces the row-by-row iterrows() loop; the two
# partitions are complementary by construction. Both results are DataFrames that
# keep pm's columns even when a partition is empty; len() and pd.DataFrame(...)
# work on them exactly as they did on the former lists of rows.
pm_has_frequency = pd.Series(
    [pair in freq_pairs for pair in zip(pm['stitch_stereo'], pm['side_effect_name'])],
    dtype=bool,
).values

pm_unique = pm[~pm_has_frequency]
pm_and_freq = pm[pm_has_frequency]

In [32]:
# Number of postmarketing rows before partitioning
pm.shape[0]

21985

In [33]:
# Both partitions together should cover every postmarketing row
n_pm_partitioned = len(pm_and_freq) + len(pm_unique)
n_pm_partitioned

21985

In [34]:
# Of all rows marked as pm: the fraction whose drug-SE pair also had a frequency
# (these are assigned to clinical trial)
n_pm_rows = len(pm)
len(pm_and_freq) / n_pm_rows

0.2951103024789629

In [35]:
# This leaves the following fraction of rows whose drug-SE pair was marked as pm only
n_pm_total = len(pm)
len(pm_unique) / n_pm_total

0.704889697521037

In [36]:
# Turn both partitions into DataFrames. The columns are pinned to those of pm so that
# an empty partition still yields a frame with the expected columns (an empty list
# would otherwise give a column-less frame and break the column look-ups further down).
pm_unique_df = pd.DataFrame(pm_unique, columns=pm.columns)
pm_and_freq_df = pd.DataFrame(pm_and_freq, columns=pm.columns)

In [37]:
# Clinical-trial set = all pairs of drugs without postmarketing events, plus the
# postmarketing pairs that also have a frequency (frequencies are normally
# obtained in clinical trials)
clin_trial_parts = [no_pm_selection, pm_and_freq_df]
clin_trial = pd.concat(clin_trial_parts, sort=False)

In [38]:
# There should not be any drug-ae pairs overlapping between the clinical trial and the pm_unique ... 

In [39]:
# Distinct (drug, side effect) pairs of the postmarketing-only rows;
# zipping the columns avoids the slow row-by-row iterrows() pass
pm_unique_pairs = set(zip(pm_unique_df['stitch_stereo'], pm_unique_df['side_effect_name']))

In [40]:
# Distinct (drug, side effect) pairs of the clinical-trial set;
# zipping the columns avoids the slow row-by-row iterrows() pass
clin_trial_pairs = set(zip(clin_trial['stitch_stereo'], clin_trial['side_effect_name']))

In [41]:
# Number of pairs present in both sets (expected to be 0)
shared_pairs = pm_unique_pairs.intersection(clin_trial_pairs)
len(shared_pairs)

0

In [42]:
# Distinct (drug, side effect) pairs that are postmarketing and also have a frequency;
# zipping the columns avoids the slow row-by-row iterrows() pass
pm_and_freq_pairs = set(zip(pm_and_freq_df['stitch_stereo'], pm_and_freq_df['side_effect_name']))

In [43]:
# How many drug-se pairs, unique drugs, and unique SEs in each selection?
def print_pair_summary(title, pairs_df):
    """Print `title` followed by the counts of unique pairs, drugs and events.

    `pairs_df` must have the columns 'stitch_stereo' and 'side_effect_name'.
    The pairs are derived from the frame itself, so the summary does not depend
    on pair sets computed in other cells.
    """
    unique_pairs = set(zip(pairs_df['stitch_stereo'], pairs_df['side_effect_name']))
    print(title)
    print('Number of unique drug-se pairs: {}'.format(len(unique_pairs)))
    print('Number of unique drugs: {}'.format(len(set(pairs_df['stitch_stereo']))))
    print('Number of unique events: {}'.format(len(set(pairs_df['side_effect_name']))))


# Titles after the first one start with a newline to separate the groups
print_pair_summary('Post-marketing specific events:', pm_unique_df)
print_pair_summary('\nEvents in both postmarketing and with frequency:', pm_and_freq_df)
print_pair_summary('\nEvents of drugs without any postmarketing data:', no_pm_selection)
print_pair_summary('\nEvents assigned to clinical trials set:', clin_trial)

Post-marketing specific events:
Number of unique drug-se pairs: 15067
Number of unique drugs: 619
Number of unique events: 1899

Events in both postmarketing and with frequency:
Number of unique drug-se pairs: 6142
Number of unique drugs: 405
Number of unique events: 967

Events of drugs without any postmarketing data:
Number of unique drug-se pairs: 57764
Number of unique drugs: 928
Number of unique events: 3015

Events assigned to clinical trials set:
Number of unique drug-se pairs: 63906
Number of unique drugs: 1333
Number of unique events: 3058


In [37]:
# Attach chembl identifiers to the IDs

In [44]:
# Number of distinct drugs in the postmarketing-only set (before mapping)
pm_unique_drugs = set(pm_unique_df['stitch_stereo'])
len(pm_unique_drugs)

619

In [46]:
# Read the compound mappings prepared earlier (tab-separated, with header row)
parent_results_path = sider_dir + '/data/sider_mapped_stitch2parent.txt'
parent_results = pd.read_csv(parent_results_path, sep='\t')

In [47]:
# Preview the first five mapping rows
parent_results.head(n=5)

Unnamed: 0,chembl_id,molregno,parent_molregno,parent_chembl_id,stitch_stereo
0,CHEMBL1000,111185,111185,CHEMBL1000,CID000002678
1,CHEMBL1002,111482,111482,CHEMBL1002,CID000123600
2,CHEMBL1006,112480,112480,CHEMBL1006,CID000002141
3,CHEMBL1008,112651,112651,CHEMBL1008,CID000002351
4,CHEMBL1014,116349,116349,CHEMBL1014,CID000002540


In [48]:
# Attach the compound mappings to the postmarketing-only pairs; the inner join
# keeps only rows whose stitch_stereo identifier is present in the mapping table
pm_unique_df_mapped = pd.merge(pm_unique_df, parent_results, how='inner', on='stitch_stereo')

In [44]:
# Drug-event pairs assigned to postmarketing set - only those with mapped compounds to ChEMBL
#pm_unique_df_mapped.to_csv(sider_dir + '/results/sider_processed_pm-assigned_mapped.txt', sep='\t', index=None)

In [49]:
# Number of distinct drugs of the postmarketing-specific pairs that were mapped
# (500 out of 619 in the recorded run)
pm_mapped_drugs = set(pm_unique_df_mapped['stitch_stereo'])
len(pm_mapped_drugs)

500

In [50]:
# Merge clin_trial-specific pairs with the compound mappings

In [51]:
# Preview the first five rows of the clinical-trial set
clin_trial.head(n=5)

Unnamed: 0,stitch_flat,stitch_stereo,umls_concept_label,meddra_concept_type,umls_concept,side_effect_name,placebo,frequency_description,lower_bound,upper_bound
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,,,,
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,,,,
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,,,,
6,CID100000085,CID000010917,C0002418,PT,C0002418,Amblyopia,,,,
8,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,,,,


In [52]:
# Number of distinct drugs in the clinical-trial set (before mapping)
clin_trial_drugs = set(clin_trial['stitch_stereo'])
len(clin_trial_drugs)

1333

In [53]:
# Attach the compound mappings to the clinical-trial pairs; the inner join keeps
# only rows whose stitch_stereo identifier is present in the mapping table
clin_trial_mapped = pd.merge(clin_trial, parent_results, how='inner', on='stitch_stereo')

In [54]:
# Number of distinct drugs in the clinical-trial set that were mapped
clin_trial_mapped_drugs = set(clin_trial_mapped['stitch_stereo'])
len(clin_trial_mapped_drugs)

1041

In [55]:
# Drug-event pairs assigned to clinical trials set - only those with mapped compounds to ChEMBL.
# index=False is the documented boolean way to leave the row index out of the file
# (the previous index=None only worked because None is falsy).
clin_trial_mapped.to_csv(
    sider_dir + '/results/sider_processed_clin_trial-assigned_mapped.txt',
    sep='\t',
    index=False,
)

In [56]:
# 1041 out of 1333 compounds for the clinical-trial assigned drug-se pairs were mapped to compounds

In [57]:
# Preview the first five rows of the mapped clinical-trial set
clin_trial_mapped.head(n=5)

Unnamed: 0,stitch_flat,stitch_stereo,umls_concept_label,meddra_concept_type,umls_concept,side_effect_name,placebo,frequency_description,lower_bound,upper_bound,chembl_id,molregno,parent_molregno,parent_chembl_id
0,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
1,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
2,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
3,CID100000085,CID000010917,C0002418,PT,C0002418,Amblyopia,,,,,CHEMBL1149,181939,181939,CHEMBL1149
4,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,,,,,CHEMBL1149,181939,181939,CHEMBL1149
