In [33]:
import collections
import datetime
import os
import pickle

import pandas as pd

In [1]:
# Root directory of the SIDER working tree; every input and output path in this
# notebook is built by appending a relative part to it.
# Set the SIDER_DIR environment variable to run the notebook from another
# location; the original scratch path is kept as the default.
sider_dir = os.environ.get('SIDER_DIR', '/scratch/ias41/ae_code/sider')

In [9]:
# Ordered mapping of report label -> value; later cells add entries to it and
# the final cell writes them to a text file in insertion order.
report_information = collections.OrderedDict()

In [4]:
# Load the drug-event pairs assigned to the clinical trials set
# (only those whose compounds were mapped to ChEMBL).
clin_trial_mapped_path = sider_dir + '/results/sider_processed_clin_trial-assigned_mapped.txt'
clin_trial_mapped = pd.read_csv(clin_trial_mapped_path, sep='\t')

In [5]:
# Preview the first rows of the clinical-trial set to check the loaded columns
clin_trial_mapped.head()

Unnamed: 0,stitch_flat,stitch_stereo,umls_concept_label,meddra_concept_type,umls_concept,side_effect_name,placebo,frequency_description,lower_bound,upper_bound,chembl_id,molregno,parent_molregno,parent_chembl_id
0,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
1,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
2,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,,,,,CHEMBL1149,181939,181939,CHEMBL1149
3,CID100000085,CID000010917,C0002418,PT,C0002418,Amblyopia,,,,,CHEMBL1149,181939,181939,CHEMBL1149
4,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,,,,,CHEMBL1149,181939,181939,CHEMBL1149


In [16]:
# Basic counts for the report: number of unique stereo compounds and unique
# side-effect names (MedDRA PTs) at each stage of the SIDER processing.
count_template = '{} unique stereo_stich ids, {} unique AEs (PTs)'

# Stage 1 - all reported side effects, restricted to rows of concept type 'PT'
all_se_columns = ['stitch_flat', 'stitch_stereo', 'umls_concept_label', 'meddra_concept_type','umls_concept', 'side_effect_name']
all_se = pd.read_csv(sider_dir + '/data/meddra_all_se.tsv', sep='\t', header=None)
all_se.columns = all_se_columns
is_pt_row = all_se['meddra_concept_type'] == 'PT'
all_se_pt = all_se[is_pt_row]

n_stereo_ids = all_se_pt['stitch_stereo'].drop_duplicates().shape[0]
n_side_effects = all_se_pt['side_effect_name'].drop_duplicates().shape[0]
report_information['Datapoints in all SIDER all_se (before mapping)'] = count_template.format(n_stereo_ids, n_side_effects)

# Stage 2 - the PT rows whose stereo compounds were mapped to molregnos
# (note the upper-case 'STITCH_stereo' column name in this file)
all_se_pt_mapped = pd.read_csv(sider_dir + '/results/sider_all_se_pt_mapped.txt', sep='\t')

n_stereo_ids = all_se_pt_mapped['STITCH_stereo'].drop_duplicates().shape[0]
n_side_effects = all_se_pt_mapped['side_effect_name'].drop_duplicates().shape[0]
report_information['Datapoints in all SIDER all_se (mapped)'] = count_template.format(n_stereo_ids, n_side_effects)

# Stage 3 - the mapped rows that were assigned to the clinical set
n_stereo_ids = clin_trial_mapped['stitch_stereo'].drop_duplicates().shape[0]
n_side_effects = clin_trial_mapped['side_effect_name'].drop_duplicates().shape[0]
report_information['Datapoints in clinical-assigned SIDER (mapped)'] = count_template.format(n_stereo_ids, n_side_effects)


In [17]:
# Display the report entries collected so far
report_information

OrderedDict([('Datapoints in all SIDER all_se (before mapping)',
              '1556 unique stereo_stich ids, 4251 unique AEs (PTs)'),
             ('Datapoints in all SIDER all_se (mapped)',
              '1219 unique stereo_stich ids, 4054 unique AEs (PTs)'),
             ('Datapoints in clinical-assigned SIDER (mapped)',
              '1041 unique stereo_stich ids, 2858 unique AEs (PTs)')])

In [18]:
# Find side effects with less than five drugs
# Side effects linked to fewer than five distinct parent compounds
ses_with_less_than_five_drugs = [
    side_effect
    for side_effect, rows in clin_trial_mapped.groupby(by='side_effect_name')
    if len(rows['parent_molregno'].drop_duplicates()) < 5
]

In [22]:
# Spot-check: show every row of the clinical-trial set for one side effect
clin_trial_mapped.loc[clin_trial_mapped['side_effect_name']=='Abdominal hernia',:]

Unnamed: 0,stitch_flat,stitch_stereo,umls_concept_label,meddra_concept_type,umls_concept,side_effect_name,placebo,frequency_description,lower_bound,upper_bound,chembl_id,molregno,parent_molregno,parent_chembl_id
41897,CID100216235,CID000216235,C0178282,PT,C0178282,Abdominal hernia,,,,,CHEMBL282724,23417,23417,CHEMBL282724


In [23]:
# Number of side effects that have fewer than five drugs
len(ses_with_less_than_five_drugs)

1727

In [24]:
# Record how many side effects fall below the five-drug minimum
n_excluded_side_effects = len(ses_with_less_than_five_drugs)
report_information['Number of side effects (from clinical mapped) excluded because they had fewer than 5 drugs'] = n_excluded_side_effects

In [25]:
# Keep only the rows whose side effect is NOT in the fewer-than-five-drugs list
has_too_few_drugs = clin_trial_mapped['side_effect_name'].isin(ses_with_less_than_five_drugs)
sider_data_selected = clin_trial_mapped[~has_too_few_drugs]

In [28]:
# Record the unique compound and side-effect counts that remain after the filter
selected_stereo_ids = sider_data_selected['stitch_stereo'].drop_duplicates()
selected_side_effects = sider_data_selected['side_effect_name'].drop_duplicates()
report_information['Datapoints in clinical-assigned SIDER (mapped) after excluding side effects without 5 drugs (used to make ae pickle)'] = '{} unique stereo_stich ids, {} unique AEs (PTs)'.format(len(selected_stereo_ids), len(selected_side_effects))

In [29]:
# Display the report entries again, now including the post-filter counts
report_information

OrderedDict([('Datapoints in all SIDER all_se (before mapping)',
              '1556 unique stereo_stich ids, 4251 unique AEs (PTs)'),
             ('Datapoints in all SIDER all_se (mapped)',
              '1219 unique stereo_stich ids, 4054 unique AEs (PTs)'),
             ('Datapoints in clinical-assigned SIDER (mapped)',
              '1041 unique stereo_stich ids, 2858 unique AEs (PTs)'),
             ('Number of side effects (from clinical mapped) excluded because they had fewer than 5 drugs',
              1727),
             ('Datapoints in clinical-assigned SIDER (mapped) after excluding side effects without 5 drugs (used to make ae pickle)',
              '1040 unique stereo_stich ids, 1131 unique AEs (PTs)')])

In [26]:
# Map each parent molregno to the set of its side-effect names, upper-cased
molregno2aes_sider = {
    parent_molregno: {
        name.upper() for name in rows['side_effect_name'].drop_duplicates()
    }
    for parent_molregno, rows in sider_data_selected.groupby(by='parent_molregno')
}

In [34]:
# Pickle the molregno -> side-effects mapping; the file name is prefixed with
# today's date formatted as YYYYMMDD.
current_date = datetime.date.today().strftime("%Y%m%d")
pickle_path = sider_dir + '/results/{}_molregno2aes_sider_min5drugs.pkl'.format(current_date)
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(molregno2aes_sider, pickle_file)

In [37]:
# Save the collected counts as a text report, one "label: value" line per entry
with open(sider_dir + '/results/report_counts_sider_clinical_set.txt', 'w') as report_file:
    report_file.writelines(
        label + ': ' + str(value) + '\n'
        for label, value in report_information.items()
    )