In [4]:
# synpuff.ipynb content
import pandas as pd
pd.set_option("display.max_rows", None, "mode.chained_assignment", None)
from itertools import chain

import numpy as np
import datetime as dt

from dash import Dash, html, dcc, Input, Output, State, callback, callback_context
import plotly.express as px
import plotly
import plotly.offline
import plotly.graph_objs as go

'''Georgie's code'''
# Copy the data to your drive and then modify `folder` as required.
# Every table below is read from '../' + folder, so this is the only place
# that has to change to point the notebook at another copy of the data.
folder = 'synpuff/'
data_dir = '../' + folder

# base tables for generating the cohort
person = pd.read_csv(data_dir + 'person.csv')
condition_occurrence = pd.read_csv(data_dir + 'condition_occurrence.csv')
drug_exposure = pd.read_csv(data_dir + 'drug_exposure.csv')
concept = pd.read_csv(data_dir + 'concept.csv')
hierarchy = pd.read_csv(data_dir + 'hierarchy.csv')
props = pd.read_csv(data_dir + 'hemonc_component_properties.csv')

# Source concept IDs that define the cohort: only condition rows whose
# condition_source_concept_id is one of these are kept.
neoplasm_codes = [
    44832128, 44834489, 44834490, 44819488, 44826452, 44825256,
    44836837,
    44822871, 44825197, 44820602, 44835672,
    44829798, 44828730,
]
in_cohort = condition_occurrence['condition_source_concept_id'].isin(neoplasm_codes)
condition_occurrence = condition_occurrence.loc[in_cohort]

'''Ivy's code'''
#rxnorm = props[props['vocabulary_id']=='RxNorm']
# List of valid drug categories from Ivy from RxNorm/HemOnc
# (presumably SACT = systemic anti-cancer therapy -- TODO confirm).
# Each category appears once; the values are matched verbatim against
# props['component_class_name'], so spelling must agree with the data.
# NOTE(review): 'Aromatase inhibitorsthird generation' and
# 'Human DNA synthesisinhibitor' look like they lost a separator -- verify
# against the class names actually present in the properties table.
sact = [
    'Alkylating agent', 'Anti-CD38 antibody', 'Anti-CTLA-4 antibody',
    'Anti-TACSTD2 antibody-drug conjugate', 'Anthracycline', 'Antiandrogen',
    'Antifolate', 'Antimetabolite', 'Antitumor antibiotic',
    'Anti-CD52 antibody', 'Anti-CD20 antibody', 'Anti-EGFR antibody',
    'Anti-HER2 antibody', 'Anti-PD-1 antibody', 'Anti-PD-L1 antibody',
    'Anti-RANKL antibody', 'Anti-SLAMF7 antibody', 'Anti-VEGF antibody',
    'Aromatase inhibitor', 'Aromatase inhibitorsthird generation',
    'Biosimilar', 'BRAF inhibitor', 'DNA methyltransferase inhibitor',
    'Deoxycytidine analog', 'EGFR inhibitor', 'ERBB 2 inhibitor',
    'Estrogen receptor inhibitor', 'Folic acid analog', 'Fluoropyrimidine',
    'GnRH agonist', 'HDAC inhibitor', 'Human DNA synthesisinhibitor',
    'Microtubule inhibitor', 'MTOR inhibitor', 'Nitrogen mustard',
    'Nitrosourea', 'Neutral', 'PARP inhibitor', 'PARP1 inhibitor',
    'PARP2 inhibitor', 'Phenothiazine', 'Platinum agent',
    'Proteasome inhibitor', 'Purine analog', 'Pyrimidine analog',
    'RANK ligand inhibitor', 'Selective estrogen receptor modulator',
    'Somatostatin analog', 'T-cell activator', 'Targeted therapeutic',
    'Taxane', 'Topoisomerase I inhibitor', 'Topoisomerase II inhibitor',
    'Triazene', 'Vinca alkaloid', 'Xanthine oxidase inhibitor',
    'WHO Essential Cancer Medicine',
]
#rxnorm = rxnorm[rxnorm['component_class_name'].isin(sact)]
# Keep only the component properties that belong to one of the categories.
props = props[props['component_class_name'].isin(sact)]
# concept_id_2 of the surviving rows is the set of drug concepts to keep.
antican = props['concept_id_2']
# Restrict the exposures to those drug concepts.
drug_exposure = drug_exposure[drug_exposure['drug_concept_id'].isin(antican)]
#rxnorm['component_class_name'].value_counts()

# Dictionary from concept ID to concept name; make_labels() uses it to turn
# *concept_id columns into readable labels.
concept_lookup = dict(zip(concept['concept_id'], concept['concept_name']))

def make_labels(df, lookup=None):
    """Return a copy of *df* with concept-ID columns replaced by labels.

    For every column whose name contains ``'concept_id'`` a label column is
    added, named by replacing ``'_id'`` with ``'_label'``, holding the IDs
    mapped through *lookup* (IDs absent from *lookup* become NaN).  The
    result then omits every original column whose name contains
    ``'concept_id'`` or ``'source'``, and every original column that has no
    non-null values.

    The frame passed in is never modified.

    Args:
        df: DataFrame to label.
        lookup: Mapping from concept ID to concept name.  When omitted, the
            module-level ``concept_lookup`` is used.

    Returns:
        A new DataFrame with the surviving original columns in their
        original order and the label columns added after them.
    """
    if lookup is None:
        lookup = concept_lookup
    # Work on a copy so that adding label columns cannot leak into the
    # caller's frame.
    labelled = df.copy()
    # Snapshot the column names: the frame gains and loses columns below.
    for col in list(labelled.columns):
        is_concept_id = 'concept_id' in col
        if is_concept_id:
            labelled[col.replace('_id', '_label')] = labelled[col].map(lookup)
        if is_concept_id or 'source' in col or labelled[col].isna().all():
            labelled = labelled.drop(columns=col)
    return labelled

# Label the three tables used downstream (conditions, drugs, persons).
condition_occurrence_labelled, drug_exposure_labelled, person_labelled = (
    make_labels(table)
    for table in (condition_occurrence, drug_exposure, person)
)

'''Applying extra filters to drug df'''
# Drug labels removed from the analysis even though they passed the
# category filter (presumably supportive-care agents -- TODO confirm).
exclusions = ['dexamethasone', 'filgrastim', 'epoetin alfa', 'methylprednisolone', 'hydrocortisone', 'octreotide']

# Calendar year of each exposure, taken from its start date.
exposure_start = pd.to_datetime(
    drug_exposure_labelled['drug_exposure_start_date'], format='%Y-%m-%d'
)
drug_exposure_labelled['drug_exposure_year'] = exposure_start.dt.year

# Drop the excluded drugs.
is_excluded = drug_exposure_labelled['drug_concept_label'].isin(exclusions)
drug_exposure_labelled = drug_exposure_labelled.loc[~is_excluded]

'''Data Linkage'''
# person linkage: attach year of birth and gender to every drug exposure
person_labelled_small = person_labelled[['person_id', 'year_of_birth', 'gender_concept_label']]
drug_persons = drug_exposure_labelled.merge(person_labelled_small, on='person_id', how='left')
# age is the difference between the exposure year and the birth year
drug_persons['age_at_treatment'] = drug_persons['drug_exposure_year'].sub(drug_persons['year_of_birth'])

# condition linkage: number each person's condition rows 0, 1, 2, ... in
# table order, then spread them into columns cond_0, cond_1, ...
condition_labelled_small = condition_occurrence_labelled.loc[:, ['person_id', 'condition_concept_label']]
occurrence_rank = condition_labelled_small.groupby('person_id').cumcount()
condition_labelled_small['occ_number'] = 'cond_' + occurrence_rank.astype(str)
cond_pivot = condition_labelled_small.pivot(
    index='person_id',
    columns='occ_number',
    values='condition_concept_label',
).reset_index()
# one row per person in cond_pivot, so this adds columns without adding rows
drug_persons = drug_persons.merge(cond_pivot, on='person_id', how='left')
'''Reshaping dataframe'''
# Reduce the frame to the variables relevant for the visualization.
base_columns = ['person_id', 'drug_exposure_start_date', 'drug_concept_label', 'drug_exposure_year', 'gender_concept_label', 'age_at_treatment']
# The cond_N columns come from a pivot, so how many exist depends on the
# data.  reindex() supplies an all-NaN column for any of these four that is
# absent instead of raising KeyError; the base columns must still exist.
cond_columns = ['cond_0', 'cond_1', 'cond_2', 'cond_3']
small = pd.concat(
    [drug_persons[base_columns], drug_persons.reindex(columns=cond_columns)],
    axis=1,
)
# Remove exact duplicate rows once; sorting afterwards cannot create new ones.
small = small.drop_duplicates()
small_sorted = small.sort_values('drug_concept_label')
# Independent copy, so columns added to small_nodup do not appear in small_sorted.
small_nodup = small_sorted.copy()

# Number every drug administration per person in chronological order:
# 0 for a person's first administration, 1 for the next, and so on.
readministrations = pd.Series(np.zeros(len(small_nodup),dtype=int),index=small_nodup.index)

# Consecutive administrations continue the count while they are less than
# this far apart; a larger gap (or a missing date) restarts it at 0.
# 6000 days is roughly 16 years.
max_gap = dt.timedelta(days=6000)

# Module-level names kept from the original cell in case code outside this
# section reads them; id_administrations is never populated here.
all_id = small_nodup['person_id'].unique()
id_administrations = {}

# Parse the dates once, then walk the rows person by person.
exposure_dates = pd.to_datetime(small_nodup['drug_exposure_start_date'], format='%Y-%m-%d')
for pid, patient_dates in exposure_dates.groupby(small_nodup['person_id'], sort=False):
    # mergesort is stable: administrations on the same day keep the row
    # order they already have in small_nodup.
    administrations_sorted = patient_dates.sort_values(kind='mergesort')

    # True where the previous administration was less than max_gap earlier.
    # The first entry compares against nothing (NaT) and is therefore False.
    frequency = administrations_sorted.diff() < max_gap

    # Running count that restarts at 0 whenever the gap test fails.
    n_administrations = [0]
    for v in frequency.values[1:]:
        n_administrations.append((n_administrations[-1] + 1) if v else 0)

    # Write the counts back to this person's rows.
    readministrations.loc[administrations_sorted.index] = n_administrations

small_nodup['readministration'] = readministrations
# Append one trailing space per administration number to the drug label, so
# the same drug given at different positions yields different strings
# (presumably to keep plot nodes distinct -- TODO confirm).
small_nodup['drug_concept_label'] = small_nodup['drug_concept_label'] + small_nodup['readministration'].apply(lambda x: x*' ')

# Pivot from long to wide: one row per person and one column per
# administration number (0, 1, 2, ...), each cell holding the drug label.
# pivot() raises if a (person_id, readministration) pair occurs twice.
pivoted = small_nodup.pivot(index='person_id', columns='readministration', values='drug_concept_label').reset_index()
# Prefix every column with 'drug' so the numbered columns become
# 'drug0', 'drug1', ...; this also turns 'person_id' into 'drugperson_id'.
prefixed = pivoted.add_prefix('drug')
# Restore the identifier column's name.  'readministration' is only the
# name of the columns axis here, not a column label, so it needs no entry
# in this mapping.
df = prefixed.rename(columns={"drugperson_id": "person_id"})

In [5]:
# Count how many persons have each drug label in the 'drug0' column, i.e. at
# administration number 0.
df['drug0'].value_counts()

drug0
cyclophosphamide    287
doxorubicin          79
paclitaxel           74
leuprolide           47
azacitidine          33
triptorelin          14
methotrexate          4
Name: count, dtype: int64