In [None]:
# synpuff.ipynb content
import pandas as pd
pd.set_option("display.max_rows", None, "mode.chained_assignment", None)
from itertools import chain

import numpy as np
import datetime as dt

In [None]:
from dash import Dash, html, dcc, Input, Output, callback
import plotly.express as px
import plotly
import plotly.offline
import plotly.graph_objs as go

In [None]:
'''Georgie's code'''
# Copy the data to your drive, then point `folder` at it (keep the trailing
# slash). Every table below is read relative to this one setting, so changing
# it is enough to relocate the whole dataset.
folder = 'synpuff/'

# base tables for generating the cohort
concept = pd.read_csv(folder + 'CONCEPT.csv')
condition_occurrence = pd.read_csv(folder + 'CONDITION_OCCURRENCE.csv')
drug_exposure = pd.read_csv(folder + 'DRUG_EXPOSURE.csv')
observation = pd.read_csv(folder + 'OBSERVATION.csv')
person = pd.read_csv(folder + 'PERSON.csv')
procedure_occurrence = pd.read_csv(folder + 'PROCEDURE_OCCURRENCE.csv')
# HemOnc hierarchy and component-property tables
hierarchy = pd.read_csv(folder + 'hierarchy.csv')
props = pd.read_csv(folder + 'hemonc_component_properties.csv')

In [None]:
'''Ivy's code'''
# Restrict drug exposures to drugs whose HemOnc component class is in `sact`.
#rxnorm = props[props['vocabulary_id']=='RxNorm']
#list of valid drug categories from Ivy from RxNorm/HemOnc
# NOTE(review): 'Anti-CD38 antibody' is listed twice (harmless for isin).
# NOTE(review): 'Aromatase inhibitorsthird generation' and
# 'Human DNA synthesisinhibitor' look like they are missing a space; confirm
# against the values in props['component_class_name'] before changing them.
sact=['Alkylating agent', 'Anti-CD38 antibody', 'Anti-CTLA-4 antibody', 'Anti-TACSTD2 antibody-drug conjugate', 'Anthracycline', 'Antiandrogen', 'Antifolate',
'Antimetabolite', 'Antitumor antibiotic', 'Anti-CD52 antibody', 'Anti-CD20 antibody', 'Anti-EGFR antibody', 'Anti-HER2 antibody', 'Anti-CD38 antibody', 'Anti-PD-1 antibody',
'Anti-PD-L1 antibody', 'Anti-RANKL antibody', 'Anti-SLAMF7 antibody','Anti-VEGF antibody', 'Aromatase inhibitor', 'Aromatase inhibitorsthird generation',
'Biosimilar', 'BRAF inhibitor', 'DNA methyltransferase inhibitor', 'Deoxycytidine analog', 'EGFR inhibitor', 'ERBB 2 inhibitor', 'Estrogen receptor inhibitor',
'Folic acid analog', 'Fluoropyrimidine', 'GnRH agonist', 'HDAC inhibitor', 'Human DNA synthesisinhibitor', 'Microtubule inhibitor', 'MTOR inhibitor',
'Nitrogen mustard', 'Nitrosourea', 'Neutral', 'PARP inhibitor', 'PARP1 inhibitor', 'PARP2 inhibitor', 'Phenothiazine', 'Platinum agent', 'Proteasome inhibitor',
'Purine analog', 'Pyrimidine analog', 'RANK ligand inhibitor', 'Selective estrogen receptor modulator', 'Somatostatin analog', 'T-cell activator',
'Targeted therapeutic', 'Taxane', 'Topoisomerase I inhibitor', 'Topoisomerase II inhibitor', 'Triazene', 'Vinca alkaloid', 'Xanthine oxidase inhibitor',
'WHO Essential Cancer Medicine']
#rxnorm = rxnorm[rxnorm['component_class_name'].isin(sact)]
# Keep only the property rows whose class is in the list above (rebinds `props`).
props=props[props['component_class_name'].isin(sact)]
# concept_id_2 values of the retained rows; matched against drug_concept_id below.
antican = props['concept_id_2']
# Keep only exposures of those concepts (rebinds `drug_exposure`).
drug_exposure=drug_exposure[drug_exposure['drug_concept_id'].isin(antican)]
#rxnorm['component_class_name'].value_counts()

In [None]:

# Lookup table from concept ID to human-readable concept name, used to label
# the *_concept_id columns of the other tables.
concept_lookup = dict(zip(concept['concept_id'], concept['concept_name']))


In [None]:
def make_labels(df, lookup=None):
    """Return a copy of ``df`` with concept IDs replaced by readable labels.

    For every column whose name contains ``'concept_id'`` a sibling column is
    appended, named by replacing ``'_id'`` with ``'_label'``, holding the
    mapped concept names (IDs missing from the lookup become NaN). The
    following columns are then dropped: every ``concept_id`` column, every
    column whose name contains ``'source'``, and every column with no
    non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to label. It is not modified.
    lookup : dict, optional
        Mapping of concept ID to concept name. Defaults to the module-level
        ``concept_lookup``, resolved at call time.

    Returns
    -------
    pandas.DataFrame
        New frame: surviving original columns in their original order,
        followed by the label columns in the order they were created.
    """
    if lookup is None:
        lookup = concept_lookup

    # Work on a copy: the caller's frame (often a filtered slice of a raw
    # table) must not gain label columns as a side effect.
    labelled = df.copy()
    to_drop = []
    for col in df.columns:
        is_concept = 'concept_id' in col
        if is_concept:
            labelled[col.replace('_id', '_label')] = df[col].map(lookup)
        if is_concept or 'source' in col or not df[col].notna().any():
            to_drop.append(col)
    return labelled.drop(columns=to_drop)


In [None]:
# Build a labelled version of each table with make_labels: concept ID columns
# are replaced by *_label columns, and source / all-empty columns are dropped.
condition_occurrence_labelled = make_labels(condition_occurrence)
drug_exposure_labelled = make_labels(drug_exposure)
observation_labelled = make_labels(observation)
person_labelled = make_labels(person)
procedure_occurrence_labelled = make_labels(procedure_occurrence)

In [None]:
# Number of exposure rows per drug label among the retained exposures.
drug_exposure_labelled['drug_concept_label'].value_counts()

In [None]:
'''Applying extra filters to drug df'''
# Parse the start date once, store it back, and derive the calendar year
# from the parsed values.
start_dates = pd.to_datetime(drug_exposure_labelled['drug_exposure_start_date'], format='%Y-%m-%d')
drug_exposure_labelled['drug_exposure_start_date'] = start_dates
drug_exposure_labelled['drug_exposure_year'] = start_dates.dt.year

In [None]:
# Drop exposures whose drug label is on the exclusion list.
exclusions = ['dexamethasone']
keep_rows = ~drug_exposure_labelled['drug_concept_label'].isin(exclusions)
drug_exposure_labelled = drug_exposure_labelled[keep_rows]


In [None]:
# The snippet below is disabled: it lives inside a string literal, so none of
# it runs. As the only statement in the cell, the string itself is the cell's
# displayed value.
'''if required, mask by a particular condition or set of conditions

# filter only by occurrences of Squamous cell carcinoma, NOS, of glottis
glottis = condition_occurrence[condition_occurrence.condition_concept_id==44500236]
# patient IDs matching this occurrence
glottis_patients = glottis.person_id.tolist()
# mask the drug exposures only by people matching the condition
mask = drug_exposure_labelled['person_id'].isin(glottis_patients)
masked = drug_exposure_labelled[mask]
'''

In [None]:
'''Data Linkage'''
# Attach birth year and gender to every drug exposure row; the left join keeps
# exposures even when the person has no match.
person_labelled_small = person_labelled[['person_id', 'year_of_birth', 'gender_concept_label']]
drug_persons = drug_exposure_labelled.merge(person_labelled_small, how='left', on='person_id')


In [None]:
# Number each person's condition rows cond_0, cond_1, ... in their existing
# row order, so they can later be spread into one column per occurrence.
condition_labelled_small = condition_occurrence_labelled[['person_id', 'condition_concept_label']]
occurrence_rank = condition_labelled_small.groupby('person_id').cumcount()
condition_labelled_small['occ_number'] = 'cond_' + occurrence_rank.astype(str)
condition_labelled_small.head()


In [None]:
#drug_persons[drug_persons['person_id']==2310508].head()

In [None]:
# Age at treatment, computed as exposure year minus birth year.
drug_persons['age_at_treatment'] = drug_persons['drug_exposure_year'].sub(drug_persons['year_of_birth'])

In [None]:
# One row per person with their conditions spread across cond_0, cond_1, ...
# columns, then left-joined onto the drug rows.
cond_pivot = (
    condition_labelled_small
    .pivot(index='person_id', columns='occ_number', values='condition_concept_label')
    .reset_index()
)
drug_persons = drug_persons.merge(cond_pivot, how='left', on='person_id')

In [None]:
# Preview the linked drug / person / condition frame.
drug_persons.head()

In [None]:
'''Shrinking dataframe'''
#reduce DF down to relevant variables for the visualization
# .copy() so the label rewrite below writes to an independent frame rather
# than to a slice of drug_persons.
small = drug_persons[['person_id', 'drug_exposure_start_datetime', 'drug_concept_label', 'drug_exposure_year', 'gender_concept_label', 'age_at_treatment', 'cond_0', 'cond_1', 'cond_2', 'cond_3', 'cond_4', 'cond_5', 'cond_6']].copy()
# Sort by label first so each combined label lists its drugs alphabetically.
small_sorted = small.sort_values('drug_concept_label')
# Drugs given to the same person at the same timestamp become one label,
# e.g. 'carboplatin & paclitaxel'. unique() keeps a drug recorded twice at the
# same timestamp from appearing twice in the label. Assignment aligns on the
# row index, so the values land on the matching rows of `small`.
small['drug_concept_label'] = small_sorted.groupby(['person_id', 'drug_exposure_start_datetime'])['drug_concept_label'].transform(lambda x : ' & '.join(x.unique()))

# De-duplicate the frame that carries the combined labels. De-duplicating
# `small_sorted` instead would discard them, because it was created before
# the labels were combined.
small_nodup = small.drop_duplicates()
small['person_id'].value_counts()
#small_nodup['drug_concept_label']=small_nodup['drug_concept_label'].str.replace('& ', '&<br>')

In [None]:
# Per-row administration counter, initialised to zero and aligned with the
# index of small_nodup; filled in per patient afterwards.
readministrations = pd.Series(0, index=small_nodup.index, dtype=int)

In [None]:
# For each patient, number their drug administrations in chronological order
# and store the numbers in `readministrations`.
all_id = small_nodup['person_id'].unique()
id_administrations = {}  # NOTE(review): not used anywhere in this cell
for pid in all_id:
    # All rows (drug administrations) recorded for this patient
    patient = small_nodup.loc[small_nodup['person_id']==pid]
    administrations_sorted = pd.to_datetime(patient['drug_exposure_start_datetime'], format='%Y-%m-%d %H:%M:%S').sort_values()

    # True where the gap since the patient's previous administration is under
    # 6000 days. NOTE(review): presumably chosen so the counter never resets
    # within the data's date range; confirm.
    frequency = administrations_sorted.diff()<dt.timedelta(days=6000)

    # Running counter: starts at 0, increments while the gap test is True and
    # drops back to 0 when it is False. The first element of `frequency` is skipped.
    n_administrations = [0]
    for v in frequency.values[1:]:
       n_administrations.append((n_administrations[-1]+1)*v)

    # Write the counters back at the row labels of this patient's administrations
    readministrations.loc[administrations_sorted.index] = n_administrations


In [None]:
# Attach the per-patient administration counter as a column (aligned on the row index).
small_nodup['readministration'] = readministrations

In [None]:
# Append one trailing space per administration number to each label.
# NOTE(review): presumably this keeps the same drug at different steps from
# collapsing into a single Sankey node; confirm. Re-running this cell pads again.
label_padding = small_nodup['readministration'].map(lambda steps: steps * ' ')
small_nodup['drug_concept_label'] = small_nodup['drug_concept_label'] + label_padding

In [None]:
# Preview the de-duplicated frame with its administration counter and padded labels.
small_nodup.head()

In [None]:
# Long to wide: one row per person, one column per administration number.
pivoted = (
    small_nodup
    .pivot(index='person_id', columns='readministration', values='drug_concept_label')
    .reset_index()
)
# Prefix every column with 'drug' (0 -> 'drug0', ...); this also renames
# person_id, which the rename below restores.
prefixed = pivoted.add_prefix('drug')
# Each row carries a constant count of 1 so the visualization can sum patients.
df = prefixed.rename(columns={"drugperson_id": "person_id", "readministration":"index"}).assign(count=1)

In [None]:
# Preview the wide frame before its columns are prefixed.
pivoted.head()

In [None]:
#df = renamed
# Displays the whole wide frame. NOTE(review): display.max_rows is set to None
# at the top of the notebook, so every row is rendered; consider df.head().
df

In [None]:
def genSankey(df, cat_cols=[], value_cols='', title='Sankey Diagram'):
    """Build a plotly Sankey figure from consecutive categorical columns.

    Each adjacent pair of columns in ``cat_cols`` contributes links from the
    value in the left column to the value in the right column, weighted by
    the sum of ``value_cols``. Rows with a missing value on either side of a
    pair contribute no link for that pair.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per unit (e.g. patient), holding the stage columns and the
        value column.
    cat_cols : list of str
        Stage columns in left-to-right order. The argument is not mutated.
    value_cols : str
        Name of the numeric column summed to obtain link widths.
    title : str
        Title applied to the figure layout.

    Returns
    -------
    plotly.graph_objs.Figure
        With fewer than two stage columns the figure has nodes but no links.
    """
    cat_cols = list(cat_cols)

    # Node labels: every distinct non-missing value across the stage columns,
    # in first-seen order. Missing values are left out because they can never
    # be the end point of a link.
    labelList = list(dict.fromkeys(
        label for catCol in cat_cols for label in df[catCol].dropna()
    ))

    # Colour nodes by the first three characters of their label, so labels
    # sharing that prefix share a colour. The index is the rank of the prefix
    # among the sorted distinct prefixes, wrapped around the palette length.
    palette = px.colors.qualitative.Set1
    prefixes = [str(label)[:3] for label in labelList]
    prefix_rank = {prefix: rank for rank, prefix in enumerate(sorted(set(prefixes)))}
    # One colour per label, in label order, so colours stay aligned with nodes.
    node_colors = [palette[prefix_rank[prefix] % len(palette)] for prefix in prefixes]

    # transform df into source-target pairs, one frame per adjacent column pair
    link_frames = []
    for source_col, target_col in zip(cat_cols, cat_cols[1:]):
        pair = df[[source_col, target_col, value_cols]].copy()
        pair.columns = ['source', 'target', 'count']
        link_frames.append(pair)

    if link_frames:
        sourceTargetDf = (
            pd.concat(link_frames)
            .groupby(['source', 'target'])
            .agg({'count': 'sum'})
            .reset_index()
        )
    else:
        # Fewer than two stage columns: there is nothing to link.
        sourceTargetDf = pd.DataFrame(columns=['source', 'target', 'count'])

    # add index for source-target pairs (dict lookup instead of list.index per row)
    label_index = {label: position for position, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(label_index)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(label_index)

    #sankey format/color specs by Rob Raymond on StackOverflow
    fig = go.Figure(
        go.Sankey(
            node={
                "label": labelList,
                "color": node_colors,
            },
            link={
                "source": sourceTargetDf["sourceID"],
                "target": sourceTargetDf["targetID"],
                "value": sourceTargetDf["count"],
            },
        )
    )
    fig.update_layout(title_text=title)

    return fig

In [None]:
'''dash'''
# The Dash application object; its layout is assigned and it is run further down.
app = Dash(__name__)

In [None]:
# Page layout: a title, a narrow left-hand filter menu (three checklists) and
# a main area holding the Sankey graph with a slider underneath. The component
# ids below are the ones the update_graph callback listens to.
app.layout = html.Div(children=[
    html.H1(children='Sankey', style={'textAlign':'center'}),

    #Left menu
    html.Div([
        html.P(children='First Drug', style={'textAlign':'left'}),
        # first treatment: first list = options, second list = initially ticked.
        # NOTE(review): the drug names are hard-coded rather than derived from the data.
        dcc.Checklist(
            ['fluorouracil',
            'capecitabine',
            'cisplatin',
            'docetaxel',
            'gemcitabine',
            'carboplatin',
            'pembrolizumab',
            'paclitaxel',
            'allopurinol',
            'promethazine',
            'cetuximab',
            'prochlorperazine',
            'pemetrexed',
            'epirubicin',
            'etoposide',
            'prednisolone',
            'azacitidine',
            'vinorelbine',
            'cemiplimab',
            'dacarbazine',
            'rituximab',
            'leucovorin',
            'oxaliplatin',
            'zoledronic acid'],
            ['cisplatin','carboplatin'],
 id='first_treatment'
        ),
        html.P(children='Gender', style={'textAlign':'left'}),
        # gender filter: both values ticked initially
        dcc.Checklist(
            ['MALE', 'FEMALE'],
            ['MALE', 'FEMALE'],
 id='person_gender'
        ),
        html.P(children='Treatment Year', style={'textAlign':'left'}),
        # treatment year filter: options 2008-2022, initially ticked 2012-2022
        dcc.Checklist(
            [2008, 2009, 2010, 2011, 2012, 
            2013, 2014, 2015, 2016, 2017,
            2018, 2019, 2020, 2021, 2022],
            [2012, 
            2013, 2014, 2015, 2016, 2017,
            2018, 2019, 2020, 2021, 2022],
 id='treatment_year'
        )
], style={'display':'inline-block', 'width':'10%'}),

    #Right main
    html.Div(
        [
            #Graph container
            html.Div(
                dcc.Graph(
                    id='hn_sankey'
                    )
            ),

            #Slider container
            # The slider value reaches update_graph as slider_value, where it
            # limits how many drug columns are drawn.
            # NOTE(review): max=118 is presumably the largest administration
            # number in the data; confirm.
            html.Div(
                dcc.Slider(
                    min=0,max=118,
                    value=10,
                    id='sankey_slider'
                    )
            )
        ], style={'display':'inline-block', 'width':'90%', 'height':'100%'}
    )
])

In [None]:
#controls
@callback(
    Output(component_id='hn_sankey', component_property='figure', allow_duplicate=True),
    Input(component_id='person_gender', component_property='value'),
    Input(component_id='treatment_year', component_property='value'),
    Input(component_id='first_treatment', component_property='value'),
    Input(component_id='sankey_slider', component_property='value'),
    prevent_initial_call=True
)
def update_graph(selected_genders, selected_years, selected_treatments, slider_value):
    """Rebuild the Sankey figure from the current filter selections.

    Parameters
    ----------
    selected_genders : list
        Ticked values of the gender checklist.
    selected_years : list
        Ticked values of the treatment-year checklist.
    selected_treatments : list
        Ticked first-drug names; only patients whose first administration
        (column ``drug0``) is among them are drawn.
    slider_value : int
        Highest administration step to draw; ``slider_value + 1`` drug
        columns are passed to ``genSankey``.

    Returns
    -------
    plotly.graph_objs.Figure
        The Sankey figure, or an empty figure when the filters leave no rows
        or no first-administration column.
    """
    # Filter into a differently named local. Rebinding `nodup` inside this
    # function would make the name local for the whole body, so the first
    # read would raise UnboundLocalError instead of using the module-level frame.
    filtered = nodup[nodup['gender_concept_label'].isin(selected_genders or [])]
    filtered = filtered[filtered['drug_exposure_year'].isin(selected_years or [])]
    if filtered.empty:
        return go.Figure()

    #reshape DF
    #pivot the DF from long to wide
    pivoted = filtered.pivot(index='person_id', columns='readministration', values='drug_concept_label').reset_index()
    # add the prefix 'drug' to every instance
    prefixed = pivoted.add_prefix('drug')
    #remove the word 'drug' from other variables
    df = prefixed.rename(columns={"drugperson_id": "person_id", "readministration":"index"})
    #add a value of 1 to all data points for sums in the visualization
    df["count"] = 1

    # The year filter can remove every first administration; without a
    # 'drug0' column there is nothing to select on.
    if 'drug0' not in df.columns:
        return go.Figure()

    #other filters
    # Select stage columns by name so the helper 'count' column can never be
    # picked up as a stage when the slider exceeds the number of drug columns.
    drug_cols = [c for c in df.columns if c not in ('person_id', 'count')]
    drug_num = drug_cols[:int(slider_value or 0) + 1]
    dff = df[df['drug0'].isin(selected_treatments or [])]
    return genSankey(dff, cat_cols=drug_num, value_cols='count', title='Sankey Diagram')

In [None]:
# Module-level frame that update_graph filters on each callback. It is bound
# after the callback is defined, but before the server is started.
nodup = small_nodup

In [None]:
#run
# Start the Dash server (with debug=True) only when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)