In [1]:

import altair as alt
import pandas as pd
import copy
import json
# Select Altair's 'csv' data transformer. Per the Altair documentation this
# writes each chart's data to a CSV file and references it by URL instead of
# embedding the rows inline in the chart specification.
alt.data_transformers.enable('csv')
# Use Altair's default renderer for displaying charts in the notebook.
alt.renderers.enable('default')

# Side length passed as both height and width to every distribution chart below.
vis_size = 400

In [2]:
def normalizeColumn(dataframe, column_name,real_size, synt_size):
    """Count the values of one column per cohort and turn counts into shares.

    Rows are grouped by ``(column_name, "Real")``. Each group's size is
    divided by ``real_size`` when ``Real == 1`` and by ``synt_size`` when
    ``Real == 0``; groups with any other ``Real`` value are dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table; must contain ``column_name`` and a ``"Real"`` column.
        It is only read, never modified.
    column_name : str
        Column whose value distribution is computed.
    real_size : int or float
        Denominator applied to the counts of the ``Real == 1`` rows.
    synt_size : int or float
        Denominator applied to the counts of the ``Real == 0`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``[column_name, "Real", "count", "prob"]`` with the
        ``Real == 1`` rows first, followed by the ``Real == 0`` rows. Index
        labels are the positions of the groups in the sorted group-by result.
    """
    # One row per (value, Real) pair. groupby does not mutate its input, so no
    # defensive copy of the frame is needed.
    counts = (
        dataframe.groupby([column_name, "Real"])
        .size()
        .reset_index(name="count")
    )

    # .assign returns new frames, so nothing is written into a filtered slice
    # (which is what triggers SettingWithCopyWarning).
    real = counts[counts["Real"] == 1].assign(
        prob=lambda part: part["count"] / real_size
    )
    synt = counts[counts["Real"] == 0].assign(
        prob=lambda part: part["count"] / synt_size
    )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([real, synt])
    return combined[[column_name, "Real", "count", "prob"]]

def generateDistributionVisualization(df,size,column,data_type):
    """Build a square bar chart of ``prob`` per value of ``column``.

    Bars are coloured by the ``Real`` field and are not stacked. The legend
    labels the value 0 as 'Synthetic' and anything else as 'Real'. A
    legend-bound selection on ``Real`` drives bar opacity: 0.5 when the
    selection condition holds, 0 otherwise.

    Parameters
    ----------
    df : data accepted by ``alt.Chart``
        Must provide ``column``, ``Real`` and ``prob`` fields.
    size : number
        Used for both the chart height and width.
    column : str
        Field on the x axis; also used as the chart title.
    data_type : str
        Altair type of the x field. When it is "nominal" the x axis is
        sorted by descending y; otherwise ``sort`` is passed as ``None``.

    Returns
    -------
    The Altair chart with the selection attached.
    """
    # Created first so it can be referenced by the opacity condition and
    # attached to the chart at the end.
    legend_toggle = alt.selection_multi(fields=['Real'], bind='legend', empty="all")

    x_sort = "-y" if data_type == "nominal" else None
    x_axis = alt.X(field=column, type=data_type, sort=x_sort)
    y_axis = alt.Y("prob:Q", stack=None)

    cohort_colour = alt.Color(
        'Real:N',
        scale=alt.Scale(range=['#af8dc3', '#7fbf7b']),
        title="Type",
        legend=alt.Legend(labelExpr="datum.value == 0 ? 'Synthetic' : 'Real'"),
    )
    bar_opacity = alt.condition(legend_toggle, alt.value(0.5), alt.value(0))

    bars = alt.Chart(df).mark_bar().encode(
        x=x_axis,
        y=y_axis,
        color=cohort_colour,
        opacity=bar_opacity,
    )
    sized = bars.properties(height=size, width=size, title=column)
    return sized.add_selection(legend_toggle)

In [3]:
## GENDER AND YEAROFBIRTH ##
# Patient table holding both cohorts, distinguished by the Real column.
df_pop = pd.read_csv("../data/CleanData/EHRPatientClean.csv")

# Row counts per cohort (Real == 1 and Real == 0); these are the denominators
# normalizeColumn uses to convert counts into shares.
real_size = len(df_pop.loc[df_pop["Real"] == 1])
synt_size = len(df_pop.loc[df_pop["Real"] == 0])

# Gender distribution, drawn as a nominal axis.
df_gender = normalizeColumn(df_pop, "Gender", real_size, synt_size)
gender_chart = generateDistributionVisualization(
    df_gender, vis_size, "Gender", "nominal"
)

# Year-of-birth distribution, drawn as an ordinal axis.
df_year_of_birth = normalizeColumn(df_pop, "YearOfBirth", real_size, synt_size)
year_of_birth_chart = generateDistributionVisualization(
    df_year_of_birth, vis_size, "YearOfBirth", "ordinal"
)

In [4]:
## DIAGNOSIS ##
# Diagnosis table holding both cohorts, distinguished by the Real column.
df_diagnosis = pd.read_csv("../data/CleanData/EHRDiagnosisClean.csv")

# Row counts per cohort, used as the normalization denominators.
real_diag_size = len(df_diagnosis.loc[df_diagnosis["Real"] == 1])
synt_diag_size = len(df_diagnosis.loc[df_diagnosis["Real"] == 0])

# NOTE: from here on df_diagnosis refers to the normalized per-value table,
# no longer to the raw CSV contents.
df_diagnosis = normalizeColumn(
    df_diagnosis, "Diagnosis", real_diag_size, synt_diag_size
)

diagnosis_chart = generateDistributionVisualization(
    df_diagnosis, vis_size, "Diagnosis", "nominal"
)

In [5]:
## ALLERGY ##
# Allergy table holding both cohorts, distinguished by the Real column.
df2 = pd.read_csv("../data/CleanData/EHRAllergyClean.csv")

# Row counts per cohort, used as the normalization denominators.
real_allergy_size = len(df2.loc[df2["Real"] == 1])
synt_allergy_size = len(df2.loc[df2["Real"] == 0])

# Per-value shares of the Allergy column, then the matching bar chart.
df_allergy = normalizeColumn(
    df2, "Allergy", real_allergy_size, synt_allergy_size
)
allergy_chart = generateDistributionVisualization(
    df_allergy, vis_size, "Allergy", "nominal"
)

In [6]:
# Stack the four distribution charts vertically. The colour and opacity
# scales are resolved independently for each sub-chart.
# resolve_scale is used instead of hand-building alt.Resolve: the previous
# code filled the `scale` entry with a LegendResolveMap, which is the schema
# class for legend resolution, not scale resolution.
final = alt.vconcat(
    diagnosis_chart,
    year_of_birth_chart,
    gender_chart,
    allergy_chart,
).resolve_scale(color='independent', opacity='independent')
final

In [663]:
# Write the combined chart to ehr_comp.json, relative to the notebook's
# working directory.
# NOTE(review): with the 'csv' data transformer enabled above, the saved spec
# presumably references external CSV files rather than embedding the data -
# confirm those files are shipped alongside the JSON.
final.save("ehr_comp.json")