In [1]:
import os
import csv
import umap
import numpy as np
import pandas as pd
import matplotlib
from bokeh.plotting import figure, output_file, save, ColumnDataSource, show, output_notebook
from bokeh.models import CategoricalColorMapper, HoverTool

In [2]:
# Root directory of the data; every other path in this notebook is joined onto it.
DATA_PATH = os.path.expanduser('~/data1/stratification_ILRM/data_v3')
# Experiment folder (relative to DATA_PATH) that outer_viz reads encodings and
# predicted cluster labels from.
data_folder = 'experiments/ehr-804370-test-1'
# data_folder = 'experiments/ehr-804371-test-2'

# Experiment folder (relative to DATA_PATH) that inner_viz reads from.
oneMdata_folder = 'experiments/ehr-1608741'

# NOTE(review): dm_file is not referenced anywhere in the visible notebook.
dm_file = 'patient-details.csv'
# enc_file_tr = 'encodings/TRconvae-avg_vect.csv'
# Encodings CSV: read with a header row, column 0 as MRN, remaining columns as the vector.
enc_file = 'encodings/convae-avg_vect.csv'
# Text file with one predicted cluster label per line (passed to outer_viz as cl_lab).
cl_lab_snomed = 'encodings/cl-subsampling/outer-cl-convae-snomed-it60.txt'
# cl_lab_ccs = 'encodings/outer-cl-convae-ccs-single.txt'

In [3]:
# CSV (relative to DATA_PATH) passed to outer_viz as ds_dic: after a header row,
# column 0 is used as the patient key and column 1 as the disease label.
ds_snomed = 'snomed_subsampling/patient-5000-disease-snomed-it60.csv'
# ds_ccs = 'patient-5000-disease-ccs-single.csv'

In [4]:
# Named CSS4 colours from matplotlib; the plotting code iterates over the names (keys).
col_dict = matplotlib.colors.CSS4_COLORS
# Colour names excluded when the plotting palette is built
# (`[c for c in col_dict if c not in c_out]`).
# NOTE(review): presumably these are too pale to read on a white background - confirm.
c_out = ['bisque', 'mintcream', 'cornsilk', 'lavenderblush', 'aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure',
         'beige', 'powderblue', 'floralwhite', 'ghostwhite', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow',
         'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue',
         'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'linen', 'palegoldenrod', 'palegreen',
         'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'mistyrose', 'lemonchiffon', 'lightblue',
         'seashell', 'white', 'blanchedalmond', 'oldlace', 'moccasin', 'snow', 'darkgray', 'ivory', 'whitesmoke']

In [5]:
def outer_viz(ds_dic,
              enc_data,
              cl_lab,
              mrn=None):
    """Project patient encodings to 2-D with UMAP and draw two scatter plots.

    The first plot colours each patient by the disease label found in
    ``ds_dic``; the second colours the same points by the predicted cluster
    label read from ``cl_lab``.

    Parameters
    ----------
    ds_dic : str
        Path, relative to ``DATA_PATH``, of a CSV with a header row. Column 0
        is used as the patient key (MRN) and column 1 as the disease label.
    enc_data : str
        Path, relative to ``DATA_PATH/data_folder``, of the encodings.
        When ``mrn`` is None it is read as a CSV with a header row, column 0
        being the MRN and the remaining columns the encoding vector.
        Otherwise it is loaded with ``np.load`` and its rows are matched
        positionally to the lines of the ``mrn`` file.
    cl_lab : str
        Path, relative to ``DATA_PATH/data_folder``, of a text file holding
        one predicted cluster label per line. Labels are matched positionally
        to the retained patients and must parse as ``int`` because they index
        the colour palette.
    mrn : str or None
        None, or the path (relative to ``DATA_PATH/data_folder``) of a file
        whose first CSV column holds one MRN per row of the ``enc_data``
        matrix.

    Raises
    ------
    ValueError
        If no patient of the encodings appears in ``ds_dic``, or if the number
        of predicted labels differs from the number of retained patients.
    """
    with open(os.path.join(DATA_PATH, ds_dic)) as f:
        rd = csv.reader(f)
        next(rd)  # skip the header row
        snomed_dct = {r[0]: r[1] for r in rd}

    enc_path = os.path.join(DATA_PATH, data_folder, enc_data)
    if mrn is None:
        kept_mrn = []
        rows = []
        with open(enc_path) as f:
            rd = csv.reader(f)
            next(rd)  # skip the header row
            for r in rd:
                if r[0] in snomed_dct:
                    kept_mrn.append(r[0])
                    rows.append(r[1:])
        # csv yields strings; parse them once here so a malformed value fails
        # at load time rather than somewhere inside the UMAP fit.
        convae_mtx = np.asarray(rows, dtype=np.float32)
    else:
        # Previously this path left `convae_mtx` unbound (NameError). It now
        # mirrors the matrix + MRN-file loading of the baseline outer_viz
        # defined later in this notebook.
        full_mtx = np.load(enc_path)
        kept_mrn = []
        kept_idx = []
        with open(os.path.join(DATA_PATH, data_folder, mrn)) as f:
            for idx, r in enumerate(csv.reader(f)):
                # `r` is empty for blank lines; they still count as a row index.
                if r and r[0] in snomed_dct:
                    kept_mrn.append(r[0])
                    kept_idx.append(idx)
        convae_mtx = full_mtx[np.asarray(kept_idx, dtype=int)]
    mrn = kept_mrn

    if not mrn:
        raise ValueError(
            'no patient in {0!r} is listed in {1!r}'.format(enc_data, ds_dic))

    with open(os.path.join(DATA_PATH, data_folder, cl_lab)) as f:
        pred_cl = f.read().splitlines()
    # Fail before the expensive UMAP fit instead of at DataFrame construction.
    if len(pred_cl) != len(mrn):
        raise ValueError(
            '{0!r} holds {1} labels but {2} patients were retained'.format(
                cl_lab, len(pred_cl), len(mrn)))

    dis_pt = [snomed_dct[m] for m in mrn]
    unique, counts = np.unique(dis_pt, return_counts=True)
    for a, b in zip(unique, counts):
        print(a, b)

    umap_tr = umap.UMAP(random_state=42, n_neighbors=200, min_dist=0.0)
    umap_mtx = umap_tr.fit_transform(convae_mtx)

    print(len(umap_mtx), len(dis_pt), umap_mtx.shape)

    xs = umap_mtx[:, 0].tolist()
    ys = umap_mtx[:, 1].tolist()
    df = pd.DataFrame({'mrn': mrn, 'x': xs, 'y': ys, 'ds_class': dis_pt})
    df_pred = pd.DataFrame({'mrn': mrn, 'x': xs, 'y': ys, 'ds_class': pred_cl})

    colormap = [c for c in col_dict if c not in c_out]
    # One colour per distinct disease, in the sorted-label order that
    # scatter_plot uses for its factors.
    colormap_rid = [colormap[i] for i in range(len(set(dis_pt)))]
    # Predicted labels are sorted as strings (matching scatter_plot's factor
    # order) and each label's integer value picks its colour.
    colormap_rid_pred = [colormap[int(cl)] for cl in sorted(set(pred_cl))]

    scatter_plot(df, colormap_rid)
    scatter_plot(df_pred, colormap_rid_pred)


def scatter_plot(df, col, insp_dis=None):
    """Show an interactive Bokeh scatter plot of an embedding in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``mrn`` and ``ds_class``.
    col : list of str
        Colour palette; entry ``i`` is used for the ``i``-th class label in
        sorted (string) order.
    insp_dis : str, optional
        When given, the figure is titled ``"Subclusters disease <insp_dis>"``.
        The parameter is optional so this definition serves both the
        two-argument calls made by outer_viz and the three-argument call made
        by inner_viz, whichever definition of scatter_plot was executed last.
    """
    # Class labels are stringified because the categorical mapper works on
    # string factors.
    labels = [str(i) for i in df['ds_class'].tolist()]
    source = ColumnDataSource(dict(
        x=df['x'].tolist(),
        y=df['y'].tolist(),
        mrn=df['mrn'].tolist(),
        ds_class=labels))

    # A set replaces pd.unique(list), which newer pandas releases deprecate
    # for plain-list input; the sorted factors are identical.
    cmap = CategoricalColorMapper(factors=sorted(set(labels)), palette=col)

    TOOLTIPS = [('mrn', '@mrn'),
                ('ds_class', '@ds_class')]

    plotTools = 'box_zoom, wheel_zoom, pan,  crosshair, reset, save'

    # NOTE(review): plot_width/plot_height and legend= belong to the older
    # Bokeh API this notebook was written against - confirm the installed
    # version before upgrading Bokeh.
    fig_kwargs = dict(plot_width=800, plot_height=800, tools=plotTools)
    if insp_dis is not None:
        fig_kwargs['title'] = "Subclusters disease {0}".format(insp_dis)

    output_notebook()
    p = figure(**fig_kwargs)
    p.add_tools(HoverTool(tooltips=TOOLTIPS))
    p.circle('x', 'y', legend='ds_class', source=source,
             color={"field": 'ds_class', "transform": cmap})

    # The embedding axes carry no meaning, so hide ticks, tick labels and grid.
    for axis in (p.xaxis, p.yaxis):
        axis.major_tick_line_color = None
        axis.minor_tick_line_color = None
        axis.major_label_text_color = None
    p.grid.grid_line_color = None
    show(p)


## Internal clustering validation: subcluster visualization, separately for each disease (T2D, AD, PD, MM)

In [5]:
def inner_viz(insp_dis):
    """Plot the UMAP embedding of one disease cohort, coloured by subcluster.

    Reads ``cohort-<insp_dis>-innerval-labels.csv`` (column 0: MRN, column 1:
    subcluster label) and the encodings CSV named by the module-level
    ``enc_file``, both under ``DATA_PATH/oneMdata_folder``. Only patients
    listed in the labels file are embedded and plotted.

    Parameters
    ----------
    insp_dis : str
        Disease identifier; used in the labels file name and in the plot title.

    Raises
    ------
    ValueError
        If none of the cohort's MRNs is present in the encodings file.
    """
    labels_path = os.path.join(DATA_PATH, oneMdata_folder,
                               'cohort-{0}-innerval-labels.csv'.format(insp_dis))
    with open(labels_path) as f:
        # NOTE(review): no header row is skipped here, unlike the encodings
        # file below - confirm the labels file really has no header.
        mrn_subcl = {r[0]: r[1] for r in csv.reader(f)}

    # NOTE(review): `enc_file` is read from module scope at call time and is
    # parsed as CSV; a later cell rebinds it to a .npy path, so run this
    # before that cell.
    enc_data = {}
    with open(os.path.join(DATA_PATH, oneMdata_folder, enc_file)) as f:
        rd = csv.reader(f)
        next(rd)  # skip the header row
        for r in rd:
            if r[0] in mrn_subcl:  # dict lookup, not a list scan per row
                enc_data[r[0]] = r[1:]

    if not enc_data:
        raise ValueError(
            'no MRN of cohort {0!r} found in {1!r}'.format(insp_dis, enc_file))

    mrn = list(enc_data.keys())
    # csv yields strings; parse them once here so a malformed value fails at
    # load time rather than somewhere inside the UMAP fit.
    convae_mtx = np.asarray(list(enc_data.values()), dtype=np.float32)
    subcl_pt = [mrn_subcl[m] for m in mrn]

    umap_tr = umap.UMAP(random_state=42, n_neighbors=200, min_dist=0.0)
    umap_mtx = umap_tr.fit_transform(convae_mtx)

    print(len(umap_mtx), len(subcl_pt), umap_mtx.shape)

    df = pd.DataFrame({'mrn': mrn,
                       'x': umap_mtx[:, 0].tolist(),
                       'y': umap_mtx[:, 1].tolist(),
                       'ds_class': subcl_pt})

    colormap = [c for c in col_dict if c not in c_out]
    # One colour per subcluster actually plotted, so the palette lines up with
    # the sorted factors that scatter_plot derives from the DataFrame.
    colormap_rid = [colormap[i] for i in range(len(set(subcl_pt)))]

    scatter_plot(df, colormap_rid, insp_dis)


def scatter_plot(df, col, insp_dis=None):
    """Show an interactive Bokeh scatter plot of an embedding in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``mrn`` and ``ds_class``.
    col : list of str
        Colour palette; entry ``i`` is used for the ``i``-th class label in
        sorted (string) order.
    insp_dis : str, optional
        When given, the figure is titled ``"Subclusters disease <insp_dis>"``.
        It defaults to None so that the two-argument calls made by outer_viz
        keep working after this definition replaces the earlier scatter_plot
        in the kernel.
    """
    # Class labels are stringified because the categorical mapper works on
    # string factors.
    labels = [str(i) for i in df['ds_class'].tolist()]
    source = ColumnDataSource(dict(
        x=df['x'].tolist(),
        y=df['y'].tolist(),
        mrn=df['mrn'].tolist(),
        ds_class=labels))

    # A set replaces pd.unique(list), which newer pandas releases deprecate
    # for plain-list input; the sorted factors are identical.
    cmap = CategoricalColorMapper(factors=sorted(set(labels)), palette=col)

    TOOLTIPS = [('mrn', '@mrn'),
                ('ds_class', '@ds_class')]

    plotTools = 'box_zoom, wheel_zoom, pan,  crosshair, reset, save'

    # NOTE(review): plot_width/plot_height and legend= belong to the older
    # Bokeh API this notebook was written against - confirm the installed
    # version before upgrading Bokeh.
    fig_kwargs = dict(plot_width=800, plot_height=800, tools=plotTools)
    if insp_dis is not None:
        fig_kwargs['title'] = "Subclusters disease {0}".format(insp_dis)

    output_notebook()
    p = figure(**fig_kwargs)
    p.add_tools(HoverTool(tooltips=TOOLTIPS))
    p.circle('x', 'y', legend='ds_class', source=source,
             color={"field": 'ds_class', "transform": cmap})

    # The embedding axes carry no meaning, so hide ticks, tick labels and grid.
    for axis in (p.xaxis, p.yaxis):
        axis.major_tick_line_color = None
        axis.minor_tick_line_color = None
        axis.major_label_text_color = None
    p.grid.grid_line_color = None
    show(p)


In [6]:
# External validation on the ConvAE encodings: prints the per-disease patient
# counts and the embedding shape, then shows the two scatter plots.
outer_viz(ds_snomed, enc_file, cl_lab_snomed)

ADHD 2559
PD 2490
AD 2509
BC 2570
MM 1947
T2D 2534
CD 2469
PC 2501
19579 19579 (19579, 2)


In [7]:
# Subcluster embedding for the T2D cohort (reads cohort-T2D-innerval-labels.csv).
inner_viz('T2D')

In [None]:
# Subcluster embedding for the MM cohort (reads cohort-MM-innerval-labels.csv).
inner_viz('MM')

In [None]:
# Subcluster embedding for the PD cohort (reads cohort-PD-innerval-labels.csv).
inner_viz('PD')

In [None]:
# Subcluster embedding for the AD cohort (reads cohort-AD-innerval-labels.csv).
inner_viz('AD')

# Visualization of the baseline (TF-IDF) encodings

In [25]:
# NOTE(review): this cell rebinds DATA_PATH, data_folder, dm_file, enc_file and
# cl_lab_snomed from the earlier configuration cell. Functions defined above read
# these globals at call time (inner_viz parses `enc_file` as CSV), so re-running
# them after this cell uses the baseline values.
DATA_PATH = os.path.expanduser('~/data1/stratification_ILRM/data_v3')
data_folder = 'experiments/ehr-804371-test-2'
# NOTE(review): the commented alternative below is identical to the active value.
# data_folder = 'experiments/ehr-804371-test-2'

# NOTE(review): dm_file is not referenced anywhere in the visible notebook.
dm_file = 'patient-details.csv'
# enc_file_tr = 'encodings/TRconvae-avg_vect.csv'
# Encoding matrix stored as .npy; outer_viz loads it with np.load.
enc_file = 'encodings/tfidf-mtx.npy'
# File whose first column lists one MRN per row of the matrix above.
mrn_file = 'encodings/bs-mrn.txt'
# Text file with one predicted cluster label per line (passed to outer_viz as cl_lab).
cl_lab_snomed = 'encodings/cl-subsampling/outer-cl-tfidf-snomed-it59.txt'
# cl_lab_ccs = 'encodings/outer-cl-convae-ccs-single.txt'

In [26]:
# Rebinds ds_snomed to the it59 subsample; passed to outer_viz as ds_dic
# (header row, then column 0 = patient key, column 1 = disease label).
ds_snomed = 'snomed_subsampling/patient-5000-disease-snomed-it59.csv'
# ds_ccs = 'patient-5000-disease-ccs-single.csv'

In [27]:
# NOTE(review): this cell assigns the same col_dict / c_out values as the
# earlier colour-configuration cell in this notebook.
# Named CSS4 colours from matplotlib; the plotting code iterates over the names (keys).
col_dict = matplotlib.colors.CSS4_COLORS
# Colour names excluded when the plotting palette is built
# (`[c for c in col_dict if c not in c_out]`).
c_out = ['bisque', 'mintcream', 'cornsilk', 'lavenderblush', 'aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure',
         'beige', 'powderblue', 'floralwhite', 'ghostwhite', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow',
         'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue',
         'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'linen', 'palegoldenrod', 'palegreen',
         'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'mistyrose', 'lemonchiffon', 'lightblue',
         'seashell', 'white', 'blanchedalmond', 'oldlace', 'moccasin', 'snow', 'darkgray', 'ivory', 'whitesmoke']

In [28]:
def outer_viz(ds_dic,
              enc_data,
              cl_lab,
              mrn=mrn_file):
    """Project patient encodings to 2-D with UMAP and draw two scatter plots.

    The first plot colours each patient by the disease label found in
    ``ds_dic``; the second colours the same points by the predicted cluster
    label read from ``cl_lab``.

    Parameters
    ----------
    ds_dic : str
        Path, relative to ``DATA_PATH``, of a CSV with a header row. Column 0
        is used as the patient key (MRN) and column 1 as the disease label.
    enc_data : str
        Path, relative to ``DATA_PATH/data_folder``, of the encodings.
        When ``mrn`` is None it is read as a CSV with a header row, column 0
        being the MRN and the remaining columns the encoding vector.
        Otherwise it is loaded with ``np.load`` and its rows are matched
        positionally to the lines of the ``mrn`` file.
    cl_lab : str
        Path, relative to ``DATA_PATH/data_folder``, of a text file holding
        one predicted cluster label per line. Labels are matched positionally
        to the retained patients and must parse as ``int`` because they index
        the colour palette.
    mrn : str or None
        None, or the path (relative to ``DATA_PATH/data_folder``) of a file
        whose first CSV column holds one MRN per row of the ``enc_data``
        matrix. The default is the value ``mrn_file`` had when this cell was
        executed.

    Raises
    ------
    ValueError
        If no patient of the encodings appears in ``ds_dic``, or if the number
        of predicted labels differs from the number of retained patients.
    """
    with open(os.path.join(DATA_PATH, ds_dic)) as f:
        rd = csv.reader(f)
        next(rd)  # skip the header row
        snomed_dct = {r[0]: r[1] for r in rd}

    enc_path = os.path.join(DATA_PATH, data_folder, enc_data)
    if mrn is None:
        kept_mrn = []
        rows = []
        with open(enc_path) as f:
            rd = csv.reader(f)
            next(rd)  # skip the header row
            for r in rd:
                if r[0] in snomed_dct:
                    kept_mrn.append(r[0])
                    rows.append(r[1:])
        # csv yields strings; parse them once here so a malformed value fails
        # at load time rather than somewhere inside the UMAP fit.
        convae_mtx = np.asarray(rows, dtype=np.float32)
    else:
        full_mtx = np.load(enc_path)
        kept_mrn = []
        kept_idx = []
        with open(os.path.join(DATA_PATH, data_folder, mrn)) as f:
            for idx, r in enumerate(csv.reader(f)):
                # `r` is empty for blank lines; they still count as a row index.
                if r and r[0] in snomed_dct:
                    kept_mrn.append(r[0])
                    kept_idx.append(idx)
        # Select all retained rows in one indexing step instead of copying
        # each row through a Python list.
        convae_mtx = full_mtx[np.asarray(kept_idx, dtype=int)]
        print(len(kept_mrn))
        print(convae_mtx.shape)
    mrn = kept_mrn

    if not mrn:
        raise ValueError(
            'no patient in {0!r} is listed in {1!r}'.format(enc_data, ds_dic))

    with open(os.path.join(DATA_PATH, data_folder, cl_lab)) as f:
        pred_cl = f.read().splitlines()
    # Fail before the expensive UMAP fit instead of at DataFrame construction.
    if len(pred_cl) != len(mrn):
        raise ValueError(
            '{0!r} holds {1} labels but {2} patients were retained'.format(
                cl_lab, len(pred_cl), len(mrn)))

    dis_pt = [snomed_dct[m] for m in mrn]
    unique, counts = np.unique(dis_pt, return_counts=True)
    for a, b in zip(unique, counts):
        print(a, b)

    umap_tr = umap.UMAP(random_state=42, n_neighbors=200, min_dist=0.0)
    umap_mtx = umap_tr.fit_transform(convae_mtx)

    print(len(umap_mtx), len(dis_pt), umap_mtx.shape)

    xs = umap_mtx[:, 0].tolist()
    ys = umap_mtx[:, 1].tolist()
    df = pd.DataFrame({'mrn': mrn, 'x': xs, 'y': ys, 'ds_class': dis_pt})
    df_pred = pd.DataFrame({'mrn': mrn, 'x': xs, 'y': ys, 'ds_class': pred_cl})

    colormap = [c for c in col_dict if c not in c_out]
    # One colour per distinct disease, in the sorted-label order that
    # scatter_plot uses for its factors.
    colormap_rid = [colormap[i] for i in range(len(set(dis_pt)))]
    # Predicted labels are sorted as strings (matching scatter_plot's factor
    # order) and each label's integer value picks its colour.
    colormap_rid_pred = [colormap[int(cl)] for cl in sorted(set(pred_cl))]

    scatter_plot(df, colormap_rid)
    scatter_plot(df_pred, colormap_rid_pred)


def scatter_plot(df, col, insp_dis=None):
    """Show an interactive Bokeh scatter plot of an embedding in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``mrn`` and ``ds_class``.
    col : list of str
        Colour palette; entry ``i`` is used for the ``i``-th class label in
        sorted (string) order.
    insp_dis : str, optional
        When given, the figure is titled ``"Subclusters disease <insp_dis>"``.
        The parameter is optional so that this definition, which replaces any
        earlier scatter_plot in the kernel, still accepts the three-argument
        call made by inner_viz as well as the two-argument calls of outer_viz.
    """
    # Class labels are stringified because the categorical mapper works on
    # string factors.
    labels = [str(i) for i in df['ds_class'].tolist()]
    source = ColumnDataSource(dict(
        x=df['x'].tolist(),
        y=df['y'].tolist(),
        mrn=df['mrn'].tolist(),
        ds_class=labels))

    # A set replaces pd.unique(list), which newer pandas releases deprecate
    # for plain-list input; the sorted factors are identical.
    cmap = CategoricalColorMapper(factors=sorted(set(labels)), palette=col)

    TOOLTIPS = [('mrn', '@mrn'),
                ('ds_class', '@ds_class')]

    plotTools = 'box_zoom, wheel_zoom, pan,  crosshair, reset, save'

    # NOTE(review): plot_width/plot_height and legend= belong to the older
    # Bokeh API this notebook was written against - confirm the installed
    # version before upgrading Bokeh.
    fig_kwargs = dict(plot_width=800, plot_height=800, tools=plotTools)
    if insp_dis is not None:
        fig_kwargs['title'] = "Subclusters disease {0}".format(insp_dis)

    output_notebook()
    p = figure(**fig_kwargs)
    p.add_tools(HoverTool(tooltips=TOOLTIPS))
    p.circle('x', 'y', legend='ds_class', source=source,
             color={"field": 'ds_class', "transform": cmap})

    # The embedding axes carry no meaning, so hide ticks, tick labels and grid.
    for axis in (p.xaxis, p.yaxis):
        axis.major_tick_line_color = None
        axis.minor_tick_line_color = None
        axis.major_label_text_color = None
    p.grid.grid_line_color = None
    show(p)


In [29]:
# External validation on the TF-IDF baseline: the encodings come from the .npy
# matrix, with rows matched to patients through the MRN file.
outer_viz(ds_snomed, enc_file, cl_lab_snomed, mrn_file)

19347
(19347, 100)
T2D 2509
MM 1935
PC 2506
PD 2509
ADHD 2457
BC 2466
CD 2478
AD 2487
19347 19347 (19347, 2)
