In [1]:
import pandas as pd
import numpy as np
import collections 
import itertools
import pickle
import math
import networkx as nx
from pyvis.network import Network
import ast
import sys
sys.path.append('../')
from modules import utils
import matplotlib.pyplot as plt
%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


In [2]:
def streamline_frequency_dict(frequency_dict):
    """Merge trajectory counts that visit the same set of nodes, ignoring order.

    Parameters
    ----------
    frequency_dict : dict
        Maps the string form of a trajectory list (parsed here with
        ``ast.literal_eval``) to the number of times it occurred.

    Returns
    -------
    dict
        Maps ``str(node_list)`` to the summed count of every trajectory made
        of exactly those nodes. Inside each ``node_list`` the nodes are
        ordered from most to least common (by the number of distinct node
        sets containing them), with the three outcome labels always last.
        Entries keep the order in which each node set was first seen.
    """
    outcome_labels = ['Lupus', 'No lupus', 'Inconclusive diagnosis']

    # Aggregate per distinct node set. frozenset keys give constant-time
    # lookup instead of rescanning a list of sets for every trajectory.
    counts_by_set = {}
    for path_str, count in frequency_dict.items():
        node_set = frozenset(ast.literal_eval(path_str))
        counts_by_set[node_set] = counts_by_set.get(node_set, 0) + count

    # How many distinct node sets contain each node.
    node_counts = collections.Counter(
        node for node_set in counts_by_set for node in node_set
    )
    # Ties are broken by name so the result does not depend on set iteration
    # order (which changes between interpreter runs for strings).
    ranked_nodes = sorted(node_counts, key=lambda node: (-node_counts[node], str(node)))
    ordering = [node for node in ranked_nodes if node not in outcome_labels] + outcome_labels
    rank = {node: position for position, node in enumerate(ordering)}

    return {
        str(sorted(node_set, key=rank.__getitem__)): count
        for node_set, count in counts_by_set.items()
    }

In [3]:
def generate_tuple_dict(df):
    """Count how often each consecutive step occurs across all trajectories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``trajectory`` column whose values are the string form of
        a list of node names (parsed with ``ast.literal_eval``).

    Returns
    -------
    dict
        Maps ``(source, target)`` tuples of adjacent nodes to the number of
        trajectory rows in which that step appears (a step repeated inside a
        single trajectory is counted once per repetition).
    """
    # Count identical trajectory strings first so each one is parsed once.
    trajectory_counts = collections.Counter(list(df.trajectory))

    overall_tup_dict = {}
    for trajectory_str, occurrences in trajectory_counts.items():
        nodes = ast.literal_eval(trajectory_str)
        # Pair every node with its successor to get the directed steps.
        for edge in zip(nodes, nodes[1:]):
            overall_tup_dict[edge] = overall_tup_dict.get(edge, 0) + occurrences
    return overall_tup_dict

In [4]:
def create_sankey_df(df):
    """Turn the step counts of ``df``'s trajectories into an edge table.

    Returns a DataFrame with one row per distinct (source, target) step and
    the columns ``source``, ``target``, ``value`` (step count) and
    ``link_type`` ('terminal' when the target is one of the outcome labels,
    otherwise 'non_terminal').
    """
    edge_counts = generate_tuple_dict(df)
    outcome_labels = ['No lupus', 'Lupus', 'Inconclusive diagnosis']

    def classify(target_node):
        if target_node in outcome_labels:
            return 'terminal'
        return 'non_terminal'

    sankey_df = pd.DataFrame()
    sankey_df['source'] = [edge[0] for edge in edge_counts]
    sankey_df['target'] = [edge[1] for edge in edge_counts]
    sankey_df['value'] = [count for count in edge_counts.values()]
    sankey_df['link_type'] = sankey_df['target'].apply(classify)
    return sankey_df

In [5]:
def get_threshold_value(row):
    """Return the source-feature value(s) seen on episodes that take one edge.

    Reads the notebook globals ``test_df`` (episode results with a
    ``trajectory`` string column) and ``testing_df`` (feature table sharing
    ``test_df``'s index labels).

    Parameters
    ----------
    row : object with ``source`` and ``target`` attributes
        One row of the edge table; ``source`` must also be a column of
        ``testing_df``.

    Returns
    -------
    int or str
        The single distinct value of ``testing_df[row.source]`` over the
        matching episodes as an ``int``, or the ``str`` of the list of
        distinct values when there is not exactly one (``'[]'`` if no
        trajectory contains the edge).
    """
    # Trajectories are stored as the string form of a list, so a step from
    # source to target shows up as this exact quoted, comma-separated fragment.
    substring = f"'{row.source}', '{row.target}'"

    # Vectorised literal match; replaces growing a frame row by row with
    # DataFrame.append, which was removed in pandas 2.0.
    on_edge = test_df.trajectory.str.contains(substring, regex=False, na=False)
    matching_index = test_df.loc[on_edge].index

    threshold_values = testing_df.loc[matching_index, row.source].unique().tolist()
    if len(threshold_values) == 1:
        return int(threshold_values[0])
    return str(threshold_values)

In [6]:
def draw_pyvis_network(pathways_df):
    """Build a directed pyvis network from an edge table.

    ``pathways_df`` needs ``source``, ``target`` and ``value`` columns; a
    constant ``type`` column set to 'directed' is added to it in place.
    Edges whose value exceeds the global ``threshold`` are red, all others
    blue.

    NOTE(review): relies on the globals ``start_node``, ``non_terminal_nodes``,
    ``terminal_nodes`` and ``threshold``; only ``threshold`` is assigned in
    the visible part of this notebook - confirm the others are defined before
    this is called.
    """
    pathways_df['type'] = 'directed'

    net = Network(notebook=True, height='750px', width='100%', directed=True, cdn_resources='in_line')
    net.add_node(start_node, color='purple', size=20)
    net.add_nodes(
        non_terminal_nodes,
        size=[15] * len(non_terminal_nodes),
        color=['blue'] * len(non_terminal_nodes),
    )
    net.add_nodes(
        terminal_nodes,
        color=['green'] * len(terminal_nodes),
        size=[20] * len(terminal_nodes),
    )

    edges = zip(pathways_df.source, pathways_df.target, pathways_df.value)
    for origin, destination, weight in edges:
        edge_colour = 'red' if weight > threshold else 'blue'
        net.add_edge(origin, destination, value=weight, color=edge_colour)
    return net

In [7]:
def draw_labelled_pyvis_network(pathways_df, pathway_type='network'):
    """Build a directed pyvis network whose heavy edges carry threshold labels.

    Parameters
    ----------
    pathways_df : pandas.DataFrame
        Edge table with ``source``, ``target`` and ``value`` columns. It is
        modified in place: a constant ``type`` column is added and
        ``edge_threshold`` is (re)computed when required (see below).
    pathway_type : str, default 'network'
        'network' recomputes ``edge_threshold`` for every row with
        ``get_threshold_value``. Any other value (e.g. 'trajectory') keeps an
        ``edge_threshold`` column the caller already supplied and only
        computes it when the column is missing.

    Returns
    -------
    pyvis.network.Network
        Edges whose value exceeds the global ``threshold`` are red and
        labelled with their edge threshold; the rest are blue and unlabelled.

    NOTE(review): relies on the globals ``start_node``, ``non_terminal_nodes``,
    ``terminal_nodes`` and ``threshold``; only ``threshold`` is assigned in
    the visible part of this notebook - confirm the others are defined.
    """
    pathways_df['type'] = 'directed'

    # Previously pathway_type was ignored and the column always overwritten,
    # discarding thresholds a caller had computed for a single trajectory.
    if pathway_type == 'network' or 'edge_threshold' not in pathways_df.columns:
        pathways_df['edge_threshold'] = pathways_df.apply(get_threshold_value, axis=1)

    net = Network(notebook=True, height='750px', width='100%', directed=True, cdn_resources='in_line')
    net.add_node(start_node, color='purple', size=20)
    net.add_nodes(non_terminal_nodes, size=[15]*len(non_terminal_nodes), color=['blue']*len(non_terminal_nodes))
    net.add_nodes(terminal_nodes, color=['green']*len(terminal_nodes), size=[20]*len(terminal_nodes))

    for src, target, value, edge_thresh in zip(pathways_df.source, pathways_df.target, pathways_df.value, pathways_df.edge_threshold):
        if value > threshold:
            # A str threshold is already the printable list of several values;
            # a single numeric threshold is shown as a whole number.
            label = edge_thresh if isinstance(edge_thresh, str) else str(int(edge_thresh))
            net.add_edge(src, target, value=value, color='red', label=label)
        else:
            net.add_edge(src, target, value=value, color='blue')
    return net

In [8]:
def get_threshold_value_path(df, source):
    """Summarise the distinct values of column ``source`` (e.g. 'ana') in ``df``.

    Returns the value as an ``int`` when the column holds exactly one distinct
    value; otherwise returns the ``str`` of the list of distinct values.
    """
    distinct_values = df[source].unique().tolist()
    if len(distinct_values) != 1:
        return str(distinct_values)
    only_value = distinct_values[0]
    return int(only_value)

In [9]:
def create_networkx_graph(test_df, testing_df, save=False, filename=None):
    """Build a networkx graph of the trajectories in ``test_df``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Episode results with a ``trajectory`` string column.
    testing_df : pandas.DataFrame
        Feature table containing at least the index labels of ``test_df`` and
        one column per source node name.
    save : bool, default False
        When true, pickle the graph to ``f'{filename}.pickle'``.
    filename : str, optional
        Path stem for the pickle; required when ``save`` is true.

    Returns
    -------
    (networkx graph, pandas.DataFrame)
        The graph built from the edge table, and the edge table itself with
        an added ``edge_threshold`` column.

    Raises
    ------
    ValueError
        If ``save`` is true and ``filename`` is None.

    NOTE(review): no ``create_using`` is passed to ``from_pandas_edgelist``,
    so networkx's default graph type is used - confirm that is intended for
    directed trajectories.
    """
    if save and filename is None:
        # Previously this silently wrote a file literally named 'None.pickle'.
        raise ValueError("filename is required when save=True")

    def _edge_threshold(row):
        # Same rule as get_threshold_value, but evaluated on the frames passed
        # to this function rather than the notebook globals, so graphs built
        # from a subset of episodes only reflect that subset.
        fragment = f"'{row.source}', '{row.target}'"
        on_edge = test_df.trajectory.str.contains(fragment, regex=False, na=False)
        matching_index = test_df.loc[on_edge].index
        values = testing_df.loc[matching_index, row.source].unique().tolist()
        if len(values) == 1:
            return int(values[0])
        return str(values)

    sankey_df = create_sankey_df(test_df)
    sankey_df['edge_threshold'] = sankey_df.apply(_edge_threshold, axis=1)
    test_df_graph = nx.from_pandas_edgelist(sankey_df, 'source', 'target', 'edge_threshold')

    if save:
        # Context manager guarantees the pickle file is flushed and closed.
        with open(f'{filename}.pickle', 'wb') as pickle_file:
            pickle.dump(test_df_graph, pickle_file)
    return test_df_graph, sankey_df

In [10]:
def draw_one_trajectory(pred_df, trajectory):
    """Draw the labelled pyvis network for a single trajectory.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Episode results (e.g. ``pred_no_lupus``) with a ``trajectory`` column.
    trajectory : str
        The trajectory string to select; rows are matched by equality.

    Returns
    -------
    pyvis.network.Network
        The network returned by ``draw_labelled_pyvis_network``.

    Reads the notebook global ``testing_df`` to look up feature values for
    the selected episodes.
    """
    trajectory_df = pred_df[pred_df.trajectory == trajectory]
    trajectory_testing_df = testing_df.loc[trajectory_df.index]
    trajectory_pathways_df = create_sankey_df(trajectory_df)

    # get_threshold_value takes a single edge row; the (frame, column) form
    # used here is get_threshold_value_path's signature.
    trajectory_pathways_df['edge_threshold'] = [
        get_threshold_value_path(trajectory_testing_df, source)
        for source in trajectory_pathways_df.source
    ]

    # draw_pyvis_network accepts only the edge table; the labelled drawer is
    # the one that takes a pathway type and renders edge_threshold labels.
    trajectory_net = draw_labelled_pyvis_network(trajectory_pathways_df, 'trajectory')
    return trajectory_net

In [11]:
def create_networkx_graph_path(pathway, test_df, save=False, filename=None):
    """Build (and optionally pickle) the networkx graph of one trajectory.

    Parameters
    ----------
    pathway : str
        Trajectory string; rows of ``test_df`` are matched by equality.
    test_df : pandas.DataFrame
        Episode results with a ``trajectory`` column.
    save : bool, default False
        When true, pickle the pathway graph to ``f'{filename}.pickle'``.
    filename : str, optional
        Path stem for the pickle; required when ``save`` is true.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``save`` is true and ``filename`` is None.

    Reads the notebook global ``testing_df`` to look up feature values for
    the selected episodes.
    """
    if save and filename is None:
        # Previously this silently wrote a file literally named 'None.pickle'.
        raise ValueError("filename is required when save=True")

    pathway_df = test_df[test_df.trajectory == pathway]
    pathway_testing_df = testing_df.loc[pathway_df.index]
    sankey_df = create_sankey_df(pathway_df)
    sankey_df['edge_threshold'] = [
        get_threshold_value_path(pathway_testing_df, source)
        for source in sankey_df.source
    ]
    pathway_graph = nx.from_pandas_edgelist(sankey_df, 'source', 'target', 'edge_threshold')

    if save:
        # Persist the graph built above. The earlier code dumped the global
        # test_df_graph here, so every per-pathway pickle held the full graph.
        with open(f'{filename}.pickle', 'wb') as pickle_file:
            pickle.dump(pathway_graph, pickle_file)
    return None

#### Preliminaries

In [12]:
# Run configuration: selects which results CSV is loaded below and where the
# graph pickles are written.
model_name = 'dueling_dqn_per'
seed = 126
# NOTE(review): 10e7 evaluates to 100,000,000 (1e8), not 1e7 - confirm this
# matches the step count in the results file name.
steps = int(10e7)
folder_name = f'../graphs/risk_factor/seed_{seed}'

In [13]:
# Feature table for the test set. Later cells select its rows with the index
# labels of test_df subsets, so both frames must share the same row index.
testing_df = pd.read_csv('../data/test_set_constant.csv')
testing_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
2,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,1,0,0,0,1
3,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,1


In [14]:
# Per-episode results for the configured model/seed/steps. The trajectory
# column holds the string form of a list of node names.
test_df = pd.read_csv(f'../test_dfs/{model_name}_risk_factor_{seed}_{steps}.csv')
test_df.head()

Unnamed: 0,index,episode_length,reward,y_pred,y_actual,trajectory,terminated,is_success
0,0.0,10.0,0.574529,1.0,1.0,"['ana', 'pericardial_effusion', 'joint_involve...",0.0,1.0
1,1.0,17.0,0.306122,1.0,1.0,"['ana', 'pericardial_effusion', 'cutaneous_lup...",0.0,1.0
2,2.0,17.0,0.296348,1.0,1.0,"['ana', 'pericardial_effusion', 'joint_involve...",0.0,1.0
3,3.0,17.0,0.304535,1.0,1.0,"['ana', 'pericardial_effusion', 'cutaneous_lup...",0.0,1.0
4,4.0,15.0,0.390528,1.0,1.0,"['ana', 'pericardial_effusion', 'joint_involve...",0.0,1.0


In [15]:
# Edges traversed by more than 10% of test episodes are drawn in red by the
# pyvis drawing functions.
threshold = 0.1*len(test_df)

In [16]:
# First element of utils.test's result for actual vs. predicted labels.
# NOTE(review): presumably accuracy in percent - confirm against modules/utils.
utils.test(test_df.y_actual, test_df.y_pred)[0]

97.74285714285715

#### Creating graph for entire test df

In [17]:
# Graph over every test episode; save=True also pickles it to
# '<test_filename>.pickle'.
test_filename = f'{folder_name}/{model_name}_test_df'
test_df_graph, test_sankey_df = create_networkx_graph(test_df, testing_df, True, test_filename)

#### Creating graph for no lupus test df

In [18]:
# Episodes predicted as class 0, plus the matching feature rows (same index
# labels); the displayed lengths should be equal.
pred_no_lupus = test_df[test_df.y_pred==0]
no_lupus_testing_df = testing_df.loc[pred_no_lupus.index]
len(pred_no_lupus), len(no_lupus_testing_df)

(7487, 7487)

In [19]:
# Graph restricted to the episodes predicted as no lupus; pickled alongside.
no_lupus_filename = f'{folder_name}/{model_name}_no_lupus'
no_lupus_df_graph, no_lupus_sankey_df = create_networkx_graph(pred_no_lupus, no_lupus_testing_df, True, no_lupus_filename)

#### Creating graph for lupus test df

In [20]:
# Episodes predicted as class 1, plus the matching feature rows (same index
# labels); the displayed lengths should be equal.
pred_lupus = test_df[test_df.y_pred==1]
lupus_testing_df = testing_df.loc[pred_lupus.index]
len(pred_lupus), len(lupus_testing_df)

(6513, 6513)

In [21]:
# Graph restricted to the episodes predicted as lupus; pickled alongside.
lupus_filename = f'{folder_name}/{model_name}_lupus'
lupus_df_graph, lupus_sankey_df = create_networkx_graph(pred_lupus, lupus_testing_df, True, lupus_filename)

#### Shortest paths

In [22]:
# Most frequent trajectory among the lupus-predicted episodes that have the
# minimum episode_length; its graph is pickled.
short_lupus_path = pred_lupus[pred_lupus.episode_length==pred_lupus.episode_length.min()].trajectory.value_counts().index[0]
short_lupus_filename = f'{folder_name}/{model_name}_lupus_shortest'
create_networkx_graph_path(short_lupus_path, pred_lupus, True, short_lupus_filename)

In [23]:
# Most frequent trajectory among the no-lupus-predicted episodes that have
# the minimum episode_length; its graph is pickled.
short_no_lupus_path = pred_no_lupus[pred_no_lupus.episode_length == pred_no_lupus.episode_length.min()].trajectory.value_counts().index[0]
short_no_lupus_filename = f'{folder_name}/{model_name}_no_lupus_shortest'
create_networkx_graph_path(short_no_lupus_path, pred_no_lupus, True, short_no_lupus_filename)

#### Longest paths 

In [24]:
# Most frequent trajectory among the lupus-predicted episodes that have the
# maximum episode_length; its graph is pickled.
long_lupus_path = pred_lupus[pred_lupus.episode_length==pred_lupus.episode_length.max()].trajectory.value_counts().index[0]
long_lupus_filename = f'{folder_name}/{model_name}_lupus_longest'
create_networkx_graph_path(long_lupus_path, pred_lupus, True, long_lupus_filename)

In [25]:
# Most frequent trajectory among the no-lupus-predicted episodes that have
# the maximum episode_length; its graph is pickled.
long_no_lupus_path = pred_no_lupus[pred_no_lupus.episode_length == pred_no_lupus.episode_length.max()].trajectory.value_counts().index[0]
long_no_lupus_filename = f'{folder_name}/{model_name}_no_lupus_longest'
create_networkx_graph_path(long_no_lupus_path, pred_no_lupus, True, long_no_lupus_filename)

#### Commonest paths

In [26]:
# Most frequent trajectory over all lupus-predicted episodes; its graph is
# pickled.
common_lupus_path = pred_lupus.trajectory.value_counts().index[0]
common_lupus_filename = f'{folder_name}/{model_name}_lupus_commonest'
create_networkx_graph_path(common_lupus_path, pred_lupus, True, common_lupus_filename)

In [27]:
# Most frequent trajectory over all no-lupus-predicted episodes; its graph is
# pickled. Only common_no_lupus_filename is bound here: the earlier chained
# assignment also overwrote short_no_lupus_filename with this path.
common_no_lupus_path = pred_no_lupus.trajectory.value_counts().index[0]
common_no_lupus_filename = f'{folder_name}/{model_name}_no_lupus_commonest'
create_networkx_graph_path(common_no_lupus_path, pred_no_lupus, True, common_no_lupus_filename)