# Intro
* connect to a filtered sql database (from filter.ipynb)
* select desired variables (using code and VI)
* visualize
* Export to NEW format
    * train, test split
    * which is a dict of
        * times: list of N*np.array(L) where L is # of sampling events for each patient
        * events: list of N*np.array(L,K) where K is # of event types
        * states: list of N*np.array(L,S) where S is # of states
        


# Libraries

In [1]:
# activate line execution
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# general
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plotly
import plotly.express as px  # (version 4.7.0 or higher)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# import custom libraries
import sys
# sys.path.append("C:\\DATA\\Tasks\\lib\\hk")
# import hk_psql
# import hk_mimic

# folder paths (Windows-style, ending with a separator so file names can be appended directly)
# ADD_DATA is only an initial value: the cells below reassign it before reading or writing data.
ADD_LOOKUP = "C:\\DATA\\data\\raw\\mimic4\\lookup\\"
ADD_DATA = "C:\\DATA\\data\\raw\\neuralTPP\\data\\Stack Overflow\\so\\baseline\\so\\split_1\\"

In [2]:
import tqdm
import json
from sklearn.model_selection import train_test_split
import pickle




# Functions

In [3]:
def df2pkl(df, add_data, num_types, num_marks=None, dict_map_events=None, w_class=None, pos_weight=None):
    """Group events per sequence, split the sequences 60/20/20 and pickle each part.

    Args:
        df: DataFrame with the columns ``id``, ``time`` and ``event``. ``event``
            is either an integer label or, in the multilabel case, a one-hot
            vector. The frame is left unmodified.
        add_data: Output directory. ``train.pkl``, ``test.pkl`` and ``dev.pkl``
            are written into it (existing files are overwritten).
        num_types: Stored under ``'dim_process'`` in every pickle.
        num_marks: Optional; stored under ``'num_marks'`` when not ``None``.
        dict_map_events: Optional; stored under ``'dict_map_events'`` when not ``None``.
        w_class: Optional; stored under ``'w_class'`` when not ``None``.
        pos_weight: Optional; stored under ``'pos_weight'`` when not ``None``.

    Returns:
        None. The result is the three pickle files. Each file holds a dict with
        ``'dim_process'``, the keys ``'train'``/``'test'``/``'dev'`` (the file's
        own part is a list of sequences, the two others are empty dicts) and the
        optional extras. A sequence is a list of per-event dicts with the keys
        ``'time_since_start'``, ``'time_since_last_event'`` and ``'type_event'``.
    """
    import os

    # Optional metadata copied verbatim into every output file.
    optional = {
        'num_marks': num_marks,
        'dict_map_events': dict_map_events,
        'w_class': w_class,
        'pos_weight': pos_weight,
    }
    dict_add = {key: value for key, value in optional.items() if value is not None}

    # Build the per-event records column-wise. A row-wise ``apply(axis=1)`` would
    # hand each row over as a single-dtype Series, which turns integer event
    # labels into floats whenever ``time`` is a float column.
    # NOTE(review): 'time_since_last_event' receives the same value as
    # 'time_since_start', exactly as before; confirm the consumer derives the
    # real inter-event gaps itself.
    records = [
        {'time_since_start': t, 'time_since_last_event': t, 'type_event': e}
        for t, e in zip(df['time'], df['event'])
    ]

    # Work on a separate frame so the caller's ``df`` does not gain a 'dict' column.
    events = pd.DataFrame({'id': df['id'].to_numpy()})
    events['dict'] = records
    grouped = events.groupby('id')['dict'].apply(list).reset_index(name='dict_event')

    # 60% train; the remaining 40% is halved into test and dev. The split is
    # over sequences (ids), not over individual events.
    train_part, held_out = train_test_split(grouped, train_size=0.6, random_state=42)
    test_part, dev_part = train_test_split(held_out, train_size=0.5, random_state=42)

    parts = {'train': train_part, 'test': test_part, 'dev': dev_part}
    for name, part in parts.items():
        payload = {'dim_process': num_types}
        for key in parts:
            payload[key] = part['dict_event'].tolist() if key == name else {}
        payload.update(dict_add)

        with open(os.path.join(add_data, f"{name}.pkl"), 'wb') as f:
            pickle.dump(payload, f)

# SO

In [None]:


# Per-label share (in % of all label occurrences), stored per segment for the
# comparison bar chart in a later cell.
dict_seg = {'train':[],'test':[],'val':[]}

# Labels whose share is below this percentage are listed as removal candidates.
# The printed message and the filter both use this value (previously the
# message said 1% while the filter used 0.5).
RARE_LABEL_PCT = 0.5

for seg in ['train']:
    for i_split in range(1):

        ADD_DATA = f"C:\\DATA\\data\\raw\\neuralTPP\\data\\Stack Overflow\\so\\baseline\\so\\split_{i_split+1}\\"
        with open(ADD_DATA+f'{seg}.json') as f:
            data = json.load(f)

        n_seq = len(data)

        # Collect the label list of every event of every sequence.
        list_events=[]
        for i in range(n_seq):
            temp = [event['labels'] for event in data[i]]
            list_events.extend(temp)

    # NOTE(review): this runs once per segment, after the split loop, and
    # list_events is reset for every split, so only the last split is counted.
    # With range(1) that is split_1.
    all_events = np.concatenate(list_events)
    label, count = np.unique(all_events,return_counts=True)
    temp = np.round(count/count.sum()*100,2)
    dict_seg[seg]=temp

px.bar(temp)

print(f'Labels with less than {RARE_LABEL_PCT}%')
label[temp<RARE_LABEL_PCT]

In [None]:
# Label ids that are excluded from the exported Stack Overflow data.
labels2remove = [10, 14, 15, 16, 18, 19, 20, 21]

# Convert train.json of each of the five splits into the pickles written by df2pkl.
for seg in ['train']:
    for i_split in range(5):
        
        ADD_DATA = f"C:\\DATA\\data\\raw\\neuralTPP\\data\\Stack Overflow\\so\\baseline\\so\\split_{i_split+1}\\"
        print(ADD_DATA)
        with open(ADD_DATA+f'{seg}.json') as f:
            data = json.load(f)

        # Flatten all sequences into three parallel arrays: the sequence index,
        # the event time and the first label of each event.
        ids = np.concatenate( [np.ones(len(sample), dtype=int)*i for i,sample in enumerate(data)] )
        times = np.concatenate( [np.array( [ seq['time'] for seq in sample] ) for sample in data] )
        events = np.concatenate( [np.array( [ seq['labels'][0] for seq in sample] ) for sample in data] )

        df = pd.DataFrame()
        df['id']=ids
        df['time']=times/3600  # presumably seconds -> hours; TODO confirm the unit of 'time'
        df['event']=events
        

        # Drop events whose time is exactly 0.
        q = df['time']==0
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where time is 0")

        # Drop events that carry one of the ignored labels.
        q = df['event'].isin(labels2remove)
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where label should be ignored")
        
        # NOTE(review): the remaining label ids are not re-indexed, so they can be
        # larger than num_types - 1; confirm the consumer of 'dim_process' tolerates that.
        num_types = len(df['event'].unique())
        df2pkl(df, ADD_DATA,num_types)        
        


In [None]:
# Scratch cell: rebuild the flat (id, time, event) frame from whatever `data`
# currently holds (it is set by the previously executed loading cell) and display it.
n_samples = len(data)
ids = np.concatenate( [np.ones(len(sample), dtype=int)*i for i,sample in enumerate(data)] )
times = np.concatenate( [np.array( [ seq['time'] for seq in sample] ) for sample in data] )
events = np.concatenate( [np.array( [ seq['labels'][0] for seq in sample] ) for sample in data] )

df = pd.DataFrame()
df['id']=ids
df['time']=times/3600
df['event']=events

# df['time_gap']=df['time'].apply(lambda x:np.diff(x))

# df['dict'] = df.apply(lambda x:[{'time_since_start':x.time[i], 'type_event':x.event[i]}for i in range(len(x.time))], axis=1)

df
# df['dict'].iloc[0]

In [None]:


# Bar chart of the per-label percentages stored in dict_seg, one trace per segment.
# A segment that was never processed still holds its initial empty list and shows no bars.
go.Figure(data=[
    go.Bar(name='train', y=dict_seg['train']),
    go.Bar(name='test', y=dict_seg['test']),
    go.Bar(name='val', y=dict_seg['val'])

])

# Synthea Ear

In [None]:
# Label frequencies (in % of all label occurrences) for the Synthea ear-infection train set, split 1.
ADD_DATA = "C:\\DATA\\data\\raw\\neuralTPP\\data\\Synthea - Ear infection\\ear_infection\\synthea\\ear_infection\\split_1\\"
with open(ADD_DATA+'train.json') as f:
    data = json.load(f)

n_seq = len(data)
n_seq
list_events=[]

# Each sample is indexed with 'events' here to reach its list of events.
for i in range(n_seq):
    temp = [event['labels'] for event in data[i]['events']]
    list_events.extend(temp)
all_events = np.concatenate(list_events)

label, count = np.unique(all_events,return_counts=True)


temp = np.round(count/count.sum()*100,2)

px.bar(temp)
# Labels that make up less than 0.25% of all label occurrences.
label[temp<.25]

In [None]:
# Inspect the labels of the first two events of sample 2, then collect the
# labels of every event as a nested list (one inner list per sample).
data[2]['events'][0]['labels']
data[2]['events'][1]['labels']
events =( [( [ seq['labels'] for seq in sample['events']] ) for sample in data] )
events

In [None]:
# Label ids that are excluded from the exported Synthea ear-infection data.
labels2remove = [ 2, 10, 13]

# Convert train.json of each of the five splits into the pickles written by df2pkl.
for seg in ['train']:
    for i_split in range(5):
        
        ADD_DATA = f"C:\\DATA\\data\\raw\\neuralTPP\\data\\Synthea - Ear infection\\ear_infection\\synthea\\ear_infection\\split_{i_split+1}\\"
        print(ADD_DATA)
        with open(ADD_DATA+f'{seg}.json') as f:
            data = json.load(f)

        # Flatten all sequences into parallel arrays: sequence index, event time, event labels.
        ids = np.concatenate( [np.ones(len(sample['events']), dtype=int)*i for i,sample in enumerate(data)] )
        times = np.concatenate( [np.array( [ seq['time'] for seq in sample['events']] ) for sample in data] )
        # NOTE(review): the whole 'labels' entry is kept per event (not its first element) and the
        # per-sample arrays are joined along axis=1. That requires 2-D arrays with equal first
        # dimension, i.e. the same number of events in every sample, and does not yield one row
        # per event; confirm this cell runs as intended.
        events = np.concatenate( [np.array( [ seq['labels'] for seq in sample['events']] ) for sample in data], axis=1 )

        df = pd.DataFrame()
        df['id']=ids
        df['time']=times/3600  # presumably seconds -> hours; TODO confirm the unit of 'time'
        df['event']=events
        

        # Drop events whose time is exactly 0.
        q = df['time']==0
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where time is 0")

        # Drop events that carry one of the ignored labels.
        q = df['event'].isin(labels2remove)
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where label should be ignored")
        
        # NOTE(review): the remaining label ids are not re-indexed, so they can be
        # larger than num_types - 1; confirm the consumer of 'dim_process' tolerates that.
        num_types = len(df['event'].unique())
        df2pkl(df, ADD_DATA,num_types)        
        


# Synthea Full

In [16]:

# Sequence-length histogram and label frequencies for the Synthea full train set (split0).
# The first ADD_DATA value is overwritten immediately; only the /scratch path is used.
ADD_DATA = "C:\\DATA\\data\\processed\\synthea_full\\split0\\"
ADD_DATA = "/scratch/hokarami/new/synthea_full/split0/"
with open(ADD_DATA+'train.json') as f:
    data = json.load(f)

n_samples = len(data)
n_samples
# Number of events per sample.
l_seqs = [len(sample['events']) for sample in data]


px.histogram(l_seqs, title='histogram of sequence length')

list_events=[]

# Collect the label list of every event of every sample.
for i in range(n_samples):
    temp = [event['labels'] for event in data[i]['events']]
    list_events.extend(temp)
all_events = np.concatenate(list_events)

label, count = np.unique(all_events,return_counts=True)


# Share of each label in % of all label occurrences, rounded to 2 decimals.
temp = np.round(count/count.sum()*100,2)

go.Figure(go.Bar(x=label,y=temp))

print('biggest values')

# The 50 most frequent labels and their shares, in ascending order of share.
label[np.argsort(temp)][-50:]
temp[np.argsort(temp)][-50:]

# label[temp<.025]

7016

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

biggest values


array([144, 184, 233,  67,  96,   4, 118,  42, 295,   3, 153, 141, 132,
        78,  89,  59, 104, 155, 145,  63, 289, 108, 277, 124, 116,  81,
       189, 321, 112, 196, 110, 264,  77, 152, 165, 280, 337, 126, 107,
       149, 164, 202,  10, 307, 327, 197,  95, 296, 183, 195])

array([ 0.36,  0.37,  0.38,  0.39,  0.42,  0.43,  0.44,  0.46,  0.48,
        0.51,  0.53,  0.55,  0.55,  0.57,  0.58,  0.6 ,  0.64,  0.68,
        0.72,  0.72,  0.75,  0.76,  0.79,  0.94,  0.95,  1.02,  1.05,
        1.16,  1.39,  1.47,  1.62,  1.8 ,  1.89,  1.89,  2.11,  2.15,
        2.15,  2.19,  2.28,  2.28,  2.36,  2.54,  2.67,  3.01,  3.1 ,
        3.13,  3.63,  4.3 ,  9.61, 11.91])

In [None]:
# Inspect sample 2: number of events and the labels of its first two events,
# then collect the labels of every event as a nested list (one inner list per sample).
len(data[2]['events'])
data[2]['events'][0]['labels']
data[2]['events'][1]['labels']
events =( [( [ seq['labels'] for seq in sample['events']] ) for sample in data] )
events

In [None]:
# Inspect the fields of a single event: event index `ii` of sample index `isub`.
isub=1
ii=7

data[isub].keys()
data[isub]['events'][ii].keys()
data[isub]['events'][ii]['time']
data[isub]['events'][ii]['labels']
data[isub]['events'][ii]['encounter']
data[isub]['events'][ii]['conditions']
data[isub]['events'][ii]['medications']

# 'deathdate' entry from the metadata of every sample.
[ x['metadata']['deathdate'] for x in data ]

In [17]:
# Label ids to keep in the Synthea full export.
# This first 20-label list has no effect: it is overwritten by the 50-label list right below.
labels2keep = [110, 264,  77, 152, 165, 280, 337, 126, 107, 149, 164, 202,  10,
                  307, 327, 197,  95, 296, 183, 195]


# Effective list: the same 50 labels that the statistics cell output shows as 'biggest values'.
labels2keep = [144, 184, 233,  67,  96,   4, 118,  42, 295,   3, 153, 141, 132,
        78,  89,  59, 104, 155, 145,  63, 289, 108, 277, 124, 116,  81,
       189, 321, 112, 196, 110, 264,  77, 152, 165, 280, 337, 126, 107,
       149, 164, 202,  10, 307, 327, 197,  95, 296, 183, 195]


def keepLabel(x, labels=None):
    """Return the entries of ``x`` that belong to the labels to keep.

    Args:
        x: Iterable of integer labels.
        labels: Optional collection of labels to keep. When ``None`` (the
            default) the module-level ``labels2keep`` is used.

    Returns:
        list: The kept labels, with their order and duplicates preserved.
    """
    # A set gives constant-time membership tests instead of scanning the list per label.
    allowed = set(labels2keep if labels is None else labels)
    return [i for i in x if i in allowed]

def onehot(x,num_types):
    """Return an integer vector of length ``num_types`` with ones at the positions in ``x``.

    ``x`` is a list (or integer array) of positions; repeated positions are
    simply set to 1 once.
    """
    encoded = np.zeros(num_types, dtype=int)
    encoded[x] = 1
    return encoded



# Root folder that holds one sub-folder per split (split0, split1, ...).
path_data_raw = "/scratch/hokarami/new/synthea_full/"



# Multilabel export: events of the same sample that share a time stamp are merged
# into one one-hot vector, class weights are computed, and df2pkl writes the pickles.
# NOTE(review): df2pkl always writes train.pkl/test.pkl/dev.pkl into ADD_DATA, and ADD_DATA
# does not depend on `seg`. Each segment therefore overwrites the files of the previous one,
# and what remains on disk comes from 'val'. Confirm this is intended.
for seg in ['train','test','val']:
    for i_split in range(1):
        
        # The Windows path is overwritten right away; only the path_data_raw based one is used.
        ADD_DATA = f"C:\\DATA\\data\\raw\\neuralTPP\\data\\Synthea - Full\\synthea\\all\\split_{i_split+1}\\"
        ADD_DATA = path_data_raw+f'split{i_split}/'
        print(path_data_raw+f'split{i_split}/')
        with open(ADD_DATA+f'{seg}.json') as f:
            data = json.load(f)

        # ids = np.concatenate( [np.ones(len(sample['events']), dtype=int)*i for i,sample in enumerate(data)] )
        # times = [[ seq['time'] for seq in sample['events']]  for sample in data] # *np.ones_like(seq['labels'])
        # events =[ [np.array( keepLabel(seq['labels'])) for seq in sample['events']] for sample in data]

        # One entry per kept label: the label, the time of its event and the sample index.
        times = []
        events = []
        ids = []
        for i, sample in enumerate(data):
            for seq in sample['events']:
                temp = np.array( keepLabel(seq['labels']))
                events.append(temp)
                times.append(seq['time']*np.ones_like(temp, dtype=int))
                ids.append(np.ones_like(temp)*i)
        # some could be empty

        

        df = pd.DataFrame()
        df['id']=np.concatenate(ids).astype(int)
        df['time']=np.concatenate(times)/3600  # presumably seconds -> hours; TODO confirm the unit of 'time'
        df['event']=np.concatenate(events).astype(int)
        

        # Drop entries whose time is exactly 0.
        q = df['time']==0
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where time is 0")

        # q = df['event'].isin(labels2remove)
        # df = df.drop(df[q].index)
        # print(f"{q.sum()} rows dropped where label should be ignored")
        
        num_types = len(df['event'].unique())
        num_marks = num_types
        
        # Re-index the labels to 0..num_types-1 in order of first appearance.
        # NOTE(review): the mapping is rebuilt from each segment's own data, so the same
        # label can get a different index in train, test and val.
        dict_map_events = {k:i for i,k in enumerate(df['event'].unique())}
        df['event'] = df['event'].map(dict_map_events)

        # Merge all labels that share (id, time) into one one-hot vector of length num_types.
        df = df.groupby(['id','time'])['event'].apply(lambda x:np.array(x.tolist())).reset_index()
        df['event'] = df['event'].apply(lambda x: onehot(x,num_types) )


        # Matrix with one row per (id, time) event and one column per label.
        temp = np.stack(df['event'].values)

        # Class weights: inverse label counts, normalised to sum to 1.
        w_class = temp.sum(0) # [K]
        w_class = 1 / w_class
        w_class = w_class/w_class.sum()

        # Per-label ratio of negative to positive events, capped at 50.
        n_pos = temp.sum(0) # [K]
        n_neg = temp.shape[0] - temp.sum(0) # [K]
        pos_weight = n_neg/n_pos
        pos_weight[pos_weight>50]=50
        
        print(f"w_class is {w_class}")
        print(f"pos_weight is {pos_weight}")

        df2pkl(df, ADD_DATA,num_types, num_marks=num_marks, dict_map_events=dict_map_events, w_class=w_class, pos_weight=pos_weight)        
        


/scratch/hokarami/new/synthea_full/split0/
5 rows dropped where time is 0
w_class is [0.00397384 0.00143509 0.01680849 0.01477256 0.00567699 0.00749252
 0.00809188 0.00671235 0.04678874 0.02148781 0.00792639 0.00792639
 0.02983976 0.00550103 0.01800322 0.00177783 0.00949937 0.01053455
 0.00545277 0.01174523 0.02385853 0.02525398 0.02385853 0.04079551
 0.0393363  0.03329082 0.02238501 0.02291115 0.0310345  0.04716607
 0.02975694 0.04417062 0.03533893 0.04467675 0.03691022 0.03844309
 0.00474618 0.00787111 0.00733366 0.0310945  0.01240542 0.00756076
 0.00921764 0.00921764 0.01626457 0.03282373 0.02701428 0.02900564
 0.01837604 0.00643506]
pos_weight is [ 7.84181105  2.19308714 36.39895493 31.86900115 11.63132583 15.6708787
 17.00446513 13.93499922 50.         46.81045424 16.63623483 16.63623483
 50.         11.23980333 39.05722681  2.95568851 20.1361388  22.43941379
 11.13243209 25.1331812  50.         50.         50.         50.
 50.         50.         48.80671538 49.97738604 50.      

In [None]:
# Display the first four events of sample 0.
data[0]['events'][0]
data[0]['events'][1]
data[0]['events'][2]
data[0]['events'][3]

In [10]:
# Sanity check on synthetic data: per-label ROC AUC (average=None) of a
# one-classifier-per-label logistic regression, scored on its own training set.
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
X, y = make_multilabel_classification(random_state=0)
inner_clf = LogisticRegression(solver="liblinear", random_state=0)
clf = MultiOutputClassifier(inner_clf).fit(X, y)
# Take column 1 of each per-label probability array and arrange the result as (samples, labels).
y_score = np.transpose([y_pred[:, 1] for y_pred in clf.predict_proba(X)])
roc_auc_score(y, y_score, average=None)
y.shape
y_score.shape

y[:5]

array([0.82664884, 0.86034414, 0.94181818, 0.8502652 , 0.94809095])

(100, 5)

(100, 5)

array([[0, 0, 1, 1, 1],
       [0, 0, 1, 0, 0],
       [1, 1, 0, 1, 0],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 0, 0]])

# MIMIC II

In [None]:
# Label frequencies (in % of all label occurrences) for the MIMIC-II train set, split 1.
ADD_DATA = "C:\\DATA\\data\\raw\\neuralTPP\\data\\Mimic-II\\mimic2\\baseline\\mimic2\\split_1\\"
with open(ADD_DATA+'train.json') as f:
    data = json.load(f)

n_seq = len(data)
n_seq
list_events=[]

# Collect the label list of every event of every sequence.
for i in range(n_seq):
    temp = [event['labels'] for event in data[i]]
    list_events.extend(temp)
all_events = np.concatenate(list_events)

label, count = np.unique(all_events,return_counts=True)

# label
# count
temp = np.round(count/count.sum()*100,2)

px.bar(temp)
# Labels that make up less than 0.1% of all label occurrences.
label[temp<.1]

# 

In [None]:
# Label frequencies (in % of all label occurrences) for the Retweets baseline train set, split 1.
ADD_DATA = "C:\\DATA\\data\\raw\\neuralTPP\\data\\Retweets\\retweets\\baseline\\retweets\\split_1\\"
with open(ADD_DATA+'train.json') as f:
    data = json.load(f)

n_seq = len(data)
n_seq
list_events=[]

# Collect the label list of every event of every sequence.
for i in range(n_seq):
    temp = [event['labels'] for event in data[i]]
    list_events.extend(temp)
all_events = np.concatenate(list_events)

label, count = np.unique(all_events,return_counts=True)

label
count
temp = np.round(count/count.sum()*100,2)
temp
px.bar(temp)

# ReTweet

In [None]:

# Sequence-length histogram and label frequencies for the Retweets train set (split0).
# The first ADD_DATA value (a synthea_full path) is overwritten immediately; only the
# /scratch retweets path is used.
ADD_DATA = "C:\\DATA\\data\\processed\\synthea_full\\split0\\"
ADD_DATA = "/scratch/hokarami/new/retweets/split0/"
with open(ADD_DATA+'train.json') as f:
    data = json.load(f)

n_samples = len(data)
n_samples
# Number of events per sequence.
l_seqs = [len(sample) for sample in data]


px.histogram(l_seqs, title='histogram of sequence length')

list_events=[]
list_times=[]
for i in range(n_samples):
    temp = [event['labels'] for event in data[i]]
    list_events.extend(temp)

    # Event times are collected as well but are not used further in this cell.
    temp = [event['time'] for event in data[i]]
    list_times.extend(temp)
all_events = np.concatenate(list_events)

label, count = np.unique(all_events,return_counts=True)


# Share of each label in % of all label occurrences, rounded to 2 decimals.
temp = np.round(count/count.sum()*100,2)

go.Figure(go.Bar(x=label,y=temp))

print('biggest values')

# Up to 50 of the most frequent labels and their shares, in ascending order of share.
label[np.argsort(temp)][-50:]
temp[np.argsort(temp)][-50:]

# label[temp<.025]

In [None]:
# No labels are excluded for Retweets.
labels2remove = []
# Root folder that holds one sub-folder per split (split0 ... split4).
path_data_raw = "/scratch/hokarami/new/retweets/"



# Convert the JSON files of each of the five splits into the pickles written by df2pkl.
# NOTE(review): df2pkl always writes train.pkl/test.pkl/dev.pkl into ADD_DATA, and ADD_DATA
# does not depend on `seg`. Each segment therefore overwrites the files of the previous one,
# and what remains on disk comes from 'val'. Confirm this is intended.
for seg in ['train','test','val']:
    for i_split in range(5):
        
        # The Stack Overflow path is overwritten right away; only the path_data_raw based one is used.
        ADD_DATA = f"C:\\DATA\\data\\raw\\neuralTPP\\data\\Stack Overflow\\so\\baseline\\so\\split_{i_split+1}\\"
        ADD_DATA = path_data_raw+f'split{i_split}/'
        print(ADD_DATA)
        with open(ADD_DATA+f'{seg}.json') as f:
            data = json.load(f)

        # Flatten all sequences into three parallel arrays: the sequence index,
        # the event time and the first label of each event.
        ids = np.concatenate( [np.ones(len(sample), dtype=int)*i for i,sample in enumerate(data)] )
        times = np.concatenate( [np.array( [ seq['time'] for seq in sample] ) for sample in data] )
        events = np.concatenate( [np.array( [ seq['labels'][0] for seq in sample] ) for sample in data] )

        df = pd.DataFrame()
        df['id']=ids
        df['time']=times/3600  # presumably seconds -> hours; TODO confirm the unit of 'time'
        df['event']=events
        

        # Drop events whose time is exactly 0.
        q = df['time']==0
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where time is 0")

        # With an empty labels2remove this drops nothing.
        q = df['event'].isin(labels2remove)
        df = df.drop(df[q].index)
        print(f"{q.sum()} rows dropped where label should be ignored")
        
        num_types = len(df['event'].unique())
        df2pkl(df, ADD_DATA,num_types)        
        
