In [1]:
import pandas as pd
import numpy as np
import ast
import os
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Fix the seeds of the random sources used in this notebook; SEED is also
# passed as random_state to train_test_split below so the split is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here probably only affects child processes — confirm intent.
os.environ['PYTHONHASHSEED']=str(SEED)

In [3]:
def get_idx_df(target_df, idx_list):
    """Return the rows of ``target_df`` found at the positions in ``idx_list``.

    Selection is positional (``.iloc``), not label-based. The callers in this
    notebook pass index labels taken from other frames; those only line up
    with positions because ``test_df`` and ``test_set`` both carry a default
    0..n-1 index.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Frame to select rows from.
    idx_list : list-like of int
        Integer row positions to keep, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index labels.
    """
    return target_df.iloc[idx_list]

In [14]:
def get_feat_indices(feature, episodes_df=None, threshold=2):
    """Return the index labels of episodes whose trajectory repeats ``feature``.

    Each value in the ``trajectory`` column is the string form of a Python
    list of action names; it is parsed with ``ast.literal_eval`` and the
    occurrences of ``feature`` are counted.

    Parameters
    ----------
    feature : str
        Action / feature name to count inside each trajectory.
    episodes_df : pandas.DataFrame, optional
        Frame with a ``trajectory`` column. When omitted, the notebook-level
        ``terminated_df`` is used, which keeps existing one-argument calls
        working unchanged.
    threshold : int, default 2
        An episode is selected when ``feature`` occurs strictly more than
        ``threshold`` times in its trajectory.

    Returns
    -------
    pandas.Index
        Index labels (taken from ``episodes_df``) of the selected episodes,
        in their original order. Empty when nothing matches.
    """
    if episodes_df is None:
        # Backward-compatible default: the terminated-episodes frame built
        # elsewhere in this notebook.
        episodes_df = terminated_df

    # Build the boolean mask in one pass instead of growing a DataFrame row
    # by row with DataFrame.append (quadratic, and removed in pandas 2.0).
    repeats_feature = np.array(
        [
            ast.literal_eval(trajectory).count(feature) > threshold
            for trajectory in episodes_df['trajectory']
        ],
        dtype=bool,
    )
    return episodes_df.index[repeats_feature]

#### X_test

In [4]:
# Load the synthetic anemia dataset and replace every missing value with 0.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv').fillna(0)

# Give each distinct label an integer code, numbered in order of first appearance.
classes = df['label'].unique().tolist()
nums = list(range(len(classes)))
class_dict = {name: code for name, code in zip(classes, nums)}
class_dict

{'No anemia': 0,
 'Hemolytic anemia': 1,
 'Aplastic anemia': 2,
 'Iron deficiency anemia': 3,
 'Vitamin B12/Folate deficiency anemia': 4,
 'Anemia of chronic disease': 5}

In [5]:
# Swap the string labels for their integer codes from class_dict.
df['label'] = df['label'].replace(class_dict)

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Stratified 70/30 train/test split, seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

# Put features and label back together for the test rows and renumber them
# 0..n-1, so a row's index label equals its position.
test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_set.shape

(11515, 7)

In [43]:
# Preview the first rows of the re-indexed test split (features + label).
test_set.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1
1,5.847005,0.0,0.364588,0.0,0.0,87.13616,2
2,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1
3,15.224254,0.0,0.0,0.0,361.625413,0.0,0
4,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2


#### test_df

In [7]:
# Per-episode results file to analyse. Alternative file, kept for reference:
#test_df = pd.read_csv('test_dfs/test_df_with_hb_1e6.csv')
test_df = pd.read_csv('test_dfs/test_df_with_hb_some_nans_2e6.csv')
test_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
1,4.0,1.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0
2,4.0,2.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
3,2.0,3.0,1.0,2.0,0.0,"['hemoglobin', 'No anemia']",0.0,0.0
4,4.0,4.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0


#### Terminated episodes

In [8]:
# Keep only the episodes that have no prediction (y_pred is NaN); row labels
# from test_df are preserved.
terminated_df = test_df.loc[test_df['y_pred'].isna()]
terminated_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
30,8.0,30.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
81,8.0,81.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
92,8.0,92.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",5.0,
183,8.0,183.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",5.0,
191,8.0,191.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,


#### Hemoglobin

In [15]:
# Index labels of the terminated episodes whose trajectory contains
# 'hemoglobin' more than twice.
hb_idx_list = get_feat_indices('hemoglobin')
hb_idx_list

Int64Index([7621, 9997], dtype='int64')

In [17]:
# Episode records (rows of test_df) for the repeated-hemoglobin episodes.
repeat_hb_df = get_idx_df(test_df, hb_idx_list)
repeat_hb_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
7621,8.0,7621.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'hem...",5.0,
9997,8.0,9997.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'hem...",5.0,


In [18]:
# Feature values (rows of test_set) at the same positions as those episodes.
repeat_hb_test_set = get_idx_df(test_set, hb_idx_list)
repeat_hb_test_set

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
7621,6.518625,553.613414,0.0,0.0,326.514726,78.783691,5
9997,4.488353,544.525924,0.0,0.0,239.065577,75.76024,5


In [20]:
# Full trajectory string of the second repeated-hemoglobin episode.
# Reads repeat_hb_df, the frame built above; the previously used name
# `repeated_hb_df` is not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
repeat_hb_df.iloc[1]['trajectory']

"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'hemoglobin', 'hemoglobin', 'hemoglobin', 'hemoglobin']"

In [22]:
# Summary statistics of the features for the repeated-hemoglobin episodes.
repeat_hb_test_set.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,5.503489,549.069669,0.0,0.0,282.790151,77.271965,5.0
std,1.43562,6.425826,0.0,0.0,61.835886,2.137902,0.0
min,4.488353,544.525924,0.0,0.0,239.065577,75.76024,5.0
25%,4.995921,546.797797,0.0,0.0,260.927864,76.516103,5.0
50%,5.503489,549.069669,0.0,0.0,282.790151,77.271965,5.0
75%,6.011057,551.341542,0.0,0.0,304.652438,78.027828,5.0
max,6.518625,553.613414,0.0,0.0,326.514726,78.783691,5.0


#### Reticulocyte count

In [23]:
# Index labels of the terminated episodes whose trajectory contains
# 'ret_count' more than twice.
ret_idx_list = get_feat_indices('ret_count')
ret_idx_list

Int64Index([   81,    92,   183,   191,   206,   260,   284,   341,   404,
              407,
            ...
            11212, 11239, 11242, 11284, 11303, 11307, 11353, 11374, 11412,
            11457],
           dtype='int64', length=379)

In [24]:
# Episode records (rows of test_df) for the repeated-ret_count episodes.
repeat_ret_df = get_idx_df(test_df, ret_idx_list)
repeat_ret_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
81,8.0,81.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
92,8.0,92.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",5.0,
183,8.0,183.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",5.0,
191,8.0,191.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
206,8.0,206.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
...,...,...,...,...,...,...,...,...
11307,8.0,11307.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
11353,8.0,11353.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
11374,8.0,11374.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
11412,8.0,11412.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,


In [50]:
# Feature values (rows of test_set) at the same positions as those episodes.
repeat_ret_test_set = get_idx_df(test_set, ret_idx_list)
repeat_ret_test_set

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
81,5.204281,4.799012,0.0,0.0,163.541988,75.774616,3
92,2.500476,5553.137234,0.0,0.0,253.657564,78.879349,5
183,11.411283,3878.278356,0.0,0.0,190.360511,79.667771,5
191,9.422081,6.293629,0.0,0.0,246.411440,77.644708,3
206,5.702470,6.713796,0.0,0.0,242.904336,78.319672,3
...,...,...,...,...,...,...,...
11307,10.192418,1.943551,0.0,0.0,313.762131,72.748115,3
11353,8.547071,4.285353,0.0,0.0,174.662728,67.844411,3
11374,9.326588,4.344287,0.0,0.0,198.880182,70.066818,3
11412,7.436377,7.434780,0.0,0.0,215.275348,68.816177,3


In [51]:
# Summary statistics of the features for all repeated-ret_count episodes.
repeat_ret_test_set.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,379.0,379.0,379.0,379.0,379.0,379.0,379.0
mean,6.43108,576.687423,2.1e-05,0.018158,237.298874,77.607676,3.453826
std,3.23028,1412.026847,0.000212,0.169663,99.577096,4.882633,0.854403
min,1.052845,0.0,0.0,0.0,0.0,65.687098,2.0
25%,3.607367,3.496301,0.0,0.0,176.514199,75.202443,3.0
50%,6.232676,5.580898,0.0,0.0,246.41144,78.035577,3.0
75%,9.426724,7.928593,0.0,0.0,303.393995,79.465265,3.0
max,11.993901,9513.295275,0.00339,2.250333,517.448868,109.345932,5.0


In [84]:
# Repeated-ret_count episodes whose true class is 2 ('Aplastic anemia' in
# class_dict). NOTE(review): the `_3` suffix does not match the class being
# selected here — confirm which class was intended.
repeat_ret_df_3 = repeat_ret_df.loc[repeat_ret_df['y_actual'].eq(2)]

# Feature rows of test_set at the same positions.
repeat_ret_test_set_3 = get_idx_df(test_set, repeat_ret_df_3.index)

In [85]:
# Distinct trajectories followed by the episodes selected above.
repeat_ret_df_3['trajectory'].unique()

array(["['hemoglobin', 'mcv', 'ret_count', 'ret_count', 'ret_count', 'ret_count', 'ret_count', 'ret_count']"],
      dtype=object)

In [86]:
# Episodes from repeat_ret_df_3 that followed exactly this trajectory
# (hemoglobin, mcv, then ret_count six times).
a = repeat_ret_df_3.loc[
    repeat_ret_df_3['trajectory'].eq(
        "['hemoglobin', 'mcv', 'ret_count', 'ret_count', 'ret_count', 'ret_count', 'ret_count', 'ret_count']"
    )
]
a

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
801,8.0,801.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,
972,8.0,972.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,
5508,8.0,5508.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,
6364,8.0,6364.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,
6947,8.0,6947.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,
8162,8.0,8162.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",2.0,


In [87]:
# Feature rows of test_set at the positions of the episodes in `a`.
a_test_set = get_idx_df(test_set, a.index)
a_test_set.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
801,10.426449,0.0,0.00339,0.0,0.0,81.527283,2
972,2.633483,0.0,0.001094,0.0,0.0,87.426538,2
5508,10.269576,26.612355,0.000646,0.0,0.0,92.001897,2
6364,1.107369,0.0,1e-05,0.0,0.0,85.948206,2
6947,8.662935,0.0,0.00162,0.0,0.0,86.170668,2


In [88]:
# Summary statistics of the features for the episodes in `a`.
a_test_set.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,6.311835,18.199831,0.001327,0.0,0.0,86.993937,2.0
std,4.027698,33.290737,0.00115,0.0,0.0,3.478903,0.0
min,1.107369,0.0,1e-05,0.0,0.0,81.527283,2.0
25%,3.167911,0.0,0.000758,0.0,0.0,86.003821,2.0
50%,6.717065,0.0,0.001147,0.0,0.0,86.798603,2.0
75%,9.867916,19.959266,0.001516,0.0,0.0,88.523408,2.0
max,10.426449,82.586629,0.00339,0.0,0.0,92.001897,2.0


In [36]:
# Number of repeated-ret_count episodes per true class.
# The previous expression indexed the frame with itself, which cannot
# produce per-class counts of y_actual; value_counts() is the call that does.
repeat_ret_df.y_actual.value_counts()

3.0    280
5.0     85
4.0      8
2.0      6
Name: y_actual, dtype: int64

In [68]:
# Index labels of the repeated-ret_count episodes whose true class is 3
# ('Iron deficiency anemia' in class_dict).
ret_idx_list_3 = repeat_ret_df.loc[repeat_ret_df['y_actual'].eq(3)].index

In [69]:
# Summary statistics for all repeated-ret_count episodes.
# NOTE(review): this describes the full repeat_ret_test_set again, not the
# class-3 subset indexed by ret_idx_list_3 — confirm which was intended.
repeat_ret_test_set.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,379.0,379.0,379.0,379.0,379.0,379.0,379.0
mean,6.43108,576.687423,2.1e-05,0.018158,237.298874,77.607676,3.453826
std,3.23028,1412.026847,0.000212,0.169663,99.577096,4.882633,0.854403
min,1.052845,0.0,0.0,0.0,0.0,65.687098,2.0
25%,3.607367,3.496301,0.0,0.0,176.514199,75.202443,3.0
50%,6.232676,5.580898,0.0,0.0,246.41144,78.035577,3.0
75%,9.426724,7.928593,0.0,0.0,303.393995,79.465265,3.0
max,11.993901,9513.295275,0.00339,2.250333,517.448868,109.345932,5.0


#### Segmented neutrophils

In [89]:
# Index labels of the terminated episodes whose trajectory contains
# 'segmented_neutrophils' more than twice.
neutrophils_idx_list = get_feat_indices('segmented_neutrophils')
neutrophils_idx_list

Int64Index([   30,   994,  1015,  1071,  1200,  1307,  1385,  1511,  1877,
             1945,  1947,  2007,  2048,  2137,  2236,  2510,  2630,  2778,
             3181,  3248,  3283,  3361,  3435,  3474,  3579,  3608,  3648,
             3704,  3913,  4244,  4530,  4701,  4778,  4955,  5083,  5101,
             5174,  5255,  5261,  5289,  5293,  5310,  5313,  5456,  5764,
             5824,  6057,  6113,  6481,  6517,  6649,  6999,  7005,  7011,
             7090,  7611,  7615,  8138,  8190,  8383,  8403,  8424,  8522,
             8607,  8879,  9113,  9123,  9148,  9288,  9292,  9322,  9468,
             9764, 10072, 10509, 10690, 11055, 11117, 11120, 11285, 11389],
           dtype='int64')

In [90]:
# Episode records (rows of test_df) for the repeated-segmented_neutrophils episodes.
repeat_neutrophils_df = get_idx_df(test_df, neutrophils_idx_list)
repeat_neutrophils_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
30,8.0,30.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
994,8.0,994.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
1015,8.0,1015.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",2.0,
1071,8.0,1071.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
1200,8.0,1200.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
...,...,...,...,...,...,...,...,...
11055,8.0,11055.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
11117,8.0,11117.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",2.0,
11120,8.0,11120.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,
11285,8.0,11285.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",1.0,


In [92]:
# Number of repeated-segmented_neutrophils episodes per true class.
repeat_neutrophils_df.y_actual.value_counts()

1.0    44
2.0    35
4.0     2
Name: y_actual, dtype: int64

In [109]:
# Repeated-segmented_neutrophils episodes whose true class is 4
# ('Vitamin B12/Folate deficiency anemia' in class_dict).
# NOTE(review): the `_3` suffix does not match the class being selected here
# — confirm which class was intended.
repeat_neutrophils_df_3 = repeat_neutrophils_df.loc[
    repeat_neutrophils_df['y_actual'].eq(4)
]

# Feature rows of test_set at the same positions.
repeat_neutrophils_test_set_3 = get_idx_df(
    test_set, repeat_neutrophils_df_3.index
)

In [110]:
# Distinct trajectories followed by the episodes selected above.
repeat_neutrophils_df_3['trajectory'].unique()

array(["['hemoglobin', 'mcv', 'segmented_neutrophils', 'segmented_neutrophils', 'segmented_neutrophils', 'segmented_neutrophils', 'segmented_neutrophils', 'segmented_neutrophils']"],
      dtype=object)

In [111]:
# Feature rows of test_set for the episodes in repeat_neutrophils_df_3.
# NOTE(review): this rebinds a_test_set, which held the ret_count subset in an
# earlier cell, and selects the same rows as repeat_neutrophils_test_set_3.
a_test_set = get_idx_df(test_set, repeat_neutrophils_df_3.index)
a_test_set.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
2510,7.052314,0.0,0.0,0.01162,0.0,106.570625,4
9288,9.612852,0.0,0.0,0.011007,0.0,105.984704,4


In [112]:
# Summary statistics of the features for the rows currently held in a_test_set.
a_test_set.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,8.332583,0.0,0.0,0.011313,0.0,106.277665,4.0
std,1.810574,0.0,0.0,0.000433,0.0,0.414308,0.0
min,7.052314,0.0,0.0,0.011007,0.0,105.984704,4.0
25%,7.692449,0.0,0.0,0.01116,0.0,106.131184,4.0
50%,8.332583,0.0,0.0,0.011313,0.0,106.277665,4.0
75%,8.972718,0.0,0.0,0.011466,0.0,106.424145,4.0
max,9.612852,0.0,0.0,0.01162,0.0,106.570625,4.0


In [108]:
# Index labels of the repeated-segmented_neutrophils episodes whose true
# class is 3 ('Iron deficiency anemia' in class_dict).
# NOTE(review): the value_counts output recorded above lists only classes
# 1, 2 and 4, so this selection is presumably empty — confirm.
neutrophils_idx_list_3 = repeat_neutrophils_df.loc[
    repeat_neutrophils_df['y_actual'].eq(3)
].index

In [None]:
# Feature rows (from test_set) for every episode that repeated
# 'segmented_neutrophils'. This frame was never assigned anywhere in the
# notebook, so describe() raised NameError; build it here the same way the
# hemoglobin and ret_count sections build theirs.
repeat_neutrophils_test_set = get_idx_df(test_set, neutrophils_idx_list)
repeat_neutrophils_test_set.describe()