In [1]:
import pandas as pd
import numpy as np
import ast
import os
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Single seed shared by every source of randomness used in this notebook
# (also passed as random_state to train_test_split further down).
SEED = 42

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# only affects processes launched after this cell, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [26]:
def get_idx_df(target_df, idx_list):
    """Return the rows of ``target_df`` located at the positions in ``idx_list``.

    Selection goes through ``.iloc``, so ``idx_list`` is interpreted as 0-based
    row positions rather than index labels. The two only coincide when
    ``target_df`` has a default 0..n-1 index.

    Args:
        target_df: DataFrame to select rows from.
        idx_list: Integer positions (list, array or pandas Index) of the rows.

    Returns:
        DataFrame containing the selected rows, in the order given.
    """
    return target_df.iloc[idx_list]

#### X_test

In [3]:
# Load the synthetic dataset; missing values are replaced with 0.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv').fillna(0)

# Give every diagnosis an integer id, numbered in order of first appearance.
classes = list(df['label'].unique())
nums = list(range(len(classes)))
class_dict = {name: code for code, name in enumerate(classes)}
class_dict

{'No anemia': 0,
 'Hemolytic anemia': 1,
 'Aplastic anemia': 2,
 'Iron deficiency anemia': 3,
 'Vitamin B12/Folate deficiency anemia': 4,
 'Anemia of chronic disease': 5}

In [4]:
# Swap the diagnosis strings for their integer ids from class_dict.
df['label'] = df['label'].replace(class_dict)

# Every column except the last is a feature; the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Stratified 70/30 split, seeded so the same rows land in the test set each run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

# Features + label side by side with a fresh 0..n-1 index, so that later cells
# can look rows up by position.
test_set = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_set.shape

(11515, 7)

In [5]:
# Peek at the held-out rows: feature columns followed by the encoded label.
test_set.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1
1,5.847005,0.0,0.364588,0.0,0.0,87.13616,2
2,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1
3,15.224254,0.0,0.0,0.0,361.625413,0.0,0
4,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2


#### test_df

In [6]:
# Alternative results file, kept for reference:
#test_df = pd.read_csv('test_dfs/test_df_with_hb_1e6.csv')
# Per-episode results (trajectory, reward, y_actual, y_pred, ...).
# presumably one row per test_set row, in the same order — TODO confirm
test_df = pd.read_csv('test_dfs/test_df_with_hb_some_nans_2e6.csv')
test_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
1,4.0,1.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0
2,4.0,2.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
3,2.0,3.0,1.0,2.0,0.0,"['hemoglobin', 'No anemia']",0.0,0.0
4,4.0,4.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0


In [7]:
# Later cells use test_df row labels as positions into test_set, which only
# makes sense if both frames describe the same rows. Fail loudly if the
# results file does not match the split produced above.
assert len(test_df) == len(test_set), (
    f"test_df has {len(test_df)} rows but test_set has {len(test_set)}"
)
len(test_df)

11515

In [8]:
# Number of episodes per predicted class id (ids as in class_dict).
test_df.y_pred.value_counts()

1.0    4179
0.0    3009
2.0    2860
4.0     453
5.0     437
3.0     104
Name: y_pred, dtype: int64

#### 0 - No anemia

In [36]:
# Episodes whose predicted class is 0 ("No anemia").
no_df = test_df.loc[test_df['y_pred'] == 0]

# Subset of those where the true class differs from the prediction.
misdiag_no = no_df.loc[no_df['y_actual'] != no_df['y_pred']]
misdiag_no

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
272,3.0,272.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
328,3.0,328.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
2460,3.0,2460.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
2760,3.0,2760.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
5167,5.0,5167.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",5.0,0.0
5660,3.0,5660.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
6022,3.0,6022.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
6037,3.0,6037.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
7255,3.0,7255.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0


In [41]:
# True class ids of the cases that were wrongly predicted as "No anemia".
misdiag_no.y_actual.value_counts()

1.0    5
2.0    3
5.0    1
Name: y_actual, dtype: int64

In [48]:
# Row 5167 is the only entry in misdiag_no whose true class is 5
# (anemia of chronic disease); inspect its feature values.
test_set.iloc[5167]

hemoglobin                 3.891606
ferritin                  49.305252
ret_count                  1.648161
segmented_neutrophils      0.000000
tibc                     240.471361
mcv                       77.004147
label                      5.000000
Name: 5167, dtype: float64

In [47]:
# Feature summary for the class-2 (aplastic anemia) cases predicted as "No anemia".
get_idx_df(test_set, misdiag_no[misdiag_no.y_actual==2].index).describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,11.989757,88.065696,1.143886,0.0,127.481198,84.563698,2.0
std,0.003653,142.228876,0.803376,0.0,220.803911,3.996552,0.0
min,11.986292,0.0,0.217312,0.0,0.0,81.756138,2.0
25%,11.987849,6.023489,0.892813,0.0,0.0,82.275883,2.0
50%,11.989406,12.046979,1.568313,0.0,0.0,82.795629,2.0
75%,11.991489,132.098543,1.607172,0.0,191.221796,85.967478,2.0
max,11.993573,252.150108,1.646032,0.0,382.443593,89.139328,2.0


In [40]:
# Feature summary across every case wrongly predicted as "No anemia".
get_idx_df(test_set, misdiag_no.index).describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,11.09117,79.25843,2.74898,0.0,115.575875,85.623341,1.777778
std,2.699841,105.799536,1.726308,0.0,179.576603,5.13576,1.301708
min,3.891606,0.0,0.217312,0.0,0.0,77.004147,1.0
25%,11.986292,0.0,1.646032,0.0,0.0,81.756138,1.0
50%,11.989406,12.046979,2.126409,0.0,0.0,85.43592,1.0
75%,11.99426,199.610649,3.876433,0.0,240.471361,90.29968,2.0
max,11.999148,252.150108,5.611881,0.0,417.267923,91.797709,5.0


In [50]:
# "No anemia" predictions whose trajectory contains the 'mcv' step.
# The trajectory column stores each list as a string, hence literal_eval.
# A boolean mask replaces the row-by-row DataFrame.append loop: append was
# removed in pandas 2.0 and copied the frame on every iteration. The mask also
# keeps no_df's columns and dtypes, even when no row matches.
took_mcv = np.array(
    ['mcv' in ast.literal_eval(traj) for traj in no_df['trajectory']],
    dtype=bool,
)
analytic_no_df = no_df.loc[took_mcv]
analytic_no_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
272,3.0,272.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
328,3.0,328.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
2460,3.0,2460.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
2760,3.0,2760.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
5167,5.0,5167.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",5.0,0.0
5660,3.0,5660.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
6022,3.0,6022.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0
6037,3.0,6037.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",2.0,0.0
7255,3.0,7255.0,0.0,1.0,0.0,"['hemoglobin', 'mcv', 'No anemia']",1.0,0.0


In [51]:
# Feature values of the rows in analytic_no_df, looked up by position in test_set.
get_idx_df(test_set, analytic_no_df.index)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
272,11.999148,0.0,3.519355,0.0,0.0,91.797709,1
328,11.986292,252.150108,0.217312,0.0,382.443593,81.756138,2
2460,11.99426,199.610649,3.876433,0.0,0.0,81.677757,1
2760,11.994334,0.0,4.526924,0.0,0.0,90.29968,1
5167,3.891606,49.305252,1.648161,0.0,240.471361,77.004147,5
5660,11.989406,12.046979,1.646032,0.0,0.0,82.795629,2
6022,11.98309,0.0,5.611881,0.0,417.267923,85.43592,1
6037,11.993573,0.0,1.568313,0.0,0.0,89.139328,2
7255,11.98882,200.212879,2.126409,0.0,0.0,90.703758,1


#### 1 - Hemolytic anemia

In [9]:
# Episodes whose predicted class is 1 (hemolytic anemia).
hemolytic_df = test_df.loc[test_df['y_pred'] == 1]

In [13]:
# Hemolytic predictions where the true class differs from the prediction.
hemolytic_df.loc[hemolytic_df['y_actual'] != hemolytic_df['y_pred']]

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
12,4.0,12.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",5.0,1.0
462,4.0,462.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
678,4.0,678.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",3.0,1.0
939,5.0,939.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Hem...",5.0,1.0
1699,4.0,1699.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
2035,4.0,2035.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",5.0,1.0
2403,5.0,2403.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Hem...",5.0,1.0
3066,4.0,3066.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",5.0,1.0
3259,4.0,3259.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",5.0,1.0
3320,4.0,3320.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0


In [15]:
# Hemolytic predictions whose trajectory does NOT contain the 'ret_count' step.
# The trajectory column stores each list as a string, hence literal_eval.
# A boolean mask replaces the row-by-row DataFrame.append loop: append was
# removed in pandas 2.0 and copied the frame on every iteration. The mask also
# keeps hemolytic_df's columns and dtypes, even when no row matches.
skipped_ret_count = np.array(
    ['ret_count' not in ast.literal_eval(traj) for traj in hemolytic_df['trajectory']],
    dtype=bool,
)
analytic_hem_df = hemolytic_df.loc[skipped_ret_count]
analytic_hem_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
462,4.0,462.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
939,5.0,939.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Hem...",5.0,1.0
1699,4.0,1699.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
2403,5.0,2403.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Hem...",5.0,1.0
3320,4.0,3320.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
5829,4.0,5829.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
6009,4.0,6009.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Hemolytic a...",5.0,1.0
7710,5.0,7710.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",4.0,1.0
10552,5.0,10552.0,0.0,3.0,0.0,"['hemoglobin', 'mcv', 'segmented_neutrophils',...",4.0,1.0


#### 2 - Aplastic anemia

In [22]:
# Episodes whose predicted class is 2 (aplastic anemia).
aplastic_df = test_df.loc[test_df['y_pred'] == 2]

# Subset of those where the true class differs from the prediction.
misdiag_aplas = aplastic_df.loc[aplastic_df['y_actual'] != aplastic_df['y_pred']]
misdiag_aplas

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
162,4.0,162.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",5.0,2.0
302,4.0,302.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
742,4.0,742.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
775,4.0,775.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
829,4.0,829.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",5.0,2.0
...,...,...,...,...,...,...,...,...
10888,4.0,10888.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
11022,4.0,11022.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
11111,4.0,11111.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0
11178,4.0,11178.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0


In [33]:
# Aplastic predictions whose true class is 3 (iron deficiency anemia):
# take their row labels, then pull the matching feature rows from test_set.
true_iron_rows = misdiag_aplas.loc[misdiag_aplas['y_actual'] == 3]
misdiag_aplas_3 = get_idx_df(test_set, true_iron_rows.index)
misdiag_aplas_3

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
857,4.647758,5.615222,1.404479,0.0,165.674843,79.768656,3
1115,1.984988,6.081102,1.652798,0.0,234.024666,79.836273,3
1486,1.837392,2.668882,1.387516,0.0,138.621625,79.862013,3
1549,2.156365,6.252593,1.02285,0.0,282.516163,79.782234,3
3653,6.775466,1.647538,1.917517,0.0,333.712841,79.473182,3
4050,3.315839,5.939285,1.264272,0.0,410.405612,79.46998,3
4204,1.867885,6.060087,1.160173,0.0,287.221606,78.805047,3
4873,8.171443,4.670497,1.771352,0.0,224.805605,79.767088,3
5512,2.757855,2.879461,0.987891,0.0,289.971768,79.618685,3
5958,1.833193,2.216234,1.145684,0.0,243.071911,78.632129,3


In [34]:
# Summary statistics for the iron-deficiency (label 3) rows predicted as aplastic anemia.
misdiag_aplas_3.describe()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,4.681927,4.26257,1.405005,0.0,262.084209,79.441526,3.0
std,3.026437,1.985578,0.291223,0.0,76.207427,0.393322,0.0
min,1.833193,1.647538,0.987891,0.0,138.621625,78.632129,3.0
25%,2.027833,2.523923,1.186198,0.0,227.110371,79.252039,3.0
50%,3.847087,4.166935,1.352682,0.0,280.327168,79.540484,3.0
75%,6.658961,6.029887,1.652777,0.0,301.652275,79.768264,3.0
max,11.327348,7.619657,1.917517,0.0,410.405612,79.862013,3.0


In [25]:
# Row labels of the hemolytic (true class 1) cases predicted as aplastic anemia.
misdiag_aplas[misdiag_aplas.y_actual==1].index

Int64Index([  302,   742,   775,   879,  1162,  1459,  1926,  2189,  2946,
             3165,  3726,  4868,  5819,  5873,  6174,  6392,  6703,  6927,
             7207,  8001,  8020,  8775,  8947,  8974,  9274,  9427,  9539,
             9712, 10343, 10601, 10667, 10783, 10796, 10888, 11022, 11111,
            11178],
           dtype='int64')

In [19]:
# Aplastic predictions whose trajectory contains the 'ferritin' step.
# The trajectory column stores each list as a string, hence literal_eval.
# A boolean mask replaces the row-by-row DataFrame.append loop: append was
# removed in pandas 2.0 and copied the frame on every iteration. The mask also
# keeps aplastic_df's columns and dtypes, even when no row matches.
took_ferritin = np.array(
    ['ferritin' in ast.literal_eval(traj) for traj in aplastic_df['trajectory']],
    dtype=bool,
)
analytic_aplas_df = aplastic_df.loc[took_ferritin]
analytic_aplas_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
9372,4.0,9372.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Aplastic an...",5.0,2.0
9924,4.0,9924.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'Aplastic an...",5.0,2.0


In [21]:
# Features of the row at position 9924 (one of the two rows in analytic_aplas_df).
# .iloc is positional, so this is the same row as test_set.iloc[9924]; X_test
# was never re-indexed, so the Name shown is the row's label in the original
# df, not 9924.
X_test.iloc[9924]

hemoglobin                 6.547545
ferritin                  85.026614
ret_count                  0.000000
segmented_neutrophils      0.000000
tibc                     350.518561
mcv                       77.765003
Name: 9531, dtype: float64