In [2]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline

#### The data used

In [3]:
# Load the synthetic anemia dataset; every missing value is filled with 0.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv')
df = df.fillna(0)

# Encode each distinct label as an integer, numbered in order of first appearance.
classes = list(df.label.unique())
nums = list(range(len(classes)))
class_dict = dict(zip(classes, nums))
# `map` is an exact per-value lookup that always yields the integer codes;
# `replace` on an object column depends on the silent dtype downcasting that
# pandas deprecated in 2.2.
df['label'] = df['label'].map(class_dict)

# The label is the last column; everything before it is a feature.
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

# 70/30 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26868, 6), (11515, 6), (26868,), (11515,))

In [4]:
# Load the saved per-row test results and preview the first rows.
test_df = pd.read_csv('test_dfs/test_df_with_hb_some_nans_2e6.csv')
test_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
1,4.0,1.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0
2,4.0,2.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
3,2.0,3.0,1.0,2.0,0.0,"['hemoglobin', 'No anemia']",0.0,0.0
4,4.0,4.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0


In [37]:
# Percentage of rows in test_df whose y_pred is NaN.
test_df.y_pred.isna().sum()/len(test_df)*100

4.107685627442467

In [5]:
def get_min_of_array(array, n):
    '''Return the n smallest elements of a 1-D array, in no particular order.

    Parameters
    ----------
    array : numpy.ndarray
        One-dimensional array of comparable values.
    n : int
        How many of the smallest elements to return (expected to be >= 0).

    Returns
    -------
    numpy.ndarray
        The n smallest values. When n is at least len(array) a copy of the
        whole array is returned; when n is 0 the result is empty.
    '''
    if n == 0:
        # Nothing requested; this also avoids partitioning an empty array.
        return array[:0]
    if n >= len(array):
        # np.argpartition rejects kth == len(array), so asking for every
        # element (or more) simply returns them all.
        return array.copy()
    # Partitioning around position n places the n smallest values (unordered)
    # in front of it without paying for a full sort.
    idx = np.argpartition(array, n)
    return array[idx[:n]]

In [6]:
# Rebuild a labelled frame from the test arrays: features first, label last,
# reusing the column names of the original dataset.
test_features_frame = pd.DataFrame(X_test)
test_labels_frame = pd.DataFrame(y_test)
X_test_df = pd.concat([test_features_frame, test_labels_frame], axis='columns')
X_test_df.columns = df.columns
X_test_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1
1,5.847005,0.0,0.364588,0.0,0.0,87.13616,2
2,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1
3,15.224254,0.0,0.0,0.0,361.625413,0.0,0
4,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2


In [7]:
# Sanity check: the saved results and the regenerated test split must list
# the same labels in the same row order.
a1 = test_df['y_actual'].to_numpy(dtype=np.float32)
a2 = X_test_df['label'].to_numpy(dtype=np.float32)
assert np.array_equal(a1, a2)

#### 0 - Hemolytic Anemia

In [45]:
# Results rows *predicted* as class 0, and feature rows whose *actual* label is 0
# (the two selections are made on different columns, so they need not cover the same rows).
hemolytic_test_df = test_df.loc[test_df['y_pred'] == 0]
hemolytic_x_test_df = X_test_df.loc[X_test_df['label'] == 0]

In [48]:
# Rows predicted as class 0 whose actual label is something else.
wrong_class0_mask = hemolytic_test_df['y_actual'].ne(hemolytic_test_df['y_pred'])
a1 = hemolytic_test_df[wrong_class0_mask]
a1.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
33,5.0,33.0,0.0,3.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'mcv', '...",2.0,0.0
253,5.0,253.0,0.0,3.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'mcv', '...",2.0,0.0
601,5.0,601.0,0.0,3.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'mcv', '...",2.0,0.0
798,5.0,798.0,0.0,3.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'mcv', '...",2.0,0.0
1174,5.0,1174.0,0.0,3.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'mcv', '...",2.0,0.0


In [55]:
# a1's index holds row labels, so fetch the matching feature rows by label.
# (.iloc is positional and only agrees with the labels while X_test_df keeps
# its default 0..n-1 index.)
a1_index_list = list(a1.index)
a1_x_test_df = X_test_df.loc[a1_index_list]
len(a1_x_test_df)

62

In [60]:
# Summary statistics for the feature rows selected into a1_x_test_df.
a1_x_test_df.describe()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
count,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0
mean,0.0,1.901234,0.0,0.0,0.0,3.179949,88.09072,29.816013,2.0
std,0.0,0.351053,0.0,0.0,0.0,0.817971,4.156499,8.790087,0.0
min,0.0,0.006229,0.0,0.0,0.0,1.696806,80.60231,15.525618,2.0
25%,0.0,1.95236,0.0,0.0,0.0,2.483316,85.103664,23.066521,2.0
50%,0.0,1.973896,0.0,0.0,0.0,3.132505,88.698947,28.316785,2.0
75%,0.0,1.987303,0.0,0.0,0.0,3.786011,90.663813,35.495148,2.0
max,0.0,1.999754,0.0,0.0,0.0,5.191569,98.037468,54.087862,2.0


#### 1 - Anemia of chronic disease

In [32]:
# Take the rows whose actual label is 5 from both frames, then place them
# side by side (pd.concat aligns them on the index).
acd_test_df = test_df.loc[test_df['y_actual'] == 5]
acd_x_test_df = X_test_df.loc[X_test_df['label'] == 5]
combined_acd_test_df = pd.concat([acd_test_df, acd_x_test_df], axis='columns')
combined_acd_test_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
12,4.0,12.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",5.0,1.0,11.575313,6328.062304,2.158977,0.0,162.929066,79.821483,5
33,5.0,33.0,1.0,5.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Ane...",5.0,5.0,6.39126,2644.50631,0.0,0.0,206.124879,72.76485,5
72,5.0,72.0,1.0,5.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Ane...",5.0,5.0,1.061561,637.533935,0.0,0.0,327.209061,78.02008,5
92,8.0,92.0,0.0,-2.0,1.0,"['hemoglobin', 'mcv', 'ret_count', 'ret_count'...",5.0,,2.500476,5553.137234,0.0,0.0,253.657564,78.879349,5
101,5.0,101.0,1.0,5.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'tibc', 'Ane...",5.0,5.0,2.85105,2784.776219,1.234847,0.0,394.671148,76.839887,5


In [34]:
# Mean MCV of the label-5 rows that were predicted as class 2.
# NOTE(review): the `hem` in the name suggests hemolytic anemia - confirm that 2 is the intended class code.
acd_hem_pred_df = combined_acd_test_df.loc[combined_acd_test_df['y_pred'] == 2]
acd_hem_pred_df['mcv'].mean()

79.42005904186286

In [20]:
# Rows with 'tibc' in their trajectory
tibc_row_labels = []
for row_label, raw_trajectory in acd_test_df.trajectory.items():
    try:
        # Trajectories read from the CSV appear to be the string form of a list.
        traj = ast.literal_eval(raw_trajectory)
    except (ValueError, TypeError, SyntaxError):
        # Not a parseable literal (e.g. already a list): use the value as-is.
        traj = raw_trajectory
    if 'tibc' in traj:
        tibc_row_labels.append(row_label)
# Select every match in one go: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic.
tibc_acd_df = acd_test_df.loc[tibc_row_labels]
tibc_acd_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
6523,7.0,6523.0,0.0,5.0,0.0,"['mentzer_index', 'rbc', 'iron', 'mcv', 'ferri...",3.0,1.0


In [44]:
# Feature values of the X_test_df row labelled 6523.
X_test_df.loc[6523]

ferritin                   1.774338
ret_count                  0.000000
segmented_neutrophils      0.000000
iron                     115.251457
tibc                     312.645199
rbc                        4.138273
mcv                       74.123819
mentzer_index             17.911776
label                      3.000000
Name: 6523, dtype: float64

#### 2 - Aplastic Anemia

In [61]:
# Results rows *predicted* as class 2, and feature rows whose *actual* label is 2.
aplastic_test_df = test_df.loc[test_df['y_pred'] == 2]
aplastic_x_test_df = X_test_df.loc[X_test_df['label'] == 2]

In [63]:
# Rows with 'segmented_neutrophils' in their trajectory
neutrophils_row_labels = []
for row_label, raw_trajectory in aplastic_test_df.trajectory.items():
    try:
        # Trajectories read from the CSV appear to be the string form of a list.
        traj = ast.literal_eval(raw_trajectory)
    except (ValueError, TypeError, SyntaxError):
        # Not a parseable literal (e.g. already a list): use the value as-is.
        traj = raw_trajectory
    if 'segmented_neutrophils' in traj:
        neutrophils_row_labels.append(row_label)
# Select every match in one go: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic.
neutrophils_aplastic_df = aplastic_test_df.loc[neutrophils_row_labels]
neutrophils_aplastic_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
4590,4.0,4590.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'segmented_neutrophil...",4.0,2.0
6627,4.0,6627.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'segmented_neutrophil...",4.0,2.0


In [64]:
# Rows predicted as class 2 whose actual label is something else.
wrong_class2_mask = aplastic_test_df['y_actual'].ne(aplastic_test_df['y_pred'])
a2 = aplastic_test_df[wrong_class2_mask]
a2.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
2406,4.0,2406.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'Aplasti...",0.0,2.0
4307,4.0,4307.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'ret_count', 'Aplasti...",0.0,2.0
4590,4.0,4590.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'segmented_neutrophil...",4.0,2.0
6627,4.0,6627.0,0.0,2.0,0.0,"['mentzer_index', 'rbc', 'segmented_neutrophil...",4.0,2.0


In [65]:
# a2's index holds row labels, so fetch the matching feature rows by label.
# (.iloc is positional and only agrees with the labels while X_test_df keeps
# its default 0..n-1 index.)
a2_index_list = list(a2.index)
a2_x_test_df = X_test_df.loc[a2_index_list]
a2_x_test_df

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
2406,0.0,2.00596,0.0,0.0,0.0,5.22986,90.839971,17.369485,0
4307,0.0,2.034032,0.0,0.0,0.0,5.631821,99.033033,17.584549,0
4590,0.0,0.0,0.056872,0.0,0.0,3.80824,102.922809,27.026343,4
6627,0.0,0.0,0.044406,0.0,0.0,3.971057,104.236046,26.248945,4


In [67]:
# Summary statistics over the whole of X_test_df, for comparison with the subsets above.
X_test_df.describe()

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
count,8519.0,8519.0,8519.0,8519.0,8519.0,8519.0,8519.0,8519.0,8519.0
mean,206.44749,2.384897,0.107114,6.425037,27.385687,3.517365,89.149873,26.965117,1.097312
std,956.680755,2.252126,0.559969,21.872109,81.948931,0.808308,7.464055,8.03528,1.23169
min,0.0,0.0,0.0,0.0,0.0,0.664613,59.569175,12.415871,0.0
25%,0.0,0.258182,0.0,0.0,0.0,2.973509,84.166662,21.665629,0.0
50%,0.0,1.987357,0.0,0.0,0.0,3.522851,89.346789,25.295998,1.0
75%,0.0,3.910286,0.0,0.0,0.0,4.061052,94.316578,30.322435,2.0
max,10820.137808,12.598495,7.289302,202.702735,573.833319,6.35275,116.700292,129.999157,5.0


#### 3 - Iron deficiency anemia 

There are 121 IDA test samples in total. Of these, 3 are diagnosed as hemolytic anemia and 18 are diagnosed as aplastic anemia.

In [38]:
# Select the IDA rows by *actual* label (3) rather than by prediction, in both
# frames, then place them side by side (pd.concat aligns them on the index).
ida_test_df = test_df.loc[test_df['y_actual'] == 3]
ida_x_test_df = X_test_df.loc[X_test_df['label'] == 3]
combined_ida_test_df = pd.concat([ida_x_test_df, ida_test_df], axis='columns')
combined_ida_test_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
81,5.204281,4.799012,0.0,0.0,163.541988,75.774616,3,8.0,81.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
191,9.422081,6.293629,0.0,0.0,246.41144,77.644708,3,8.0,191.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
206,5.70247,6.713796,0.0,0.0,242.904336,78.319672,3,8.0,206.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,
268,8.741973,5.243427,0.58045,0.0,291.364377,76.053946,3,6.0,268.0,1.0,6.0,0.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,3.0
284,11.606463,3.271495,0.0,0.0,278.757066,76.910957,3,8.0,284.0,0.0,0.0,1.0,"['hemoglobin', 'mcv', 'ferritin', 'ret_count',...",3.0,


In [40]:
# Mean MCV across all rows whose actual label is 3.
combined_ida_test_df['mcv'].mean()

75.95405101094063

In [41]:
# Looking at the IDA rows that are diagnosed as class 1 or class 2, and their mean MCV
predicted_1_or_2 = combined_ida_test_df['y_pred'].isin([1, 2])
ida_hem_pred_df = combined_ida_test_df[predicted_1_or_2]
ida_hem_pred_df['mcv'].mean()

79.46021283637137

In [25]:
# Number of rows in ida_hem_pred_df.
len(ida_hem_pred_df)

118

In [9]:
# Label-3 rows whose prediction differs from the actual label
# (a NaN prediction compares as different, so it is included).
ida_mismatch_mask = ida_test_df['y_actual'].ne(ida_test_df['y_pred'])
a3 = ida_test_df[ida_mismatch_mask]
a3.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred


In [70]:
# Rows with 'tibc' in their trajectory
tibc_ida_row_labels = []
for row_label, raw_trajectory in ida_test_df.trajectory.items():
    try:
        # Trajectories read from the CSV appear to be the string form of a list.
        traj = ast.literal_eval(raw_trajectory)
    except (ValueError, TypeError, SyntaxError):
        # Not a parseable literal (e.g. already a list): use the value as-is.
        traj = raw_trajectory
    if 'tibc' in traj:
        tibc_ida_row_labels.append(row_label)
# Select every match in one go: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic.
tibc_ida_df = ida_test_df.loc[tibc_ida_row_labels]
tibc_ida_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
3476,7.0,3476.0,1.0,7.0,0.0,"['mentzer_index', 'rbc', 'iron', 'mcv', 'ferri...",3.0,3.0
3553,7.0,3553.0,1.0,7.0,0.0,"['mentzer_index', 'rbc', 'iron', 'mcv', 'ferri...",3.0,3.0
3956,7.0,3956.0,1.0,7.0,0.0,"['mentzer_index', 'rbc', 'iron', 'mcv', 'ferri...",3.0,3.0
5052,7.0,5052.0,1.0,7.0,0.0,"['mentzer_index', 'rbc', 'iron', 'mcv', 'ferri...",3.0,3.0


In [71]:
# tibc_ida_df's index holds row labels, so fetch the matching feature rows by label.
# (.iloc is positional and only agrees with the labels while X_test_df keeps
# its default 0..n-1 index.)
b3_index_list = list(tibc_ida_df.index)
b3_x_test_df = X_test_df.loc[b3_index_list]
b3_x_test_df

Unnamed: 0,ferritin,ret_count,segmented_neutrophils,iron,tibc,rbc,mcv,mentzer_index,label
3476,1.882835,0.0,0.0,151.140276,309.885395,2.767909,78.672609,28.423117,3
3553,1.836332,0.0,0.0,151.640269,113.897048,3.129228,74.955135,23.953234,3
3956,1.737838,0.0,0.0,113.872103,211.983293,1.190145,78.387423,65.863742,3
5052,1.765401,0.0,0.0,20.020334,298.601157,1.93975,77.485236,39.945995,3


#### 4 - Vitamin B12

In [72]:
# Results rows *predicted* as class 4, and feature rows whose *actual* label is 4.
vit_test_df = test_df.loc[test_df['y_pred'] == 4]
vit_x_test_df = X_test_df.loc[X_test_df['label'] == 4]

In [73]:
# Rows predicted as class 4 whose actual label is something else.
wrong_class4_mask = vit_test_df['y_actual'].ne(vit_test_df['y_pred'])
a4 = vit_test_df[wrong_class4_mask]
a4.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
