In [17]:
import ast
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

#### The data used

In [18]:
# Load the synthetic anemia dataset and replace every NaN with 0.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv')
df = df.fillna(0)

# Encode each distinct label value as an integer, numbered in order of first appearance.
classes = list(df.label.unique())
nums = list(range(len(classes)))
class_dict = dict(zip(classes, nums))
# `map` is a plain dictionary lookup and yields an integer column directly;
# `replace` on an object column relies on implicit downcasting, which pandas 2.2 deprecates.
df['label'] = df['label'].map(class_dict)

# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

# Stratified 70/30 split with a fixed seed so the held-out rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26868, 6), (11515, 6), (26868,), (11515,))

In [19]:
# Load the saved per-episode test results and preview the first rows.
test_df = pd.read_csv(filepath_or_buffer='test_dfs/test_df_with_hb_some_nans_2e6.csv')
test_df.head(n=5)

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
1,4.0,1.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0
2,4.0,2.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0
3,2.0,3.0,1.0,2.0,0.0,"['hemoglobin', 'No anemia']",0.0,0.0
4,4.0,4.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0


In [20]:
# Rebuild a labelled frame from the held-out arrays: feature columns first,
# then the label column, reusing the column names of the original dataset.
X_test_df = pd.DataFrame(X_test, columns=df.columns[:-1])
X_test_df[df.columns[-1]] = y_test
X_test_df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1
1,5.847005,0.0,0.364588,0.0,0.0,87.13616,2
2,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1
3,15.224254,0.0,0.0,0.0,361.625413,0.0,0
4,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2


In [23]:
# Put the episode results and the feature values side by side, aligned on the row index.
combined_test_df = pd.concat(objs=[test_df, X_test_df], axis='columns')
combined_test_df.head(n=5)

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
0,4.0,0.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1
1,4.0,1.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0,5.847005,0.0,0.364588,0.0,0.0,87.13616,2
2,4.0,2.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Hemolytic ...",1.0,1.0,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1
3,2.0,3.0,1.0,2.0,0.0,"['hemoglobin', 'No anemia']",0.0,0.0,15.224254,0.0,0.0,0.0,361.625413,0.0,0
4,4.0,4.0,1.0,4.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",2.0,2.0,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2


In [24]:
# Keep only the episodes whose true class is 1 but which were predicted as class 2,
# then report how many there are.
analysis_test_df = combined_test_df.loc[
    lambda frame: (frame['y_actual'] == 1) & (frame['y_pred'] == 2)
]
analysis_test_df.shape[0]

37

In [25]:
# Preview the first misclassified episodes.
analysis_test_df.head(n=5)

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
302,4.0,302.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0,7.033549,0.0,2.003333,0.0,0.0,92.753463,1
742,4.0,742.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0,11.69912,91.223618,2.01381,0.0,0.0,91.334704,1
775,4.0,775.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0,6.490077,0.0,2.039745,0.0,0.0,98.517825,1
879,4.0,879.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0,3.405622,165.887192,2.0133,0.0,0.0,95.678177,1
1162,4.0,1162.0,0.0,2.0,0.0,"['hemoglobin', 'mcv', 'ret_count', 'Aplastic a...",1.0,2.0,8.459647,0.0,2.0254,0.0,423.597765,92.446936,1


In [28]:
# Spread of ret_count across the misclassified episodes.
analysis_test_df['ret_count'].std()

0.015891287418980313

In [29]:
# Average ret_count across the misclassified episodes.
analysis_test_df['ret_count'].mean()

2.021073613571375

#### Shorter paths 

In [5]:
# Sanity check: the recorded true labels must match the held-out labels row for row,
# otherwise the two frames cannot be combined by position.
a1 = test_df['y_actual'].to_numpy(dtype=np.float32)
a2 = X_test_df['label'].to_numpy(dtype=np.float32)
assert np.array_equal(a1, a2)

In [6]:
def get_short_paths(anemia_num, standard_len, results_df=None, features_df=None):
    """Return correctly classified episodes whose trajectory is shorter than expected.

    Parameters
    ----------
    anemia_num : int
        Encoded class; only rows where both ``y_pred`` and ``y_actual`` equal
        this value are considered.
    standard_len : int
        Rows are kept only when ``len(trajectory) < standard_len``.
    results_df : pandas.DataFrame, optional
        Episode results with ``y_pred``, ``y_actual`` and ``trajectory`` columns.
        Defaults to the notebook-level ``test_df``.
    features_df : pandas.DataFrame, optional
        Feature frame sharing the index of ``results_df``.
        Defaults to the notebook-level ``X_test_df``.

    Returns
    -------
    pandas.DataFrame
        The selected result rows joined column-wise with their feature rows;
        the ``trajectory`` column holds parsed lists. Empty (columns preserved)
        when nothing qualifies.
    """
    if results_df is None:
        results_df = test_df
    if features_df is None:
        features_df = X_test_df

    # Restrict to episodes of this class that were diagnosed correctly.
    is_correct = (results_df['y_pred'] == anemia_num) & (results_df['y_actual'] == anemia_num)
    anem_test_df = results_df[is_correct]
    anem_x_test_df = features_df.loc[anem_test_df.index]
    combined_anem_test_df = pd.concat([anem_test_df, anem_x_test_df], axis=1)

    # Trajectories are stored as the string repr of a list; parse them safely.
    # Values that are already parsed are passed through unchanged.
    combined_anem_test_df['trajectory'] = combined_anem_test_df['trajectory'].apply(
        lambda traj: ast.literal_eval(traj) if isinstance(traj, str) else traj
    )

    # Vectorised length test (Series.iteritems was removed in pandas 2.0).
    # The explicit int cast keeps the comparison well-defined for an empty selection.
    is_short = combined_anem_test_df['trajectory'].map(len).astype('int64') < standard_len
    return combined_anem_test_df.loc[is_short]

#### 0 - No anemia
This class doesn't qualify for analysis because its pathway is already as short as it can be (just hemoglobin, followed by the diagnosis).

In [7]:
# Class 0: correctly diagnosed episodes with fewer than 2 trajectory steps.
no_short_df = get_short_paths(anemia_num=0, standard_len=2)
no_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label


#### 1 - Hemolytic Anemia

In [8]:
# Class 1: correctly diagnosed episodes with fewer than 4 trajectory steps.
hemolytic_short_df = get_short_paths(anemia_num=1, standard_len=4)
hemolytic_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label


#### 2 - Aplastic Anemia

In [9]:
# Class 2: correctly diagnosed episodes with fewer than 4 trajectory steps.
aplastic_short_df = get_short_paths(anemia_num=2, standard_len=4)
aplastic_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label


#### 3 - Iron deficiency anemia 

In [15]:
# Class 3: correctly diagnosed episodes with fewer than 5 trajectory steps.
ida_short_df = get_short_paths(anemia_num=3, standard_len=5)
ida_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label


#### 4 - Vitamin B12

In [11]:
# Class 4: correctly diagnosed episodes with fewer than 4 trajectory steps.
vit_short_df = get_short_paths(anemia_num=4, standard_len=4)
vit_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label


#### 5 - Anemia of chronic disease

In [16]:
# Class 5: correctly diagnosed episodes with fewer than 5 trajectory steps.
# Stored under its own name so the class-3 result held in `ida_short_df`
# is not overwritten by this cell.
# NOTE(review): the saved output of this cell lists trajectories that appear to have
# 5 steps, which `< 5` would exclude — confirm the intended standard_len and re-run.
acd_short_df = get_short_paths(5, 5)
acd_short_df

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label
33,5.0,33.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,6.391260,2644.506310,0.000000,0.0,206.124879,72.764850,5
72,5.0,72.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,1.061561,637.533935,0.000000,0.0,327.209061,78.020080,5
101,5.0,101.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,2.851050,2784.776219,1.234847,0.0,394.671148,76.839887,5
156,5.0,156.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,5.890603,2965.755950,0.000000,0.0,330.271814,77.107361,5
170,5.0,170.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,10.259760,1176.592963,0.000000,0.0,242.818438,77.642315,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11350,5.0,11350.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,10.329595,5424.125944,0.000000,0.0,159.281709,78.158818,5
11373,5.0,11373.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,1.138146,3332.655841,0.000000,0.0,355.700076,75.140008,5
11405,5.0,11405.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,8.548673,864.647384,1.946863,0.0,271.293446,78.574078,5
11459,5.0,11459.0,1.0,5.0,0.0,"[hemoglobin, mcv, ferritin, tibc, Anemia of ch...",5.0,5.0,6.529708,4585.374530,0.000000,0.0,320.563259,78.510748,5
