In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
# Pin every random number generator used in this notebook to one seed so that
# the train/test split and any stochastic helper calls are repeatable.
SEED = 42

# NOTE: hash randomization of the running kernel is fixed at interpreter start,
# so this setting can only influence processes launched from here.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Python's stdlib RNG, NumPy's legacy global RNG and PyTorch's global generator.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Ask PyTorch to use deterministic kernels only.
torch.use_deterministic_algorithms(True)

#### The Data

In [3]:
# Dataset evaluated in this notebook. Files tried in earlier experiments
# (kept here for provenance, all currently disabled):
#   ../../data/anemia_synth_dataset_some_nans_unspecified_more_feats.csv
#   ../../data/more_feats_0.2.csv
#   ../../data/more_features/more_feats_new_labels_0.1.csv
#   ../../data/more_features/more_feats_new_labels_0.1_noisy_0.6.csv
# Class balancing via utils.balance_dataset(df, 8000) is also disabled.
DATA_PATH = '../../data/more_features/more_feats_correlated_noisy_2.csv'

# Missing measurements are replaced by the sentinel value -1.
df = pd.read_csv(DATA_PATH).fillna(-1)
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,6.863691,341.823457,-1.0,0.681783,284.835163,92.819484,-1.0,2.2184,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,20.591074,-1.0,Inconclusive diagnosis
1,8.74595,104.470248,0.461482,0.787183,409.993509,104.531457,77.222298,2.510043,1,0.713849,77.276464,51.527756,7.722555,7.822857,-1.0,26.23785,18.835005,Vitamin B12/Folate deficiency anemia
2,10.308881,475.936322,-1.0,6.155778,-1.0,104.64724,95.040788,2.955323,1,-1.0,18.285577,100.169515,54.471371,11.239513,-1.0,30.926642,-1.0,Vitamin B12/Folate deficiency anemia
3,7.525442,-1.0,1.972946,2.172161,-1.0,97.271565,-1.0,2.320958,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22.576325,-1.0,No anemia
4,9.54487,411.496642,0.891182,0.0,236.428214,104.721025,-1.0,2.734371,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.634611,-1.0,Unspecified anemia


In [4]:
# Sanity check: after the fillna(-1) at load time every column should report 0.
df.isnull().sum()

hemoglobin               0
ferritin                 0
ret_count                0
segmented_neutrophils    0
tibc                     0
mcv                      0
serum_iron               0
rbc                      0
gender                   0
creatinine               0
cholestrol               0
copper                   0
ethanol                  0
folate                   0
glucose                  0
hematocrit               0
tsat                     0
label                    0
dtype: int64

In [5]:
# Baseline computed by the project helper. Judging by the output it returns
# three float scores followed by a datetime.timedelta.
# NOTE(review): presumably decision-tree accuracy / F1 / ROC-AUC plus a timing;
# confirm against utils.get_dt_performance.
utils.get_dt_performance(df)

(0.7522857142857143,
 0.7607225650220057,
 0.8607409313271263,
 datetime.timedelta(microseconds=9207))

In [6]:
# Class distribution of the (still string-valued) diagnosis labels.
df['label'].value_counts()

No anemia                               16000
Anemia of chronic disease                8803
Iron deficiency anemia                   8304
Aplastic anemia                          8158
Unspecified anemia                       8106
Hemolytic anemia                         8075
Vitamin B12/Folate deficiency anemia     8074
Inconclusive diagnosis                   4480
Name: label, dtype: int64

In [7]:
# Encode the string diagnoses as integer class ids using the project-wide mapping.
# `replace` leaves values that are not keys of the mapping untouched, so
# re-running this cell on already-encoded labels is harmless.
class_dict = constants.CLASS_DICT
df['label'] = df['label'].replace(class_dict)

# Fail loudly if any label was not covered by the mapping; otherwise the raw
# string would silently survive into y and the downstream environment.
unmapped = df.loc[~df['label'].isin(list(class_dict.values())), 'label'].unique()
if len(unmapped):
    raise ValueError(f'Labels missing from constants.CLASS_DICT: {list(unmapped)}')

# Select features/target by name rather than by position, so a change in the
# CSV column order cannot leak the target into X or drop a real feature.
X = df.drop(columns='label')
y = df['label']

# Stratified 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

# The helpers used later in this notebook are fed plain NumPy arrays.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 17), (14000, 17), (56000,), (14000,))

In [8]:
# Action space: the diagnosis names first (in class_dict order), followed by
# one action per feature column of the dataset.
action_list = [*class_dict, *(column for column in df.columns if column != 'label')]
action_list

['No anemia',
 'Vitamin B12/Folate deficiency anemia',
 'Unspecified anemia',
 'Anemia of chronic disease',
 'Iron deficiency anemia',
 'Hemolytic anemia',
 'Aplastic anemia',
 'Inconclusive diagnosis',
 'hemoglobin',
 'ferritin',
 'ret_count',
 'segmented_neutrophils',
 'tibc',
 'mcv',
 'serum_iron',
 'rbc',
 'gender',
 'creatinine',
 'cholestrol',
 'copper',
 'ethanol',
 'folate',
 'glucose',
 'hematocrit',
 'tsat']

In [9]:
# Size of the action space: 8 diagnosis actions + 17 feature actions.
len(action_list)

25

In [10]:
# Same preview as after loading, but the label column now holds the integer
# class ids written by the replace(class_dict) step above.
df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,6.863691,341.823457,-1.0,0.681783,284.835163,92.819484,-1.0,2.2184,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,20.591074,-1.0,7
1,8.74595,104.470248,0.461482,0.787183,409.993509,104.531457,77.222298,2.510043,1,0.713849,77.276464,51.527756,7.722555,7.822857,-1.0,26.23785,18.835005,1
2,10.308881,475.936322,-1.0,6.155778,-1.0,104.64724,95.040788,2.955323,1,-1.0,18.285577,100.169515,54.471371,11.239513,-1.0,30.926642,-1.0,1
3,7.525442,-1.0,1.972946,2.172161,-1.0,97.271565,-1.0,2.320958,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22.576325,-1.0,0
4,9.54487,411.496642,0.891182,0.0,236.428214,104.721025,-1.0,2.734371,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.634611,-1.0,2


#### Testing

In [11]:
# Checkpoint of the agent under test.
MODEL_PATH = '../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_2_9500000'

# The loader needs an environment; one built from the training split is used.
training_env = utils.create_env(X_train, y_train)

# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'numpy.random._pickle'". That usually
# points to a NumPy version mismatch between the environment that saved the
# checkpoint and the one loading it — confirm and pin the NumPy version.
dqn_model = utils.load_dqn3(MODEL_PATH, training_env)

# Evaluate on the held-out split; the result is one row per test episode.
test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
test_df.head()

Using stable baselines 3


ModuleNotFoundError: No module named 'numpy.random._pickle'

In [84]:
# Project helper returns the overall success rate together with a DataFrame.
# The rate is on a 0-100 scale (63.39 in the saved output, matching the 0.6339
# accuracy reported by utils.test further down).
# NOTE(review): success_df presumably holds only the successful episodes — confirm.
success_rate, success_df = utils.success_rate(test_df)
success_rate

63.39285714285714

In [80]:
# Average episode length and average return over the evaluation episodes,
# as named by the project helper (get_avg_length_reward).
avg_length, avg_return = utils.get_avg_length_reward(test_df)
avg_length, avg_return

(4.507571428571429, 0.19185714285714286)

In [81]:
# Classification metrics of the final predictions against the true classes.
# The helper's three return values are unpacked as accuracy, F1 and ROC-AUC.
# NOTE(review): the averaging scheme used for F1 / ROC-AUC is not visible here.
acc, f1, roc_auc = utils.test(test_df['y_actual'], test_df['y_pred'])
acc, f1, roc_auc

(0.6339285714285714, 0.5942366223722848, 0.7910419800438859)

In [82]:
# Which class ids the agent ever predicts (all 8 appear in the saved output).
test_df['y_pred'].unique()

array([0., 4., 1., 3., 7., 5., 6., 2.])

In [67]:
# Episodes that ended with prediction 4; their trajectories end in the
# 'Iron deficiency anemia' action.
test_df.loc[test_df['y_pred'] == 4]

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
2,7.0,2.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, gender, rbc, ferritin,...",4.0,4.0
12,5.0,12.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, gender, Iron deficienc...",4.0,4.0
20,6.0,20.0,0.0,-1.0,0.0,"[hemoglobin, mcv, tibc, gender, ferritin, Iron...",0.0,4.0
23,6.0,23.0,0.0,-1.0,0.0,"[hemoglobin, mcv, tibc, gender, glucose, Iron ...",3.0,4.0
47,4.0,47.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
...,...,...,...,...,...,...,...,...
13961,4.0,13961.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
13975,6.0,13975.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, gender, rbc, Iron defi...",4.0,4.0
13976,7.0,13976.0,1.0,1.0,0.0,"[hemoglobin, gender, mcv, tibc, rbc, ferritin,...",4.0,4.0
13982,4.0,13982.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0


#### Saving files

In [None]:
# Disabled on purpose: uncomment to persist the evaluation results.
# NOTE(review): these file names say 9000000, while the checkpoint loaded in the
# Testing section ends in 9500000 — update the names before saving.
# test_df.to_csv(f'../../test_dfs/many_features/0.1/correlated/test_df3_9000000.csv', index=False)
# success_df.to_csv(f'../../test_dfs/many_features/0.1/correlated/success_df3_9000000.csv', index=False)

#### Confusion matrix and classification report

In [None]:
# NOTE(review): this replaces the in-memory test_df produced by the evaluation
# above with results saved from a different checkpoint (file name ends in
# 6500000, and the path has no 'correlated' folder). The report and confusion
# matrix below therefore describe that saved run, not the model loaded earlier.
SAVED_TEST_DF = '../../test_dfs/many_features/0.1/test_df3_6500000.csv'
test_df = pd.read_csv(SAVED_TEST_DF)
test_df.head()

In [None]:
# Per-class report for the test_df currently in memory (true vs. predicted class ids).
utils.plot_classification_report(test_df['y_actual'], test_df['y_pred'])

In [None]:
# Disabled local draft of the confusion-matrix plot; the notebook calls
# utils.plot_confusion_matrix instead.
# NOTE(review): the index/columns below are hard-coded to 7 classes (0-6),
# while the predictions in this notebook cover 8 class ids (0-7); fix that
# before re-enabling, or delete this cell.
# def plot_confusion_matrix(y_actual, y_pred, save=False, filename=False):
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(y_actual, y_pred)
#     cm_df = pd.DataFrame(cm, index = [0, 1, 2, 3, 4, 5, 6], columns = [0, 1, 2, 3, 4, 5, 6], dtype='object')
#     #cm_df = pd.DataFrame(cm, index = constants.CLASS_DICT.keys(), columns = constants.CLASS_DICT.keys())
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm_df, annot=True)
#     plt.title('Confusion Matrix')
#     plt.ylabel('Actual Anemia')
#     plt.xlabel('Predicted Anemia')
#     plt.tight_layout()
#     if save:
#         plt.savefig(filename)
#     plt.show()
#     plt.close()

In [None]:
# Confusion matrix (true vs. predicted class ids) for the test_df currently in memory.
utils.plot_confusion_matrix(test_df['y_actual'], test_df['y_pred'])