In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# NOTE(review): Python fixes hash randomization at interpreter start-up, so this
# assignment is only seen by processes spawned from here, not by the running kernel.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Ask PyTorch to use deterministic algorithm implementations.
torch.use_deterministic_algorithms(True)

#### The Data

In [6]:
# Load the full dataset and use -1 as the sentinel for every missing value.
df = pd.read_csv(
    '../../data/more_features/more_feats_correlated_0.1.csv'
).fillna(-1)
df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.183192,187.573466,4.951674,1.661383,316.593436,95.006424,181.242992,3.531295,1,-1.0,28.040619,96.721542,49.530542,23.186628,-1.0,33.549575,57.247868,Hemolytic anemia
1,14.387445,-1.0,-1.0,-1.0,-1.0,-1.0,125.249617,-1.0,1,-1.0,98.357508,112.758764,62.464566,7.16892,-1.0,43.162335,-1.0,No anemia
2,12.749357,5.012158,3.5028,6.179371,498.418768,76.759285,159.834784,4.982859,1,1.022939,56.850479,75.739552,72.072041,20.600875,44.872138,38.248071,32.068372,Iron deficiency anemia
3,11.50887,197.180945,1.200125,0.0,457.033309,102.900301,131.177927,3.355346,1,-1.0,111.220307,66.999185,18.353272,14.132423,-1.0,34.526609,28.70205,Unspecified anemia
4,9.456656,427.952052,-1.0,0.660252,-1.0,104.543774,-1.0,2.713693,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.369968,-1.0,Vitamin B12/Folate deficiency anemia


In [24]:
# Pre-built constant test set; its `label` column already holds integer class ids.
# NOTE: the name `test_df` is rebound further down to the DQN evaluation results.
test_df = pd.read_csv(
    '../../data/more_features/train_sets/test_set_constant.csv',
)
test_df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.391136,-1.0,1.504298,5.058881,467.197112,96.252926,217.860499,3.550376,1,-1.0,36.70381,122.310168,49.897027,20.353251,-1.0,34.173407,46.631388,6
1,10.355048,272.506966,-1.0,0.0,128.706486,101.912313,-1.0,3.048223,1,-1.0,-1.0,-1.0,29.327349,-1.0,-1.0,31.065144,-1.0,2
2,11.159385,-1.0,-1.0,0.0,-1.0,103.395616,77.212369,3.23787,0,-1.0,8.690122,65.554731,19.167967,7.239049,-1.0,33.478155,-1.0,2
3,8.179735,140.876632,5.484515,-1.0,303.740826,99.877458,155.573175,2.456931,0,-1.0,41.123526,119.43384,39.384848,6.394235,-1.0,24.539204,51.219053,5
4,9.916825,-1.0,3.90981,-1.0,-1.0,90.543986,189.347916,3.285748,0,1.501433,34.905667,59.177001,58.538524,22.843594,139.245204,29.750475,-1.0,5


In [7]:
# Baseline computed on `df` by the project helper. The displayed result is a 4-tuple:
# three floats and a datetime.timedelta — presumably accuracy / F1 / ROC-AUC of a
# decision tree plus elapsed time; TODO confirm against utils.get_dt_performance.
utils.get_dt_performance(df)

(0.9996428571428572,
 0.9996136838530153,
 0.9997821980021458,
 datetime.timedelta(microseconds=2997))

In [8]:
# Class balance: number of rows per diagnosis label.
df['label'].value_counts()

No anemia                               10000
Anemia of chronic disease                9756
Iron deficiency anemia                   9267
Unspecified anemia                       9033
Aplastic anemia                          9020
Vitamin B12/Folate deficiency anemia     9000
Hemolytic anemia                         8976
Inconclusive diagnosis                   4948
Name: label, dtype: int64

In [9]:
# Encode the diagnosis names in `label` through the project-wide class mapping
# (this rewrites the `label` column of `df` in place).
class_dict = constants.CLASS_DICT
df['label'] = df['label'].replace(class_dict)

# The label is the last column; everything before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Stratified 80/20 split, with each part converted to a NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=SEED
    )
)
tuple(arr.shape for arr in (X_train, X_test, y_train, y_test))

((56000, 17), (14000, 17), (56000,), (14000,))

In [26]:
# Peek at the first five encoded test labels (integer class ids after the split above).
y_test[:5]

array([6, 2, 2, 5, 5], dtype=int64)

In [11]:
# Diagnosis names first (in class_dict key order), then one entry per feature column.
action_list = [*class_dict, *(col for col in df.columns if col != 'label')]
action_list

['No anemia',
 'Vitamin B12/Folate deficiency anemia',
 'Unspecified anemia',
 'Anemia of chronic disease',
 'Iron deficiency anemia',
 'Hemolytic anemia',
 'Aplastic anemia',
 'Inconclusive diagnosis',
 'hemoglobin',
 'ferritin',
 'ret_count',
 'segmented_neutrophils',
 'tibc',
 'mcv',
 'serum_iron',
 'rbc',
 'gender',
 'creatinine',
 'cholestrol',
 'copper',
 'ethanol',
 'folate',
 'glucose',
 'hematocrit',
 'tsat']

In [12]:
# Size of the action list: the displayed 25 = 8 diagnosis classes + 17 feature columns.
len(action_list)

25

#### Testing

In [123]:
# Build an environment from the training split; it is handed to the model loader.
training_env = utils.create_env(X_train, y_train)

# Load the noisy_4 checkpoint saved at 19M steps.
dqn_model = utils.load_dqn3(
    '../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_4_19000000',
    training_env,
)

# Evaluate on the held-out split; `test_df` now holds one row per test episode
# (this rebinds the name used earlier for the constant test-set CSV).
test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
test_df.head(n=5)

Using stable baselines 3
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....


Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,1.0,0.0,"[hemoglobin, rbc, ret_count, Aplastic anemia]",6.0,6.0
1,5.0,1.0,1.0,1.0,0.0,"[hemoglobin, rbc, segmented_neutrophils, gende...",2.0,2.0
2,4.0,2.0,1.0,1.0,0.0,"[hemoglobin, rbc, segmented_neutrophils, Unspe...",2.0,2.0
3,4.0,3.0,0.0,-1.0,1.0,"[hemoglobin, rbc, segmented_neutrophils, Incon...",5.0,7.0
4,4.0,4.0,1.0,1.0,0.0,"[hemoglobin, rbc, ret_count, Hemolytic anemia]",5.0,5.0


In [132]:
# Load the noisy_6 checkpoint saved at 17M steps. Unlike the other load_dqn3 calls in
# this notebook, the environment argument is omitted here.
# NOTE(review): confirm that loading without an env is intended for evaluation.
dqn_model = utils.load_dqn3('../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_6_17000000')
# `test_df` is rebound to this checkpoint's evaluation results.
test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)

Using stable baselines 3
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....


In [133]:
# Success rate for whichever evaluation `test_df` currently holds; the displayed value
# is on a 0-100 scale. `success_df` is returned alongside it — presumably the
# successful episodes only; TODO confirm against utils.success_rate.
success_rate, success_df = utils.success_rate(test_df)
success_rate

94.37857142857143

In [129]:
# Sweep over the saved noisy_6 checkpoints (identified by training-step count) and
# print one summary line per checkpoint. After the loop, `dqn_model`, `test_df`,
# `success_rate`, `success_df`, `avg_length` and `avg_return` refer to the LAST
# checkpoint in the sequence.
for steps in (
    9_500_000, 10_000_000, 10_500_000, 11_000_000, 11_500_000,
    12_000_000, 12_500_000, 13_000_000, 13_500_000, 15_000_000,
    16_000_000, 17_000_000, 18_000_000, 19_000_000, 20_000_000,
    21_000_000, 22_000_000, 23_000_000, 24_000_000,
):
    # A fresh environment is created for every checkpoint.
    training_env = utils.create_env(X_train, y_train)
    dqn_model = utils.load_dqn3(
        f'../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_6_{steps}',
        training_env,
    )
    test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
    success_rate, success_df = utils.success_rate(test_df)
    avg_length, avg_return = utils.get_avg_length_reward(test_df)
    print(f'steps: {steps}, success_rate: {success_rate}, avg_length: {avg_length}, avg_return: {avg_return}')

Using stable baselines 3
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....
steps: 9500000, success_rate: 62.3, avg_length: 4.4875, avg_return: 0.18142857142857144
Using stable baselines 3
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....
steps: 10000000, success_rate: 55.67857142857143, avg_length: 4.924214285714286, avg_return: 0.05242857142857143
Using stable baselines 3
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....
steps: 10500000, success_rate: 80.20714285714286, avg_length: 4.143285714285715, avg_return: 0.512
Using stable baselines 3
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Cou

In [124]:
# Same success-rate computation as the earlier cell, applied to whatever `test_df`
# holds when this cell runs. NOTE(review): execution count 124 directly follows the
# noisy_4 evaluation (123), so the displayed value presumably belongs to that model.
success_rate, success_df = utils.success_rate(test_df)
success_rate

96.36428571428571

In [125]:
# Two summary numbers for the current `test_df` — going by the helper and variable
# names, the average episode length and average reward; TODO confirm against utils.
avg_length, avg_return = utils.get_avg_length_reward(test_df)
avg_length, avg_return

(4.425357142857143, 0.787)

In [126]:
# Classification metrics from true vs. predicted class ids, unpacked as accuracy,
# F1 and ROC-AUC (averaging scheme is defined inside utils.test — not visible here).
acc, f1, roc_auc = utils.test(test_df['y_actual'], test_df['y_pred'])
acc, f1, roc_auc

(0.9636428571428571, 0.9579371653653614, 0.9803236025871085)

In [127]:
# Which class ids the agent actually predicted at least once.
test_df['y_pred'].unique()

array([6., 2., 7., 5., 0., 4., 3., 1.])

In [112]:
# Episodes where the predicted class id is 4.
test_df.loc[test_df['y_pred'] == 4]

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
8,4.0,8.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
16,5.0,16.0,1.0,1.0,0.0,"[hemoglobin, gender, rbc, tibc, Iron deficienc...",4.0,4.0
23,4.0,23.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
31,4.0,31.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
41,4.0,41.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
...,...,...,...,...,...,...,...,...
13968,4.0,13968.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
13989,4.0,13989.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
13995,4.0,13995.0,1.0,1.0,0.0,"[hemoglobin, rbc, tibc, Iron deficiency anemia]",4.0,4.0
13996,5.0,13996.0,1.0,1.0,0.0,"[hemoglobin, gender, rbc, tibc, Iron deficienc...",4.0,4.0


#### Saving files

In [87]:
# Persist the evaluation frame and the success frame, without the index column.
# NOTE(review): the `noisy2_10000000` tag in both file names is hard-coded and does not
# match any checkpoint name loaded in this notebook — verify it before running.
test_df.to_csv(
    '../../test_dfs/many_features/0.1/correlated/constant_test_set/test_df_noisy2_10000000.csv',
    index=False,
)
success_df.to_csv(
    '../../test_dfs/many_features/0.1/correlated/constant_test_set/success_df_noisy2_10000000.csv',
    index=False,
)

#### Confusion matrix and classification report

In [None]:
# test_df = pd.read_csv('../../test_dfs/many_features/0.1/correlated/test_df3_missing3_12500000.csv')
# test_df.head()

In [None]:
# Classification report for the current `test_df`, rendered by the project helper.
utils.plot_classification_report(test_df['y_actual'], test_df['y_pred'])

In [None]:
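# NOTE(review): disabled local draft; the next cell calls utils.plot_confusion_matrix instead.
# The hard-coded 0-6 index/columns below give 7 labels, but the label set has 8 classes
# (ids 0-7), so building cm_df would fail whenever all 8 classes appear in the inputs.
# def plot_confusion_matrix(y_actual, y_pred, save=False, filename=False):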
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(y_actual, y_pred)
#     cm_df = pd.DataFrame(cm, index = [0, 1, 2, 3, 4, 5, 6], columns = [0, 1, 2, 3, 4, 5, 6], dtype='object')
#     #cm_df = pd.DataFrame(cm, index = constants.CLASS_DICT.keys(), columns = constants.CLASS_DICT.keys())
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm_df, annot=True)
#     plt.title('Confusion Matrix')
#     plt.ylabel('Actual Anemia')
#     plt.xlabel('Predicted Anemia')
#     plt.tight_layout()
#     if save:
#         plt.savefig(filename)
#     plt.show()
#     plt.close()

In [None]:
# Confusion matrix of true vs. predicted class ids for the current `test_df`,
# drawn by the project helper.
utils.plot_confusion_matrix(test_df['y_actual'], test_df['y_pred'])