In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Seed every random number generator used in this notebook with the same value
# so that repeated runs draw the same random numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
torch.manual_seed(SEED)
# Request deterministic PyTorch kernels.
# NOTE(review): in this mode PyTorch is expected to raise on operations that have
# no deterministic implementation — confirm against the installed torch version.
torch.use_deterministic_algorithms(True)

#### The Data

In [4]:
# df = pd.read_csv('../../data/more_features/more_feats_correlated_0.1.csv')
# df = df.fillna(-1)
# df.head()

In [5]:
# Read the test-set CSV into a DataFrame and preview its first rows.
# The preview shows the lab-measurement feature columns followed by a final
# `label` column; -1.0 appears in many feature cells (presumably the marker for
# a missing measurement — verify against the data-preparation code).
test_df = pd.read_csv('../../final/data/test_set_constant.csv')
test_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.783006,112.011007,-1.0,3.308648,348.014906,86.577465,-1.0,4.082935,0,1.617594,131.868365,-1.0,-1.0,-1.0,-1.0,35.349018,-1.0,7
1,12.609666,104.475099,5.445972,-1.0,-1.0,75.082198,-1.0,5.038345,1,0.334722,97.568987,83.16188,-1.0,7.14727,-1.0,37.828999,-1.0,3
2,7.940172,4.9254,3.25513,5.766738,209.031102,75.254082,-1.0,3.165345,1,1.649478,2.08297,-1.0,-1.0,-1.0,-1.0,23.820517,-1.0,4
3,7.78584,103.757096,2.874384,3.839597,-1.0,81.873581,39.63801,2.852876,1,0.637843,127.412065,38.53476,51.437387,28.150784,73.296289,23.357519,-1.0,5
4,12.359715,51.274141,0.178269,1.233844,474.589534,76.469181,49.839768,4.8489,1,0.621034,137.433107,55.314614,47.302738,3.808612,60.076211,37.079144,10.501658,4


In [6]:
# Split the test frame into a feature matrix and a label vector (NumPy arrays).
#
# The label column is selected by name rather than by position. `test_df` is
# rebound later in this notebook to the evaluation results, whose last column is
# `y_pred`; a positional slice would then silently use predictions as labels if
# this cell were re-run. Selecting 'label' raises a KeyError in that situation.
# On the raw test set (where `label` is the last column) the result is unchanged.
X_test = np.array(test_df.drop(columns='label'))
y_test = np.array(test_df['label'])

# Display the shapes as a quick sanity check: (n_samples, n_features), (n_samples,)
X_test.shape, y_test.shape

((14000, 17), (14000,))

In [17]:
# utils.get_dt_performance(df)

In [18]:
# df.label.value_counts()

In [19]:
# class_dict = constants.CLASS_DICT
# df['label'] = df['label'].replace(class_dict)
# X = df.iloc[:, 0:-1]
# y = df.iloc[:, -1]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
# X_train, y_train = np.array(X_train), np.array(y_train)
# X_test, y_test = np.array(X_test), np.array(y_test)
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [20]:
# y_test[:5]

In [21]:
# action_list = list(class_dict) + [col  for col in df.columns if col!='label']
# action_list

In [22]:
# len(action_list)

#### Testing

In [24]:
# training_env = utils.create_env(X_train, y_train)
# dqn_model = utils.load_dqn3('../../models/many_features/0.1/with_correlated_fts/constant_test_set/dqn_missing_1_6000000', 
#                             training_env)
# test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
# test_df.head()

In [68]:
# Load the saved DQN checkpoint and evaluate it on the test arrays.
# NOTE(review): this rebinds `test_df`. Until this cell it held the raw test set
# read from CSV; from here on it holds the evaluation results (the later preview
# shows columns such as episode_length, trajectory, y_actual, y_pred). Re-running
# the earlier X_test/y_test cell after this one would slice the wrong frame.
dqn_model = utils.load_dqn3('../../final/models/dqn_basic_5500000')
test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)

Using stable baselines 3
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....


In [69]:
# Summarise how often the agent succeeded on the evaluation results.
# utils.success_rate returns a pair: a scalar rate and a DataFrame (`success_df`,
# saved to CSV further down).
# NOTE(review): the displayed value (96.2...) looks like a percentage — confirm in utils.
success_rate, success_df = utils.success_rate(test_df)
success_rate

96.22857142857143

In [70]:
# Average episode length and average return over the evaluation results.
# NOTE(review): presumably means of the `episode_length` and `reward` columns —
# verify against utils.get_avg_length_reward.
avg_length, avg_return = utils.get_avg_length_reward(test_df)
avg_length, avg_return

(4.730071428571429, 0.7711428571428571)

In [71]:
# Classification metrics comparing the true labels with the agent's predictions.
# The three returned values are bound to accuracy, F1 and ROC-AUC.
# NOTE(review): the multi-class averaging scheme used for F1 / ROC-AUC is decided
# inside utils.test and is not visible here — confirm before quoting these numbers.
acc, f1, roc_auc = utils.test(test_df['y_actual'], test_df['y_pred'])
acc, f1, roc_auc

(0.9622857142857143, 0.9578272224615978, 0.9790564699675143)

In [72]:
# List the distinct classes the agent actually predicted, in order of first appearance.
test_df['y_pred'].unique()

array([7., 4., 5., 2., 6., 1., 3., 0.])

In [73]:
# Inspect every evaluation episode whose predicted class is 4, to eyeball the
# trajectories the agent follows for that diagnosis.
test_df.loc[test_df['y_pred'] == 4]

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
2,5.0,2.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, Iron defic...",4.0,4.0
4,6.0,4.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, tibc, Iron...",4.0,4.0
11,6.0,11.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, tibc, Iron...",4.0,4.0
28,5.0,28.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, Iron defic...",4.0,4.0
31,5.0,31.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, Iron defic...",4.0,4.0
...,...,...,...,...,...,...,...,...
13976,6.0,13976.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, tibc, Iron...",4.0,4.0
13987,5.0,13987.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, Iron defic...",4.0,4.0
13989,6.0,13989.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, tibc, Iron...",4.0,4.0
13997,6.0,13997.0,1.0,1.0,0.0,"[hematocrit, gender, mcv, ferritin, tibc, Iron...",4.0,4.0


#### Saving files

In [27]:
# for steps in [int(9.5e6), int(10e6), int(10.5e6), int(11e6), int(11.5e6), int(12e6), int(12.5e6), int(13e6), int(13.5e6), int(15e6), int(16e6), int(17e6), int(18e6), int(19e6), int(20e6), int(21e6), int(22e6), int(23e6), int(24e6)]:
#     training_env = utils.create_env(X_train, y_train)
#     dqn_model = utils.load_dqn3(f'../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_6_{steps}', 
#                                 training_env)
#     test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
#     success_rate, success_df = utils.success_rate(test_df)
#     avg_length, avg_return = utils.get_avg_length_reward(test_df)
#     print(f'steps: {steps}, success_rate: {success_rate}, avg_length: {avg_length}, avg_return: {avg_return}')

In [74]:
# Persist the evaluation results (`test_df`) and the `success_df` frame returned
# by utils.success_rate for this checkpoint.
#
# The output directory and run tag are spelled out once so both files always
# share them; the original repeated the full literal path in two f-strings that
# had no placeholders. The resulting file names are unchanged.
# NOTE(review): RUN_TAG must match the checkpoint loaded in the evaluation cell
# ('dqn_basic_5500000'); the two are kept in sync by hand.
RESULTS_DIR = '../../final/test_dfs'
RUN_TAG = 'basic_5500000'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target exists instead of failing after a long evaluation run.
os.makedirs(RESULTS_DIR, exist_ok=True)

test_df.to_csv(f'{RESULTS_DIR}/dqn_test_df_{RUN_TAG}.csv', index=False)
success_df.to_csv(f'{RESULTS_DIR}/dqn_success_df_{RUN_TAG}.csv', index=False)

#### Confusion matrix and classification report

In [None]:
# test_df = pd.read_csv('../../test_dfs/many_features/0.1/correlated/test_df3_missing3_12500000.csv')
# test_df.head()

In [None]:
# Plot the classification report for true vs. predicted labels.
# Expects `test_df` to be the evaluation frame (with `y_actual` / `y_pred`
# columns), not the raw test set loaded at the top of the notebook.
utils.plot_classification_report(test_df['y_actual'], test_df['y_pred'])

In [None]:
# def plot_confusion_matrix(y_actual, y_pred, save=False, filename=False):
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(y_actual, y_pred)
#     cm_df = pd.DataFrame(cm, index = [0, 1, 2, 3, 4, 5, 6], columns = [0, 1, 2, 3, 4, 5, 6], dtype='object')
#     #cm_df = pd.DataFrame(cm, index = constants.CLASS_DICT.keys(), columns = constants.CLASS_DICT.keys())
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm_df, annot=True)
#     plt.title('Confusion Matrix')
#     plt.ylabel('Actual Anemia')
#     plt.xlabel('Predicted Anemia')
#     plt.tight_layout()
#     if save:
#         plt.savefig(filename)
#     plt.show()
#     plt.close()

In [None]:
# Plot the confusion matrix of true vs. predicted labels.
# Expects `test_df` to be the evaluation frame (with `y_actual` / `y_pred`
# columns), not the raw test set loaded at the top of the notebook.
utils.plot_confusion_matrix(test_df['y_actual'], test_df['y_pred'])