In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Fix every random number generator this notebook touches so that an
# evaluation run can be repeated.
SEED = 42
# Exported mainly for child processes: hash randomisation of the interpreter
# that is already running is decided at start-up, not here.
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)
# Ask PyTorch to use deterministic implementations and to raise an error
# when an operation only has a nondeterministic one.
torch.use_deterministic_algorithms(True)

#### The Data

In [14]:
# df = pd.read_csv('../../data/more_features/more_feats_correlated_0.1.csv')
# df = df.fillna(-1)
# df.head()

In [15]:
# Read the held-out test set from `test_set_constant.csv` and preview the
# first five rows.
test_df = pd.read_csv(
    filepath_or_buffer='../../data/more_features/train_sets/test_set_constant.csv'
)
test_df.head(n=5)

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.391136,-1.0,1.504298,5.058881,467.197112,96.252926,217.860499,3.550376,1,-1.0,36.70381,122.310168,49.897027,20.353251,-1.0,34.173407,46.631388,6
1,10.355048,272.506966,-1.0,0.0,128.706486,101.912313,-1.0,3.048223,1,-1.0,-1.0,-1.0,29.327349,-1.0,-1.0,31.065144,-1.0,2
2,11.159385,-1.0,-1.0,0.0,-1.0,103.395616,77.212369,3.23787,0,-1.0,8.690122,65.554731,19.167967,7.239049,-1.0,33.478155,-1.0,2
3,8.179735,140.876632,5.484515,-1.0,303.740826,99.877458,155.573175,2.456931,0,-1.0,41.123526,119.43384,39.384848,6.394235,-1.0,24.539204,51.219053,5
4,9.916825,-1.0,3.90981,-1.0,-1.0,90.543986,189.347916,3.285748,0,1.501433,34.905667,59.177001,58.538524,22.843594,139.245204,29.750475,-1.0,5


In [16]:
# Split the test frame into a feature matrix and a label vector.
#
# The split is positional (last column = target), so first check that
# `test_df` is still the raw test set. A later cell rebinds `test_df` to the
# evaluation results; re-running this cell after that would otherwise silently
# take `y_pred` as the label and the episode bookkeeping columns as features.
if test_df.columns[-1] != 'label':
    raise ValueError(
        "test_df's last column is %r, expected 'label'; re-run the cell that "
        "loads the test set before running this one." % (test_df.columns[-1],)
    )

X_test = np.array(test_df.iloc[:, :-1])  # every column except the last one
y_test = np.array(test_df.iloc[:, -1])   # the `label` column
X_test.shape, y_test.shape

((14000, 17), (14000,))

In [17]:
# utils.get_dt_performance(df)

In [18]:
# df.label.value_counts()

In [19]:
# class_dict = constants.CLASS_DICT
# df['label'] = df['label'].replace(class_dict)
# X = df.iloc[:, 0:-1]
# y = df.iloc[:, -1]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
# X_train, y_train = np.array(X_train), np.array(y_train)
# X_test, y_test = np.array(X_test), np.array(y_test)
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [20]:
# y_test[:5]

In [21]:
# action_list = list(class_dict) + [col  for col in df.columns if col!='label']
# action_list

In [22]:
# len(action_list)

#### Testing

In [24]:
# training_env = utils.create_env(X_train, y_train)
# dqn_model = utils.load_dqn3('../../models/many_features/0.1/with_correlated_fts/constant_test_set/dqn_missing_1_6000000', 
#                             training_env)
# test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
# test_df.head()

In [41]:
# Load the saved DQN checkpoint and run it over the whole test set.
dqn_model = utils.load_dqn3(
    '../../models/many_features/0.1/with_correlated_fts/constant_test_set/'
    'dqn_missing_1_7500000'
)
# NOTE: from here on `test_df` holds the evaluation results returned by
# utils.evaluate_dqn, no longer the raw test set read from CSV.
test_df = utils.evaluate_dqn(
    dqn_model,
    X_test,
    y_test,
)

Using stable baselines 3
Count: 2800
Count: 5600
Count: 8400
Count: 11200
Count: 14000
Testing done.....


In [42]:
# Summarise the evaluation results. utils.success_rate returns two values: an
# overall success figure and `success_df`, which is written to CSV further
# down in the notebook.
# NOTE(review): the recorded value (64.11) reads like a percentage — confirm in utils.
success_rate, success_df = utils.success_rate(test_df)
success_rate

64.11428571428571

In [43]:
# Two more summary figures computed from the evaluation results. Going by the
# helper's name these are the mean episode length and the mean reward per
# episode — TODO confirm against utils.get_avg_length_reward.
avg_length, avg_return = utils.get_avg_length_reward(test_df)
avg_length, avg_return

(3.8257142857142856, 0.25057142857142856)

In [44]:
# Score the predictions against the true labels; utils.test returns three
# values that are unpacked as accuracy, F1 and ROC AUC.
acc, f1, roc_auc = utils.test(test_df.y_actual, test_df.y_pred)
(acc, f1, roc_auc)

(0.6411428571428571, 0.5745141887304688, 0.7791104896808511)

In [45]:
# Distinct labels the agent actually predicted, in order of first appearance.
test_df['y_pred'].unique()

array([6., 1., 2., 0., 4., 3., 7.])

In [31]:
# Inspect the episodes where the agent predicted class 4 (their trajectories
# end in "Iron deficiency anemia" in the recorded output).
test_df.loc[test_df['y_pred'] == 4]

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
8,4.0,8.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
16,5.0,16.0,1.0,1.0,0.0,"[hemoglobin, gender, mcv, tibc, Iron deficienc...",4.0,4.0
23,4.0,23.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
31,4.0,31.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
41,4.0,41.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
...,...,...,...,...,...,...,...,...
13968,4.0,13968.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
13989,4.0,13989.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
13995,4.0,13995.0,1.0,1.0,0.0,"[hemoglobin, mcv, tibc, Iron deficiency anemia]",4.0,4.0
13996,5.0,13996.0,1.0,1.0,0.0,"[hemoglobin, gender, mcv, tibc, Iron deficienc...",4.0,4.0


#### Saving files

In [27]:
# for steps in [int(9.5e6), int(10e6), int(10.5e6), int(11e6), int(11.5e6), int(12e6), int(12.5e6), int(13e6), int(13.5e6), int(15e6), int(16e6), int(17e6), int(18e6), int(19e6), int(20e6), int(21e6), int(22e6), int(23e6), int(24e6)]:
#     training_env = utils.create_env(X_train, y_train)
#     dqn_model = utils.load_dqn3(f'../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_6_{steps}', 
#                                 training_env)
#     test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
#     success_rate, success_df = utils.success_rate(test_df)
#     avg_length, avg_return = utils.get_avg_length_reward(test_df)
#     print(f'steps: {steps}, success_rate: {success_rate}, avg_length: {avg_length}, avg_return: {avg_return}')

In [87]:
# Persist the evaluation results (`test_df`) and `success_df` as CSV files.
#
# NOTE(review): the file names say "noisy2_10000000" while the checkpoint
# evaluated earlier in this notebook is "dqn_missing_1_7500000" — confirm the
# names describe the run being saved before executing this cell.
results_dir = '../../test_dfs/many_features/0.1/correlated/constant_test_set'
# to_csv does not create missing parent directories, so make sure the target
# directory exists instead of failing after a long evaluation run.
os.makedirs(results_dir, exist_ok=True)
test_df.to_csv(f'{results_dir}/test_df_noisy2_10000000.csv', index=False)
success_df.to_csv(f'{results_dir}/success_df_noisy2_10000000.csv', index=False)

#### Confusion matrix and classification report

In [None]:
# test_df = pd.read_csv('../../test_dfs/many_features/0.1/correlated/test_df3_missing3_12500000.csv')
# test_df.head()

In [None]:
# Plot the classification report for true vs. predicted labels.
utils.plot_classification_report(
    test_df.y_actual,
    test_df.y_pred,
)

In [None]:
# def plot_confusion_matrix(y_actual, y_pred, save=False, filename=False):
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(y_actual, y_pred)
#     cm_df = pd.DataFrame(cm, index = [0, 1, 2, 3, 4, 5, 6], columns = [0, 1, 2, 3, 4, 5, 6], dtype='object')
#     #cm_df = pd.DataFrame(cm, index = constants.CLASS_DICT.keys(), columns = constants.CLASS_DICT.keys())
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm_df, annot=True)
#     plt.title('Confusion Matrix')
#     plt.ylabel('Actual Anemia')
#     plt.xlabel('Predicted Anemia')
#     plt.tight_layout()
#     if save:
#         plt.savefig(filename)
#     plt.show()
#     plt.close()

In [None]:
# Plot the confusion matrix of true labels against predicted labels.
utils.plot_confusion_matrix(
    test_df.y_actual,
    test_df.y_pred,
)