In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import random
import os
import torch
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Seed the Python, NumPy and PyTorch generators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it from inside a running kernel likely only reaches child
# processes -- confirm whether it needs to be exported before launch.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Ask PyTorch to use deterministic algorithm implementations.
torch.use_deterministic_algorithms(True)

#### The Data

In [14]:
# Load the training split and replace every NaN with the sentinel value -1.
# An earlier experiment (disabled, kept for reference) read
# '../../data/more_features/more_feats_correlated_testing_noanem_noisy_6.csv'
# into `df` and filled it the same way.
train_df = (
    pd.read_csv('../../data/more_features/train_sets/train_set_missing_2.csv')
    .fillna(-1)
)
train_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,7.559457,227.891116,5.805018,-1.0,147.846935,103.948918,106.431924,2.181684,1,-1.0,2.986261,114.459461,13.468107,15.321222,-1.0,22.67837,71.987914,1
1,12.151687,49.933676,-1.0,0.731308,206.283448,88.20535,66.458334,4.132976,0,-1.0,72.488033,85.729484,20.88191,11.104061,-1.0,36.455062,32.216998,0
2,7.684368,244.551542,-1.0,0.0,-1.0,103.17902,-1.0,2.234282,0,0.683808,-1.0,-1.0,77.068526,-1.0,112.25213,23.053104,-1.0,2
3,6.783503,-1.0,1.180064,0.0,-1.0,-1.0,181.424498,1.97035,1,-1.0,-1.0,95.770863,-1.0,1.605937,-1.0,20.350509,98.314019,2
4,6.162946,25.460454,2.241183,0.0,-1.0,104.151962,246.116674,-1.0,1,0.842525,-1.0,-1.0,-1.0,18.002089,-1.0,-1.0,-1.0,2


In [15]:
# Load the test split (file name: test_set_constant.csv) and preview it.
# NOTE(review): unlike the training frame, no fillna(-1) is applied here. The
# preview already shows -1.0 values, so the file is presumably stored
# pre-filled -- confirm that no NaNs remain.
test_df = pd.read_csv('../../data/more_features/train_sets/test_set_constant.csv')
test_df.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat,label
0,11.391136,-1.0,1.504298,5.058881,467.197112,96.252926,217.860499,3.550376,1,-1.0,36.70381,122.310168,49.897027,20.353251,-1.0,34.173407,46.631388,6
1,10.355048,272.506966,-1.0,0.0,128.706486,101.912313,-1.0,3.048223,1,-1.0,-1.0,-1.0,29.327349,-1.0,-1.0,31.065144,-1.0,2
2,11.159385,-1.0,-1.0,0.0,-1.0,103.395616,77.212369,3.23787,0,-1.0,8.690122,65.554731,19.167967,7.239049,-1.0,33.478155,-1.0,2
3,8.179735,140.876632,5.484515,-1.0,303.740826,99.877458,155.573175,2.456931,0,-1.0,41.123526,119.43384,39.384848,6.394235,-1.0,24.539204,51.219053,5
4,9.916825,-1.0,3.90981,-1.0,-1.0,90.543986,189.347916,3.285748,0,1.501433,34.905667,59.177001,58.538524,22.843594,139.245204,29.750475,-1.0,5


In [16]:
# Split each frame into a feature matrix and a label vector.
# The label is selected by name (the same 'label' column that is excluded when
# the action list is built) instead of by position, so the split does not
# silently break if the column order of the CSV ever changes.
assert list(train_df.columns) == list(test_df.columns), (
    'train and test frames must have identical columns in the same order'
)

X_train = np.array(train_df.drop(columns='label'))
y_train = np.array(train_df['label'])

X_test = np.array(test_df.drop(columns='label'))
y_test = np.array(test_df['label'])

# Quick sanity check of the resulting array shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 17), (14000, 17), (56000,), (14000,))

In [17]:
# Names of everything in `action_list`: first the class names from
# constants.CLASS_DICT, then every column of the training frame except the
# label. Presumably this mirrors the agent's action space (diagnose vs.
# request a feature) -- confirm against the environment definition.
action_list = [
    *constants.CLASS_DICT.keys(),
    *(col for col in train_df.columns if col != 'label'),
]
action_list

['No anemia',
 'Vitamin B12/Folate deficiency anemia',
 'Unspecified anemia',
 'Anemia of chronic disease',
 'Iron deficiency anemia',
 'Hemolytic anemia',
 'Aplastic anemia',
 'Inconclusive diagnosis',
 'hemoglobin',
 'ferritin',
 'ret_count',
 'segmented_neutrophils',
 'tibc',
 'mcv',
 'serum_iron',
 'rbc',
 'gender',
 'creatinine',
 'cholestrol',
 'copper',
 'ethanol',
 'folate',
 'glucose',
 'hematocrit',
 'tsat']

#### Training 

In [18]:
# Train one DQN agent per step budget. `dqn_model` is rebound on every
# iteration, so only the last trained model stays in memory afterwards.
# The final argument looks like the per-budget output path -- presumably where
# utils.stable_dqn3 saves the model; the meaning of the `True` flag is not
# visible from this notebook.
# Disabled alternatives kept for reference:
#   - step budgets [int(7.5e6), int(7.8e6), int(8.3e6)]
#   - timing each run with datetime.now() and printing the duration
for steps in (int(8e6), int(8.5e6), int(9e6)):
    dqn_model = utils.stable_dqn3(
        X_train,
        y_train,
        steps,
        True,
        f'../../models/many_features/0.1/with_correlated_fts/constant_test_set/dqn_missing_2_{steps}',
    )

using stable baselines 3
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.1      |
|    ep_rew_mean      | -0.78    |
|    exploration_rate | 0.65     |
|    success_rate     | 0.11     |
| time/               |          |
|    episodes         | 100000   |
|    fps              | 1044     |
|    time_elapsed     | 281      |
|    total_timesteps  | 294608   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.2      |
|    n_updates        | 61151    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.34     |
|    ep_rew_mean      | -0.96    |
|    exploration_rate | 0.235    |
|    success_rate     | 0.09     |
| time/               |          |
|    episodes         | 200000   |
|    fps              | 827      |
|    t

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.46     |
|    ep_rew_mean      | -0.08    |
|    exploration_rate | 0.05     |
|    success_rate     | 0.51     |
| time/               |          |
|    episodes         | 1600000  |
|    fps              | 474      |
|    time_elapsed     | 11638    |
|    total_timesteps  | 5520285  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.2      |
|    n_updates        | 1367571  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.78     |
|    ep_rew_mean      | 0.06     |
|    exploration_rate | 0.05     |
|    success_rate     | 0.59     |
| time/               |          |
|    episodes         | 1700000  |
|    fps              | 467      |
|    time_elapsed     | 12623    |
|    total_timesteps  | 5897617  |
| train/              |          |
|    learning_rate  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.78     |
|    ep_rew_mean      | -0.32    |
|    exploration_rate | 0.05     |
|    success_rate     | 0.37     |
| time/               |          |
|    episodes         | 900000   |
|    fps              | 635      |
|    time_elapsed     | 4857     |
|    total_timesteps  | 3088730  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.161    |
|    n_updates        | 759682   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.71     |
|    ep_rew_mean      | -0.22    |
|    exploration_rate | 0.05     |
|    success_rate     | 0.43     |
| time/               |          |
|    episodes         | 1000000  |
|    fps              | 629      |
|    time_elapsed     | 5461     |
|    total_timesteps  | 3438695  |
| train/              |          |
|    learning_rate  

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.92     |
|    ep_rew_mean      | 0.08     |
|    exploration_rate | 0.05     |
|    success_rate     | 0.6      |
| time/               |          |
|    episodes         | 2400000  |
|    fps              | 511      |
|    time_elapsed     | 16546    |
|    total_timesteps  | 8459697  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.118    |
|    n_updates        | 2102424  |
----------------------------------
using stable baselines 3
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.24     |
|    ep_rew_mean      | -0.82    |
|    exploration_rate | 0.692    |
|    success_rate     | 0.09     |
| time/               |          |
|    episodes         | 100000   |
|    fps              | 776      |
|    t

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 3.91     |
|    ep_rew_mean      | -0.02    |
|    exploration_rate | 0.05     |
|    success_rate     | 0.5      |
| time/               |          |
|    episodes         | 1500000  |
|    fps              | 591      |
|    time_elapsed     | 8857     |
|    total_timesteps  | 5242210  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0804   |
|    n_updates        | 1298052  |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.11     |
|    ep_rew_mean      | -0.02    |
|    exploration_rate | 0.05     |
|    success_rate     | 0.54     |
| time/               |          |
|    episodes         | 1600000  |
|    fps              | 593      |
|    time_elapsed     | 9423     |
|    total_timesteps  | 5596289  |
| train/              |          |
|    learning_rate  

In [8]:
# class_dict = constants.CLASS_DICT
# df['label'] = df['label'].replace(class_dict)
# X = df.iloc[:, 0:-1]
# y = df.iloc[:, -1]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
# X_train, y_train = np.array(X_train), np.array(y_train)
# X_test, y_test = np.array(X_test), np.array(y_test)
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 17), (14000, 17), (56000,), (14000,))

In [22]:
# %%time
# timesteps = int(2e6)
# dqn_model = utils.stable_dqn3(X_train, y_train, timesteps, True, f'../../models/many_features/stable_dqn3_{timesteps}')
# test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
# test_df.head()

#### Testing

In [None]:
# Evaluate the agent on the test arrays. This rebinds `test_df` from the raw
# test set to the evaluation results; the metric cells that follow read its
# `y_actual` / `y_pred` columns, which the raw test CSV does not have, so this
# cell must run before them on a fresh kernel.
#
# The model evaluated is the one currently bound to `dqn_model` (after the
# training loop, that is the last model trained). To evaluate a checkpoint
# from disk instead, uncomment the two lines below and adjust the path:
# training_env = utils.create_env(X_train, y_train)
# dqn_model = utils.load_dqn3('../../models/many_features/0.1/with_correlated_fts/dqn3_by_type_noisy_6_16000000', training_env)
test_df = utils.evaluate_dqn(dqn_model, X_test, y_test)
test_df.head()

In [None]:
# utils.success_rate returns two values: the rate itself (displayed below)
# and a second object kept as `success_df`.
# NOTE(review): presumably expects `test_df` to hold evaluation results rather
# than the raw test set -- confirm against utils.success_rate.
success_rate, success_df = utils.success_rate(test_df)
success_rate

In [None]:
# Average episode length and average return as reported by
# utils.get_avg_length_reward.
# NOTE(review): presumably expects `test_df` to hold evaluation results rather
# than the raw test set -- confirm against utils.get_avg_length_reward.
avg_length, avg_return = utils.get_avg_length_reward(test_df)
avg_length, avg_return

In [None]:
# Classification metrics from utils.test, unpacked as accuracy, F1 and ROC AUC.
# Reads the `y_actual` and `y_pred` columns, so `test_df` must hold evaluation
# results here; the raw test set only has the feature columns plus `label`.
acc, f1, roc_auc = utils.test(test_df['y_actual'], test_df['y_pred'])
acc, f1, roc_auc

In [None]:
# Distinct values that appear in the `y_pred` column.
test_df['y_pred'].unique()

In [None]:
# Rows whose prediction equals 4.
# NOTE(review): the meaning of code 4 depends on constants.CLASS_DICT --
# presumably one specific anemia class; confirm and consider naming it.
test_df.loc[test_df['y_pred'] == 4]

#### Saving files

In [None]:
# test_df.to_csv(f'../../test_dfs/many_features/0.1/correlated/test_df3_9000000.csv', index=False)
# success_df.to_csv(f'../../test_dfs/many_features/0.1/correlated/success_df3_9000000.csv', index=False)

#### Confusion matrix and classification report

In [None]:
# Rebind `test_df` to previously saved evaluation results and preview them.
# NOTE(review): judging by the file name this is a 6,500,000-step run, which
# is not one of the step budgets trained in this notebook -- confirm that the
# plots below are meant to describe that model.
test_df = pd.read_csv('../../test_dfs/many_features/0.1/test_df3_6500000.csv')
test_df.head()

In [None]:
# Plot the classification report for the `y_actual` / `y_pred` columns of the
# results frame currently bound to `test_df`.
utils.plot_classification_report(test_df['y_actual'], test_df['y_pred'])

In [None]:
# def plot_confusion_matrix(y_actual, y_pred, save=False, filename=False):
#     from sklearn.metrics import confusion_matrix
#     cm = confusion_matrix(y_actual, y_pred)
#     cm_df = pd.DataFrame(cm, index = [0, 1, 2, 3, 4, 5, 6], columns = [0, 1, 2, 3, 4, 5, 6], dtype='object')
#     #cm_df = pd.DataFrame(cm, index = constants.CLASS_DICT.keys(), columns = constants.CLASS_DICT.keys())
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm_df, annot=True)
#     plt.title('Confusion Matrix')
#     plt.ylabel('Actual Anemia')
#     plt.xlabel('Predicted Anemia')
#     plt.tight_layout()
#     if save:
#         plt.savefig(filename)
#     plt.show()
#     plt.close()

In [None]:
# Plot the confusion matrix for the `y_actual` / `y_pred` columns of the
# results frame currently bound to `test_df`.
utils.plot_confusion_matrix(test_df['y_actual'], test_df['y_pred'])