In [2]:
import pandas as pd
import numpy as np
import random
import os
from random_agent import RandomAgent
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# One shared seed for the notebook: it seeds both global RNGs here and is
# reused as `random_state` for the train/test split further down.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [4]:
# Load the synthetic anemia dataset and replace every missing value with 0.
# NOTE(review): the original comment said the author was not sure this is the
# intended dataset file -- confirm the path before relying on the results.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv').fillna(0)

# Give each distinct label an integer id, numbered in the order the labels
# are returned by unique().
classes = list(df['label'].unique())
nums = list(range(len(classes)))
class_dict = {name: code for name, code in zip(classes, nums)}
class_dict

{'No anemia': 0,
 'Hemolytic anemia': 1,
 'Aplastic anemia': 2,
 'Iron deficiency anemia': 3,
 'Vitamin B12/Folate deficiency anemia': 4,
 'Anemia of chronic disease': 5}

In [5]:
# Inspect one sample row (position 7) to see the feature columns and the label.
df.iloc[7]

hemoglobin                 16.2061
ferritin                   214.005
ret_count                        0
segmented_neutrophils            0
tibc                       429.705
mcv                        95.9035
label                    No anemia
Name: 7, dtype: object

In [6]:
# Swap the string labels for their integer ids from class_dict, then print
# how many rows each class has.
df['label'] = df['label'].replace(class_dict)
print(df.label.value_counts())

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Stratified 70/30 split (seeded), with each part converted to a NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

1    14146
0    10000
2     9450
5     1869
4     1575
3     1343
Name: label, dtype: int64


((26868, 6), (11515, 6), (26868,), (11515,))

In [7]:
# Build the agent from the held-out test split only (the training split is not passed in).
rand_agent = RandomAgent(X_test, y_test)

In [8]:
# Run the agent's test routine; the result is used below as a DataFrame of
# per-row outcomes (presumably one episode per test sample -- confirm in random_agent.py).
test_df = rand_agent.test()

Testing done.....


In [12]:
# Persist the test results. Create the output folder first so that a fresh
# checkout, where test_dfs/ may not exist yet, does not fail on the write.
os.makedirs('test_dfs', exist_ok=True)
test_df.to_csv('test_dfs/random_test_df.csv', index=False)

In [13]:
# Sanity check: the results frame should have one row per test sample.
len(X_test), len(test_df)

(11515, 11515)

In [14]:
# Preview the first rows of the results (episode length, reward, trajectory, y_actual, y_pred, ...).
test_df.head()

Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,2.0,0.0,1.0,2.0,0.0,"[tibc, Hemolytic anemia]",1.0,1.0
1,1.0,1.0,0.0,-1.0,0.0,[No anemia],2.0,0.0
2,2.0,2.0,0.0,0.0,0.0,"[mcv, Vitamin B12/Folate deficiency anemia]",1.0,4.0
3,1.0,3.0,0.0,-1.0,0.0,[Iron deficiency anemia],0.0,3.0
4,1.0,4.0,0.0,-1.0,0.0,[Iron deficiency anemia],2.0,3.0


In [16]:
# Percentage of rows whose y_pred is missing (NaN).
test_df.y_pred.isna().sum()/len(test_df)*100

0.729483282674772

In [9]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, auc, roc_curve

In [10]:
def multiclass(actual_class, pred_class, average = 'macro'):
    """Unweighted mean of one-vs-rest ROC AUC scores built from hard labels.

    For every distinct label found in ``actual_class`` both sequences are
    binarised (1 where the value equals that label, 0 everywhere else),
    ``roc_auc_score`` is evaluated on the binarised pair, and the per-label
    scores are averaged with equal weight.

    Parameters
    ----------
    actual_class : iterable
        True labels.
    pred_class : iterable
        Predicted labels, aligned element-wise with ``actual_class``.
    average : str, default 'macro'
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    float
        Mean of the per-label scores.

    Raises
    ------
    ZeroDivisionError
        If ``actual_class`` is empty (there is no label to average over).
    Exception
        Anything raised by ``roc_auc_score`` propagates to the caller.
    """
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        # Compare against the current label directly. The previous
        # "not in the other labels" test marked a predicted label that never
        # occurs in actual_class (or a NaN) as positive for *every* class,
        # and it also scanned a list for each element.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]
        roc_auc_dict[per_class] = roc_auc_score(new_actual_class, new_pred_class, average = average)
    return sum(roc_auc_dict.values()) / len(roc_auc_dict)

In [11]:
def test(ytest, ypred):
    acc = accuracy_score(ytest, ypred)
    f1 = f1_score(ytest, ypred, average ='macro', labels=np.unique(ytest))
    try:
        roc_auc = multiclass(ytest, ypred)
    except:
        roc_auc = None
    return acc, f1, roc_auc

In [12]:
def get_avg_length_reward(df):
    """Return ``(mean episode_length, mean reward)`` for a results frame.

    ``df`` must expose ``episode_length`` and ``reward`` attributes holding
    numeric values (e.g. DataFrame columns).
    """
    return np.mean(df.episode_length), np.mean(df.reward)

In [13]:
# Rows that have a prediction (y_pred is not NaN) ...
y_pred_df = test_df.loc[test_df['y_pred'].notna()]
# ... and, among those, the rows where the prediction matches the true label.
success_df = y_pred_df.loc[y_pred_df['y_pred'].eq(y_pred_df['y_actual'])]
len(success_df)

1921

In [14]:
# Number of rows that have a prediction.
len(y_pred_df)

11431

In [15]:
# How often each class id was predicted, among rows that have a prediction.
y_pred_df.y_pred.value_counts()

1.0    1981
2.0    1964
5.0    1904
3.0    1879
4.0    1874
0.0    1829
Name: y_pred, dtype: int64

In [16]:
# Success rate in percent. The denominator is every row of test_df, so rows
# without a prediction count as failures here (the accuracy computed later
# uses only rows that have a prediction).
success_rate = len(success_df)/len(test_df)*100
success_rate

16.682587928788536

In [17]:
# Mean episode length and mean reward over all rows of test_df
# (rows without a prediction are included).
avg_length, avg_return = get_avg_length_reward(test_df)
avg_length, avg_return

(2.012852800694746, 0.06322188449848025)

In [18]:
# Accuracy, macro F1 and averaged one-vs-rest ROC AUC, computed only on the
# rows that have a prediction (y_pred_df).
acc, f1, roc_auc = test(y_pred_df['y_actual'], y_pred_df['y_pred'])
acc, f1, roc_auc

(0.1680517889948386, 0.13984042293307078, 0.5023438604450713)