In [172]:
import pandas as pd
import numpy as np
import random
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [173]:
# Fix the seed of Python's and NumPy's global random generators so that
# random.sample(...) and NumPy-based randomness repeat between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [174]:
def sample_train_set(x, y, sample_num):
    """Draw ``sample_num`` rows (without replacement) from ``x`` and the matching entries of ``y``.

    The row labels are picked with ``random.sample`` from ``x.index`` and then used to
    select from both ``x`` and ``y`` via ``.loc``, so ``y`` must share those index labels.

    Returns a tuple ``(features, labels)`` of NumPy arrays in the sampled order.
    """
    chosen_labels = random.sample(list(x.index), sample_num)
    features, labels = (np.array(source.loc[chosen_labels]) for source in (x, y))
    return features, labels

#### The Datasets

In [175]:
# Load the dataset variant whose file name indicates it contains some NaNs.
df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv') # primary dataset (NOTE(review): original author was unsure - confirm)
# Alternative dataset files, kept commented out for quick switching:
#df = pd.read_csv('data/anemia_synth_dataset_hb.csv')
#df = pd.read_csv('data/noisy_dataset.csv')
# Replace every missing value with 0 before modelling.
df = df.fillna(0)
# Build the label -> integer-code mapping; codes follow the order in which
# labels first appear in the 'label' column.
classes = list(df.label.unique())
nums = [i for i in range(len(classes))]
class_dict = dict(zip(classes, nums))
class_dict

{'No anemia': 0,
 'Hemolytic anemia': 1,
 'Aplastic anemia': 2,
 'Iron deficiency anemia': 3,
 'Vitamin B12/Folate deficiency anemia': 4,
 'Anemia of chronic disease': 5}

In [176]:
# X, y = df.drop(['label'], axis=1), df['label']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Swap the label strings for their integer codes from class_dict (modifies df in place).
df['label'] = df['label'].replace(class_dict)
print(df.label.value_counts())
# Every column except the last is treated as a feature; the last column is the target.
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

# 70/30 split stratified on the target, then convert all parts to NumPy arrays.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)
X_train, y_train = np.array(X_train), np.array(y_train)
#X_train, y_train = sample_train_set(full_X_train, full_y_train, 1000)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

1    14146
0    10000
2     9450
5     1869
4     1575
3     1343
Name: label, dtype: int64


((26868, 6), (11515, 6), (26868,), (11515,))

#### Some useful functions and variables

In [177]:
#class_dict = {'A':0, 'B':1, 'C':2}
# classes = list(df.label.unique())
# nums = [i for i in range(len(classes))]
# class_dict = dict(zip(classes, nums))
# class_dict

In [178]:
def compute_feature_importance(model, x, verbose=False):
    """Plot the feature importances of a fitted model as a sorted bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    x : object with a ``columns`` attribute (e.g. a DataFrame); the column names
        are paired positionally with the importance scores.
    verbose : bool
        When true, also print every score with its positional index.

    Returns nothing; the chart is drawn through the pandas plotting interface.
    """
    scores = model.feature_importances_
    if verbose:
        for position, score in enumerate(scores):
            print('Feature: %0d, Score: %.5f' % (position, score))

    # feature name -> importance score
    score_by_feature = dict(zip(x.columns, scores))

    table = pd.DataFrame.from_dict(score_by_feature, orient='index')
    table = table.rename(columns={0: 'Importance'})
    table.sort_values(by='Importance').plot(kind='bar', rot=90)

In [179]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro", show_plot=False):
    '''Calculate the one-vs-rest ROC AUC score for hard class predictions.

    Both label vectors are binarized with a ``LabelBinarizer`` fitted on ``y_test``;
    a ROC curve is drawn per binarized column and the averaged score is returned.

    Parameters
    ----------
    y_test : array-like of true class labels.
    y_pred : array-like of predicted class labels.
    average : str
        Averaging mode forwarded to ``roc_auc_score``.
    show_plot : bool
        When False (default, the original behaviour) the figure is closed and never
        displayed. When True the figure is finished with labels and a legend and
        left open so the notebook can render it.

    Returns
    -------
    The value of ``roc_auc_score`` on the binarized labels.
    '''
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test_bin = lb.transform(y_test)
    y_pred_bin = lb.transform(y_pred)

    # Columns of the binarized matrix follow lb.classes_, not the order of class_dict.
    # In the binary case LabelBinarizer emits a single column for the last class, so
    # take the trailing classes that actually have a column.
    n_columns = y_test_bin.shape[1]
    column_classes = lb.classes_[-n_columns:]
    # Reverse lookup (integer code -> readable class name) for the curve labels;
    # labels that are not codes of class_dict are shown as-is.
    code_to_name = {code: name for name, code in class_dict.items()}

    fig, c_ax = plt.subplots(1, 1, figsize=(12, 8))
    for idx, code in enumerate(column_classes):
        fpr, tpr, _thresholds = roc_curve(y_test_bin[:, idx].astype(int), y_pred_bin[:, idx])
        c_label = code_to_name.get(code, code)
        c_ax.plot(fpr, tpr, label='%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr)))
    # Chance-level diagonal, independent of any particular class curve.
    c_ax.plot([0, 1], [0, 1], 'b-', label='Random Guessing')

    if show_plot:
        c_ax.set_xlabel('False Positive Rate')
        c_ax.set_ylabel('True Positive Rate')
        c_ax.legend()
    else:
        # Close this specific figure so it is neither displayed nor kept in memory.
        plt.close(fig)
    return roc_auc_score(y_test_bin, y_pred_bin, average=average)

In [180]:
def multiclass(actual_class, pred_class, average = "macro"):
    """Average the one-vs-rest ROC AUC over every class present in ``actual_class``.

    For each class, both label sequences are turned into 0/1 indicators
    (1 = this class, 0 = anything else) and scored with ``roc_auc_score``.
    The per-class scores are printed and their unweighted mean is returned.

    Parameters
    ----------
    actual_class : iterable of true labels.
    pred_class : iterable of predicted labels.
    average : str
        Forwarded to ``roc_auc_score``.

    Raises
    ------
    ValueError
        If ``actual_class`` is empty.
    """
    # the set of classes is taken from the actual labels only
    unique_class = set(actual_class)
    if not unique_class:
        raise ValueError('actual_class is empty; cannot compute a ROC AUC score')

    roc_auc_dict = {}
    for per_class in unique_class:
        # Mark the current class as 1 and everything else as 0. Comparing against the
        # current class directly (rather than "not one of the other actual classes")
        # ensures a predicted label that never occurs in actual_class counts as 0.
        new_actual_class = [1 if label == per_class else 0 for label in actual_class]
        new_pred_class = [1 if label == per_class else 0 for label in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(new_actual_class, new_pred_class, average = average)
    print(f'Roc auc dict: {roc_auc_dict}')

    return sum(roc_auc_dict.values()) / len(roc_auc_dict)

In [181]:
def test(model, Xtest, ytest):
    """Evaluate a fitted classifier on a held-out set.

    Returns a 7-tuple:
    ``(accuracy, macro F1, classification report, confusion matrix,
    ROC AUC from multiclass_roc_auc_score, ROC AUC from multiclass, predictions)``.
    The macro F1 is restricted to the labels that occur in ``ytest``.
    """
    ypred = model.predict(Xtest)
    present_labels = np.unique(ytest)
    metrics = (
        accuracy_score(ytest, ypred),
        f1_score(ytest, ypred, average ='macro', labels=present_labels),
        classification_report(ytest, ypred),
        confusion_matrix(ytest, ypred),
        multiclass_roc_auc_score(ytest, ypred),
        multiclass(ytest, ypred),
    )
    return metrics + (ypred,)

#### Decision Tree classifier

In [182]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
import graphviz

In [183]:
# Fit a decision-tree baseline on the training split, save the fitted model to disk,
# then evaluate it on the test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
joblib.dump(dt, 'models/baselines/decision_tree.joblib')
acc, f1, cr, cm, roc_auc, roc_auc2, y_pred  = test(dt, X_test, y_test) 
print(f'Accuracy - {acc}, F1 Score: {f1}, ROC AUC Score: {roc_auc}, ROC AUC 2: {roc_auc2}')
print(f'Unique predicted classes: {np.unique(y_pred)}')

Roc auc dict: {0: 1.0, 1: 0.9997937010039885, 2: 0.999647266313933, 3: 1.0, 4: 1.0, 5: 0.999108734402852}
Accuracy - 0.9997394702561876, F1 Score: 0.9997336276523091, ROC AUC Score: 0.999758283620129, ROC AUC 2: 0.999758283620129
Unique predicted classes: [0 1 2 3 4 5]


#### delete from here

In [152]:
# NOTE(review): test_dataset is only created in the following cell, so on a fresh
# kernel this check must be run after that cell.
len(test_dataset), len(y_test)

(11515, 11515)

In [153]:
# Rebuild a DataFrame from the test feature array (using df's non-label column names)
# and attach the true labels and the model predictions side by side.
# NOTE(review): y_pred holds the predictions of whichever test(...) call ran last.
cols = [i for i in df.columns if i != 'label']
test_dataset = pd.DataFrame(X_test, columns = cols)
test_dataset['label'] = y_test
test_dataset['predicted_label'] = y_pred
test_dataset.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label,predicted_label
0,2.427097,0.0,3.644838,0.0,305.819648,95.006486,1,1
1,5.847005,0.0,0.364588,0.0,0.0,87.13616,2,2
2,5.978975,0.0,2.274289,0.0,338.180977,83.886697,1,1
3,15.224254,0.0,0.0,0.0,361.625413,0.0,0,0
4,7.736022,30.522768,1.216256,0.0,0.0,80.355429,2,2


In [161]:
# Keep only the test rows where the prediction differs from the true label.
misdiagnosed_samples = test_dataset[test_dataset.label != test_dataset.predicted_label]
misdiagnosed_samples

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,label,predicted_label
1576,5.712431,141.747425,1.999754,0.0,278.420249,92.439027,2,1
2718,9.204358,0.0,1.99979,0.0,0.0,94.933238,2,1
7350,4.406195,2683.42382,2.180173,0.0,157.875506,79.998863,5,1


In [164]:
# Split the misclassified rows back into a feature array and a true-label array.
X_miss = np.array(misdiagnosed_samples.drop(['label', 'predicted_label'], axis=1))
y_miss = np.array(misdiagnosed_samples['label'])
X_miss

array([[5.71243120e+00, 1.41747425e+02, 1.99975383e+00, 0.00000000e+00,
        2.78420249e+02, 9.24390269e+01],
       [9.20435816e+00, 0.00000000e+00, 1.99978958e+00, 0.00000000e+00,
        0.00000000e+00, 9.49332380e+01],
       [4.40619532e+00, 2.68342382e+03, 2.18017289e+00, 0.00000000e+00,
        1.57875506e+02, 7.99988632e+01]])

In [166]:
# Load a saved DQN model and import the environment class used to evaluate it.
# NOTE(review): the model is read from a .pkl file - only load files from a trusted source.
from stable_baselines import DQN
from envs import SyntheticComplexHbEnv
dqn_model = DQN.load('models/synthentic_with_hb_some_nans_stable_dqn2e6.pkl')

Loading a model without an environment, this model cannot be trained until it has a valid environment.


In [168]:
def synthetic_dqn_eval(dqn_model, X=None, y=None):
    """Roll the DQN agent through an environment and collect one record per episode.

    Parameters
    ----------
    dqn_model : object exposing ``predict(obs, deterministic=True)``.
    X, y : array-like, optional
        Features and labels used to build the ``SyntheticComplexHbEnv``. When omitted
        they default to the notebook-level ``X_miss`` / ``y_miss`` (the original
        hard-coded behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per finished episode, built from the ``info`` dict returned by the
        final ``env.step`` of that episode. Columns follow the key order of ``info``
        and keep the value types the environment produced.
    """
    features = X_miss if X is None else X
    labels = y_miss if y is None else y
    env = SyntheticComplexHbEnv(features, labels, random=False)

    # Collect the per-episode dicts in a list and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
    episode_infos = []
    count = 0

    try:
        while True:
            count += 1
            if count % 5000 == 0:
                print(f'Count: {count}')
            obs, done = env.reset(), False
            while not done:
                action, _states = dqn_model.predict(obs, deterministic=True)
                obs, rew, done, info = env.step(action)
                if done:
                    # Shallow copy in case the environment reuses its info dict.
                    episode_infos.append(dict(info))

    except StopIteration:
        # NOTE(review): presumably raised by the environment once every sample has
        # been visited - confirm against SyntheticComplexHbEnv.
        print('Testing done.....')
    return pd.DataFrame(episode_infos)

# Run the loaded DQN model through the evaluation loop and preview the per-episode results.
test_df = synthetic_dqn_eval(dqn_model)
test_df.head()

Testing done.....


Unnamed: 0,episode_length,index,is_success,reward,terminated,trajectory,y_actual,y_pred
0,4.0,0.0,1.0,4.0,0.0,"[hemoglobin, mcv, ret_count, Aplastic anemia]",2.0,2.0
1,4.0,1.0,1.0,4.0,0.0,"[hemoglobin, mcv, ret_count, Aplastic anemia]",2.0,2.0
2,4.0,2.0,0.0,2.0,0.0,"[hemoglobin, mcv, ret_count, Hemolytic anemia]",5.0,1.0


#### end here

In [155]:
# dot_data = export_graphviz(dt, out_file='images/complex_dt.dot', 
#                            feature_names = list(X_train.columns),  
#                            class_names = list(class_dict.keys()),
#                            filled=True)
#Convert to image using dot -Tpng images/dt.dot -o images/dt.png 
#Make sure Graphviz is installed (on Windows it is an executable installer, like normal software)

#### Random Forest Classifier

In [95]:
# Fit a random-forest baseline on the training split and evaluate it on the test split.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
acc, f1, cr, cm, roc_auc, roc_auc2, y_pred  = test(rf, X_test, y_test) 
print(f'Accuracy - {acc}, F1 Score Macro: {f1}, ROC AUC Score: {roc_auc}, ROC AUC 2: {roc_auc2}')
print(f'Unique predicted classes: {np.unique(y_pred)}')

Roc auc dict: {0: 1.0, 1: 0.9998624673359924, 2: 0.999647266313933, 3: 1.0, 4: 1.0, 5: 1.0}
Accuracy - 0.9998263135041251, F1 Score Macro: 0.9999019284314451, ROC AUC Score: 0.9999182889416542, ROC AUC 2: 0.9999182889416542
Unique predicted classes: [0 1 2 3 4 5]


#### XGBoost

In [13]:
def numerize_labels(pd_series):
    """Return a new series in which every label is replaced by its code from ``class_dict``.

    The input series is left untouched; the mapping is applied with ``.map``.
    """
    return pd_series.copy().map(class_dict)

In [14]:
# y_train_xgb = numerize_labels(y_train)
# y_test_xgb = numerize_labels(y_test)

In [96]:
# Fit an XGBoost baseline on the training split and evaluate it on the test split.
import xgboost as xgb
xg = xgb.XGBClassifier(random_state=42).fit(X_train, y_train)
acc, f1, cr, cm, roc_auc, roc_auc2, y_pred = test(xg, X_test, y_test) 
print(f'Accuracy - {acc}, F1 Score Macro: {f1}, ROC AUC Score: {roc_auc}, ROC AUC 2: {roc_auc2}')
print(f'Unique predicted classes: {np.unique(y_pred)}')

Roc auc dict: {0: 0.9999412800939518, 1: 0.9998624673359924, 2: 0.9994708994708994, 3: 1.0, 4: 1.0, 5: 1.0}
Accuracy - 0.9997394702561876, F1 Score Macro: 0.9998447348702512, ROC AUC Score: 0.9998791078168073, ROC AUC 2: 0.9998791078168073
Unique predicted classes: [0 1 2 3 4 5]


#### Testing RF on data with random zeros

In [16]:
# #data with random zeros
# X_train = np.loadtxt('data/zeros/X_train.txt', dtype=np.float32)
# #X_val = np.loadtxt('data/zeros/X_val.txt', dtype=np.float32)
# X_test = np.loadtxt('data/zeros/X_test.txt', dtype=np.float32)

# y_train = np.loadtxt('data/zeros/y_train.txt', dtype=int)
# #y_val = np.loadtxt('data/zeros/y_val.txt', dtype=int)
# y_test = np.loadtxt('data/zeros/y_test.txt', dtype=int)

In [17]:
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# acc, f1, cr, cm, roc_auc, roc_auc2, y_pred  = test(rf, X_test, y_test) 
# print(f'Accuracy - {acc}, F1 Score Macro: {f1}, ROC AUC Score: {roc_auc}, ROC AUC 2: {roc_auc2}')
# print(f'Unique predicted classes: {np.unique(y_pred)}')