In [1]:
import pandas as pd
import numpy as np
import random
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append('../..')
from modules.many_features import utils, constants
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report 
from sklearn.metrics import roc_curve, auc

In [2]:
# Reproducibility: pin both global RNGs (NumPy and the stdlib) to one seed.
# NOTE(review): the model cells below pass constants.SEED rather than this
# SEED — confirm the two values agree.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [3]:
#def sample_train_set(x, y, sample_num):
#    idx_list = random.sample(list(x.index), sample_num)
#    sampled_x = x.loc[idx_list]
#    sampled_y = y.loc[idx_list]
#    return np.array(sampled_x), np.array(sampled_y)

#### The Datasets

In [125]:
# Training split: read the CSV and replace every missing value with the
# sentinel -1 so no NaNs remain in the frame.
train_df = pd.read_csv('../../final/data/train_set_noisy_4_missing_3.csv').fillna(-1)
# Every column but the last is a feature; the last column is used as the target.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_train.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat
0,11.786358,21.598977,-1.0,4.819682,-1.0,75.211769,242.69415,4.701269,0,1.491966,135.606446,123.227658,-1.0,-1.0,69.236844,-1.0,-1.0
1,9.765445,95.948185,-1.0,-1.0,473.81006,79.529629,-1.0,3.683701,1,0.756766,86.763989,-1.0,-1.0,-1.0,-1.0,29.296335,-1.0
2,10.581471,264.299544,4.47951,0.401345,232.378777,86.9622,-1.0,3.650369,1,0.887986,30.558329,-1.0,9.21239,-1.0,122.595648,31.744414,-1.0
3,8.455727,-1.0,-1.0,-1.0,408.449185,78.280967,-1.0,-1.0,0,1.273017,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,7.347738,61.963008,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22.043215,-1.0


In [126]:
# Held-out split, read as-is and sliced the same way as the training frame
# (all columns but the last are features, the last one is the target).
# NOTE(review): unlike the training frame, no fillna(-1) is applied here; the
# preview below already shows -1.0 placeholders, so the file is presumably
# pre-filled — confirm it contains no NaNs.
test_df = pd.read_csv('../../final/data/test_set_constant.csv')
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]
X_test.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat
0,7.116363,-1.0,3.781573,2.738413,-1.0,95.904198,68.457895,2.226085,0,1.892912,39.80855,110.329197,64.40435,21.654404,73.787009,21.349089,-1.0
1,8.12532,92.230003,4.231419,1.188039,143.365567,104.057204,204.747831,2.342554,0,0.652614,13.478089,-1.0,32.705481,-1.0,43.520272,24.375961,142.815207
2,11.30945,38.324563,-1.0,-1.0,455.077909,76.402602,-1.0,4.440732,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,33.92835,-1.0
3,13.763858,253.513394,2.262606,0.551444,453.772884,82.781943,90.101466,4.987993,0,0.853521,104.005514,34.639227,0.963866,22.083012,88.891838,41.291574,19.856071
4,11.464002,-1.0,-1.0,-1.0,320.964653,104.287127,-1.0,3.297819,0,1.163516,121.616315,105.895897,-1.0,9.337462,-1.0,34.392007,-1.0


#### Some useful functions and variables

In [127]:
# Convert the pandas features/targets to plain NumPy arrays before fitting.
# The names are rebound in place, so from here on X_*/y_* are ndarrays.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
# Sanity check: row counts of features and targets must match per split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 17), (14000, 17), (56000,), (14000,))

#### Decision Tree

In [128]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default hyper-parameters on the training arrays.
dt = DecisionTreeClassifier(random_state=constants.SEED)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
# Pair ground truth with predictions for the scoring helpers in the next cells.
test_df_dt = pd.DataFrame({'y_actual': y_test, 'y_pred': y_pred_dt})
# Null count per column of the actual/predicted frame.
test_df_dt.isna().sum()

y_actual    0
y_pred      0
dtype: int64

In [129]:
# Score the decision-tree predictions with the project's success_rate helper,
# which returns a (rate, frame) pair.
# NOTE(review): utils.success_rate is defined in modules.many_features.utils
# and is not visible here; the value shown below equals accuracy * 100, so it
# is presumably the percentage of rows where y_pred == y_actual — confirm.
success_rate_dt, success_df_dt = utils.success_rate(test_df_dt)
success_rate_dt

77.03571428571429

In [130]:
# Evaluate the decision tree with the project's utils.test helper, which
# returns three values for (actual, predicted) label series.
# NOTE(review): judging by the variable names these are accuracy, F1 and
# ROC AUC; the averaging scheme is decided inside utils.test — confirm there.
acc_dt, f1_dt, roc_auc_dt = utils.test(test_df_dt['y_actual'], test_df_dt['y_pred'])
acc_dt, f1_dt, roc_auc_dt

(0.7703571428571429, 0.7506663379246592, 0.8593844877213418)

#### Random Forest Classifier

In [131]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyper-parameters on the training arrays.
rf = RandomForestClassifier(random_state=constants.SEED)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Pair ground truth with predictions for the scoring helpers in the next cells.
test_df_rf = pd.DataFrame({'y_actual': y_test, 'y_pred': y_pred_rf})
# Null count per column of the actual/predicted frame.
test_df_rf.isna().sum()

y_actual    0
y_pred      0
dtype: int64

In [132]:
# Score the random-forest predictions with the project's success_rate helper,
# which returns a (rate, frame) pair.
# NOTE(review): utils.success_rate is defined in modules.many_features.utils
# and is not visible here; the value shown below equals accuracy * 100, so it
# is presumably the percentage of rows where y_pred == y_actual — confirm.
success_rate_rf, success_df_rf = utils.success_rate(test_df_rf)
success_rate_rf

90.73571428571428

In [133]:
# Evaluate the random forest with the project's utils.test helper, which
# returns three values for (actual, predicted) label series.
# NOTE(review): judging by the variable names these are accuracy, F1 and
# ROC AUC; the averaging scheme is decided inside utils.test — confirm there.
acc_rf, f1_rf, roc_auc_rf = utils.test(test_df_rf['y_actual'], test_df_rf['y_pred'])
acc_rf, f1_rf, roc_auc_rf

(0.9073571428571429, 0.8542555585130018, 0.9343745384546015)

#### XGBoost

In [134]:
# y_train_xgb = numerize_labels(y_train)
# y_test_xgb = numerize_labels(y_test)

In [135]:
import xgboost as xgb

# Fit a gradient-boosted classifier with default hyper-parameters.
# The labels are passed through unchanged (the numerize_labels step in the
# previous cell is commented out).
xg = xgb.XGBClassifier(random_state=constants.SEED)
xg.fit(X_train, y_train)
y_pred_xg = xg.predict(X_test)
# Pair ground truth with predictions for the scoring helpers in the next cells.
test_df_xg = pd.DataFrame({'y_actual': y_test, 'y_pred': y_pred_xg})
# Null count per column of the actual/predicted frame.
test_df_xg.isna().sum()

y_actual    0
y_pred      0
dtype: int64

In [136]:
# Score the XGBoost predictions with the project's success_rate helper,
# which returns a (rate, frame) pair.
# NOTE(review): utils.success_rate is defined in modules.many_features.utils
# and is not visible here; the value shown below equals accuracy * 100, so it
# is presumably the percentage of rows where y_pred == y_actual — confirm.
success_rate_xg, success_df_xg = utils.success_rate(test_df_xg)
success_rate_xg

90.67857142857143

In [137]:
# Evaluate XGBoost with the project's utils.test helper, which returns three
# values for (actual, predicted) label series.
# NOTE(review): judging by the variable names these are accuracy, F1 and
# ROC AUC; the averaging scheme is decided inside utils.test — confirm there.
acc_xg, f1_xg, roc_auc_xg = utils.test(test_df_xg['y_actual'], test_df_xg['y_pred'])
acc_xg, f1_xg, roc_auc_xg

(0.9067857142857143, 0.8536391077170522, 0.9340031786188363)

#### Testing RF on data with random zeros

In [138]:
# #data with random zeros
# X_train = np.loadtxt('data/zeros/X_train.txt', dtype=np.float32)
# #X_val = np.loadtxt('data/zeros/X_val.txt', dtype=np.float32)
# X_test = np.loadtxt('data/zeros/X_test.txt', dtype=np.float32)

# y_train = np.loadtxt('data/zeros/y_train.txt', dtype=int)
# #y_val = np.loadtxt('data/zeros/y_val.txt', dtype=int)
# y_test = np.loadtxt('data/zeros/y_test.txt', dtype=int)

In [139]:
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# acc, f1, cr, cm, roc_auc, roc_auc2, y_pred  = test(rf, X_test, y_test) 
# print(f'Accuracy - {acc}, F1 Score Macro: {f1}, ROC AUC Score: {roc_auc}, ROC AUC 2: {roc_auc2}')
# print(f'Unique predicted classes: {np.unique(y_pred)}')