In [15]:
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelBinarizer

sys.path.append('../..')
from modules.many_features import utils, constants

In [16]:
# Seed NumPy's global RNG and Python's `random` module so stochastic steps repeat.
# NOTE(review): the SVM cell passes constants.SEED rather than this SEED — confirm they agree.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

#### The Datasets

In [17]:
# Load the training split: every column except the last is used as a feature,
# the last column is used as the target.
train_df = pd.read_csv('../../final/data/train_set_basic.csv')
# Missing values are replaced by the sentinel -1 (visible as -1.0 in the preview).
train_df = train_df.fillna(-1)
X_train = train_df.iloc[:, 0:-1]
y_train = train_df.iloc[:, -1]
X_train.head()

Unnamed: 0,hemoglobin,ferritin,ret_count,segmented_neutrophils,tibc,mcv,serum_iron,rbc,gender,creatinine,cholestrol,copper,ethanol,folate,glucose,hematocrit,tsat
0,9.007012,-1.0,-1.0,3.519565,440.499323,103.442762,59.017997,2.612173,1,0.650757,114.794964,112.308159,25.612786,5.96971,116.026042,27.021037,13.397977
1,8.760976,-1.0,0.491469,-1.0,259.895852,103.885481,-1.0,2.529991,0,0.728641,74.824352,-1.0,-1.0,-1.0,-1.0,26.282929,-1.0
2,7.490324,70.812609,-1.0,1.495604,482.109919,79.543391,-1.0,2.824995,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22.470972,-1.0
3,8.11337,499.313483,3.507612,0.0,195.351883,100.545858,203.895756,2.420797,1,0.860863,9.120947,41.131511,1.84389,18.845473,106.199806,24.340111,104.373581
4,13.935301,349.569415,5.190725,6.894195,489.595939,102.234294,150.085853,4.089225,1,0.216907,20.344863,92.547095,19.815123,29.543875,98.38871,41.805903,30.655044


In [18]:
# Convert features and targets to NumPy arrays for the estimators below.
# NOTE(review): X_test / y_test are not created in any visible earlier cell, so this
# cell raises NameError on a fresh kernel — the test split must be loaded first.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 17), (14000, 17), (56000,), (14000,))

In [19]:
# Peek at the test labels (the recorded output shows integer class codes).
y_test

array([5, 1, 4, ..., 2, 4, 6])

#### Some useful functions and variables

In [20]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    '''Return the ROC AUC of predicted class labels against the true labels.

    Both label sequences are one-hot encoded with a ``LabelBinarizer`` fitted on
    ``y_test`` and the resulting indicator matrices are handed to
    ``roc_auc_score``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels (hard predictions, not scores).
    average : str, default "macro"
        Averaging mode forwarded to ``roc_auc_score``.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the binarized inputs.

    Notes
    -----
    An earlier version also drew per-class ROC curves on a figure that was
    closed before it could be displayed, and took the class names from the
    notebook-level ``class_dict``. That made the function fail whenever
    ``class_dict`` was not yet defined or listed more names than there were
    label columns. The discarded plot never influenced the returned score, so
    it has been removed.
    '''
    lb = LabelBinarizer()
    # The set of classes (and therefore the columns) comes from the true labels only.
    y_true_bin = lb.fit_transform(y_test)
    y_pred_bin = lb.transform(y_pred)
    return roc_auc_score(y_true_bin, y_pred_bin, average=average)

In [21]:
def multiclass(actual_class, pred_class, average = "macro"):
    """Average one-vs-rest ROC AUC over the classes present in ``actual_class``.

    For every distinct value in ``actual_class`` the true and predicted labels
    are reduced to 1 (this class) / 0 (anything else), ``roc_auc_score`` is
    computed on that binary problem, and the per-class scores are averaged with
    equal weight. The per-class scores are also printed.

    Parameters
    ----------
    actual_class : iterable
        True class labels.
    pred_class : iterable
        Predicted class labels (hard predictions).
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    float
        Unweighted mean of the per-class ROC AUC values.

    Raises
    ------
    ValueError
        If ``actual_class`` is empty.
    """
    #creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    if not unique_class:
        raise ValueError('actual_class is empty; ROC AUC is undefined')

    roc_auc_dict = {}
    for per_class in unique_class:
        # Mark the current class as 1 and everything else as 0 by comparing with
        # the class itself. Testing membership in "the other known classes"
        # instead would flag a predicted label that never occurs in
        # actual_class as positive for *every* class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        #using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc
    print(f'Roc auc dict: {roc_auc_dict}')

    return sum(roc_auc_dict.values()) / len(roc_auc_dict)

In [22]:
def test(model, Xtest, ytest):
    """Predict with ``model`` on ``Xtest`` and score the predictions against ``ytest``.

    Returns a 7-tuple: accuracy, macro F1 (restricted to the labels present in
    ``ytest``), the classification report, the confusion matrix, the two
    notebook-defined ROC AUC variants (``multiclass_roc_auc_score`` and
    ``multiclass``), and finally the predictions themselves.

    This is the notebook-local helper; later cells call ``utils.test`` instead.
    """
    predictions = model.predict(Xtest)
    present_labels = np.unique(ytest)
    scores = (
        accuracy_score(ytest, predictions),
        f1_score(ytest, predictions, average ='macro', labels=present_labels),
        classification_report(ytest, predictions),
        confusion_matrix(ytest, predictions),
        multiclass_roc_auc_score(ytest, predictions),
        multiclass(ytest, predictions),
    )
    return (*scores, predictions)

#### Normalizing X

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Min-max scaler with default settings, shared by the train and test transforms below.
mmc = MinMaxScaler()

In [25]:
# Fit the scaler on the training features only, then apply the same scaling to the test features.
# NOTE(review): the -1 fill value used for missing entries is scaled like any real measurement.
X_train_norm = mmc.fit_transform(X_train)
X_test_norm = mmc.transform(X_test)

In [26]:
# Scaling must not change the shapes; both matrices keep the same number of columns.
X_train_norm.shape, X_test_norm.shape

((56000, 17), (14000, 17))

#### Support Vector Machine

In [27]:
from sklearn.svm import SVC

# Linear-kernel SVM baseline trained on the min-max scaled features.
svm_model = SVC(kernel='linear', C=1, decision_function_shape='ovo', random_state=constants.SEED)
svm_model.fit(X_train_norm, y_train)
y_pred_svm = svm_model.predict(X_test_norm)

# Actual vs. predicted labels side by side, in the two-column layout the utils helpers consume.
test_df_svm = pd.DataFrame({'y_actual': y_test, 'y_pred': y_pred_svm})
test_df_svm.isna().sum()

y_actual    0
y_pred      0
dtype: int64

In [28]:
# utils.success_rate returns a rate and a DataFrame; the recorded rate equals the
# accuracy reported below expressed as a percentage.
success_rate_svm, success_df_svm = utils.success_rate(test_df_svm)
success_rate_svm

83.77142857142857

In [29]:
# utils.test (project helper, not the notebook-local test()) takes actual and predicted
# labels and returns three values, unpacked here as accuracy, F1 and ROC AUC.
# NOTE(review): averaging modes are defined in modules/many_features/utils — confirm there.
acc_svm, f1_svm, roc_auc_svm = utils.test(test_df_svm['y_actual'], test_df_svm['y_pred'])
acc_svm, f1_svm, roc_auc_svm

(0.8377142857142857, 0.802305706047073, 0.8963841821696324)

#### Feed-forward Neural Network - Keras

In [30]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras import backend as K
import keras

ModuleNotFoundError: No module named 'keras'

In [None]:
#keras.utils.set_random_seed(42)

In [None]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops on the tensors passed in.

    Values are clipped to [0, 1] and rounded, so inputs are treated as 0/1
    indicators; ``K.epsilon()`` keeps the division defined when there are no
    positives in ``y_true``.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops on the tensors passed in.

    Values are clipped to [0, 1] and rounded, so inputs are treated as 0/1
    indicators; ``K.epsilon()`` keeps the division defined when nothing is
    predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positives = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged_positives) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m`` on the same tensors."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # K.epsilon() guards against 0/0 when both precision and recall are zero.
    ratio = (p*r)/(p+r+K.epsilon())
    return 2*ratio

In [None]:
def numerize_labels(pd_series, mapping=None):
    """Map class labels to their numeric codes.

    Parameters
    ----------
    pd_series : pandas.Series or array-like
        Labels to convert. Plain arrays and lists are accepted as well as
        Series, because the label vectors in this notebook are converted to
        NumPy arrays (which have no ``.map``) before this function is called.
    mapping : dict, optional
        Label -> code lookup. Defaults to the notebook-level ``class_dict``.

    Returns
    -------
    pandas.Series
        A new Series of codes; the input is not modified. Labels missing from
        ``mapping`` become NaN.
    """
    if mapping is None:
        # Resolved at call time so the default follows the notebook's class table.
        mapping = class_dict
    # pd.Series(...) is a no-op wrapper for Series input and lifts arrays/lists;
    # .map always returns a fresh Series, so no defensive copy is needed.
    return pd.Series(pd_series).map(mapping)

# Convert labels to integer codes.
# NOTE(review): y_train / y_test were turned into NumPy arrays in an earlier cell and the
# recorded y_test output shows integer codes, while class_dict (defined in a later cell) is
# keyed by class-name strings — confirm these inputs hold class names, otherwise map() yields NaN.
y_train_num = numerize_labels(y_train)
y_test_num = numerize_labels(y_test)

# One-hot encode the codes for the categorical cross-entropy loss used by the Keras model.
dummy_y_train = np_utils.to_categorical(y_train_num)
dummy_y_test = np_utils.to_categorical(y_test_num)
dummy_y_train.shape, dummy_y_test.shape

In [None]:
# Size the network from the data instead of hard-coding 6 inputs / 6 outputs:
# X_train_norm has 17 columns in this notebook, so input_shape=(6,) cannot fit it.
n_features = X_train_norm.shape[1]
n_classes = dummy_y_train.shape[1]

# Small feed-forward baseline: input dropout -> 16 ReLU units -> dropout -> softmax.
model = Sequential()
model.add(Dropout(0.5, input_shape=(n_features,)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', f1_m, precision_m, recall_m])
# `epochs` is the current name of the legacy `nb_epoch` argument.
history = model.fit(X_train_norm, dummy_y_train, epochs=100, batch_size=1024, shuffle=True, validation_split=0.20, verbose=0)
#model.save('../models/baselines/synthentic_nn.h5')
# evaluate() returns the loss followed by the compiled metrics, in compile order.
loss, accuracy, f1, precision, recall = model.evaluate(X_test_norm, dummy_y_test, verbose=0)
loss, accuracy, f1, precision, recall

In [None]:
#Alternatively
y_pred1 = model.predict(X_test_norm)
y_pred = np.argmax(y_pred1, axis=1)
np.unique(y_pred)

In [None]:
# Classes present in the encoded test labels, for comparison with the predicted classes above.
np.unique(y_test_num)

In [None]:
# Score the Keras predictions on integer-coded labels: accuracy, macro F1 over the labels
# present in the test set, and the notebook's label-binarized ROC AUC.
acc = accuracy_score(y_test_num, y_pred)
f1 = f1_score(y_test_num, y_pred, average='macro', labels=np.unique(y_test_num))
roc_auc = multiclass_roc_auc_score(y_test_num, y_pred)
acc, f1, roc_auc

#### Feed-forward Neural Network - PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from tqdm.notebook import tqdm

In [None]:
def get_class_distribution(obj):
    """Count how often each class code 0-5 occurs in ``obj``.

    Returns a dict keyed by class name (in code order) with the number of
    elements equal to that class's code. Any element that matches none of the
    six codes is not counted and triggers a 'Check classes' message.
    """
    # Position in this tuple is the integer code of the class.
    class_names = (
        'No anemia',
        'Hemolytic anemia',
        'Aplastic anemia',
        'Iron deficiency anemia',
        'Vitamin B12/Folate deficiency anemia',
        'Anemia of chronic disease',
    )
    count_dict = dict.fromkeys(class_names, 0)

    for value in obj:
        # Compare with == (rather than a dict lookup) so any element type that
        # supports equality with ints is handled the same way.
        for code, name in enumerate(class_names):
            if value == code:
                count_dict[name] += 1
                break
        else:
            print('Check classes')

    return count_dict

In [None]:
class ClassifierDataset(Dataset):
    """Map-style dataset that pairs a feature container with a label container.

    Both containers are stored as given (no copy, no validation). The length
    is taken from ``X_data``; indexing returns ``(X_data[index], y_data[index])``.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__ (self):
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

In [None]:
# Load the dataset used for the PyTorch model.
nn_df = pd.read_csv('data/anemia_synth_dataset_hb_some_nans.csv')

# Class name -> integer code.
class_dict = {'No anemia': 0, 'Hemolytic anemia': 1, 'Aplastic anemia': 2, 'Iron deficiency anemia': 3, 
              'Vitamin B12/Folate deficiency anemia': 4, 'Anemia of chronic disease': 5}

# Assign the encoded column back rather than calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on an intermediate object, which
# recent pandas warns about and which leaves nn_df unchanged under Copy-on-Write.
nn_df['label'] = nn_df['label'].replace(class_dict)

# Features are all columns but the last; the target is the last column.
# NOTE(review): assumes 'label' is the last column of the CSV — confirm.
X = nn_df.iloc[:, 0:-1]
y = nn_df.iloc[:, -1]

# 70/30 train+val / test split, then 10% of train+val held out for validation; both stratified.
# NOTE: this rebinds X_train, y_train, X_test and y_test used by the earlier models.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=42)

In [None]:
# Fit the min-max scaler on the training split only and reuse it for validation and test,
# so no statistics from held-out data leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Make every split a NumPy array so torch.from_numpy can wrap it in the next cell.
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

In [None]:
# Wrap each split as a ClassifierDataset: features as float tensors, labels as long tensors
# (the dtype nn.CrossEntropyLoss takes for class-index targets).
train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(np.array(y_train)).long())

val_dataset = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())

test_dataset = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(np.array(y_test)).long())

In [None]:
# Training labels as one 1-D long tensor, used below to look up a sampling weight per sample.
# The dataset already stores exactly this tensor, so copy it directly instead of indexing
# the dataset sample by sample and rebuilding a tensor from a Python list.
target_list = train_dataset.y_data.clone()

In [None]:
# Inverse-frequency weight per class, in the key order returned by get_class_distribution.
class_count = list(get_class_distribution(y_train).values())
class_weights = torch.tensor(class_count, dtype=torch.float).reciprocal()
print(class_weights)
# Sample output from an earlier run: tensor([0.1429, 0.0263, 0.0020, 0.0022, 0.0070, 0.0714])

In [None]:
# Index the per-class weights with each sample's label to get one weight per training sample.
class_weights_all = class_weights[target_list]

In [None]:
# Draw as many samples per epoch as there are training samples, with replacement,
# using the per-sample inverse-class-frequency weights computed above.
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [None]:
# Training hyperparameters for the PyTorch model.
#EPOCHS = 300
EPOCHS = 11
BATCH_SIZE = 16
LEARNING_RATE = 0.0007
# Input width comes from the feature frame X built when the dataset was loaded.
NUM_FEATURES = len(X.columns)
# Matches the six entries of class_dict.
NUM_CLASSES = 6

In [None]:
# Training batches are drawn through the weighted sampler (class-rebalanced, with replacement).
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=weighted_sampler)

# Validation and test are served one sample per batch, in dataset order.
val_loader = DataLoader(dataset=val_dataset, batch_size=1)

test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [None]:
class MulticlassClassification(nn.Module):
    """Fully connected classifier: num_feature -> 512 -> 128 -> 64 -> num_class.

    Every hidden layer is followed by batch normalisation and ReLU; dropout
    (p=0.2) is applied after the second and third hidden layers only. The
    output layer returns raw logits (no softmax).
    """

    def __init__(self, num_feature, num_class):
        super().__init__()

        # Linear layers, created in input-to-output order.
        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class)

        # Shared activation / regularisation modules and one batch norm per hidden width.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        # First hidden block: linear -> batch norm -> ReLU (no dropout here).
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))

        # Second and third hidden blocks add dropout after the activation.
        for linear, norm in ((self.layer_2, self.batchnorm2), (self.layer_3, self.batchnorm3)):
            hidden = self.dropout(self.relu(norm(linear(hidden))))

        return self.layer_out(hidden)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Build the network and move it to the selected device.
model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

# NOTE(review): the same inverse-frequency class_weights also drive the WeightedRandomSampler
# behind train_loader, so class imbalance is compensated twice — confirm this is intended.
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
model

In [None]:
def multi_acc(y_pred, y_test):
    """Batch accuracy in percent, rounded to the nearest whole number.

    Parameters
    ----------
    y_pred : torch.Tensor
        Class scores with classes along dimension 1 (the model's raw logits).
    y_test : torch.Tensor
        True class indices, one per row of ``y_pred``.

    Returns
    -------
    torch.Tensor
        Scalar tensor: 100 * (correct predictions / batch size), rounded.
    """
    # The predicted class is the arg-max over the class dimension. Taking it on the
    # raw scores gives the same index as taking it after log_softmax (a monotonic
    # transform) and matches how predictions are derived at test time.
    y_pred_tags = torch.argmax(y_pred, dim = 1)

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)

    return torch.round(acc * 100)

In [None]:
# Per-epoch history, filled in by the training loop: one list per split.
accuracy_stats = {split: [] for split in ('train', 'val')}

loss_stats = {split: [] for split in ('train', 'val')}

In [None]:
# Train for EPOCHS epochs. After each epoch run a full validation pass and append the
# per-batch mean loss and accuracy of both phases to loss_stats / accuracy_stats.
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
    
    # TRAINING
    # Running sums over batches; divided by the number of batches when reported.
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch)
        # multi_acc returns a whole-number percentage for this batch.
        train_acc = multi_acc(y_train_pred, y_train_batch)
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
        
        
    # VALIDATION    
    with torch.no_grad():
        
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        # eval() switches dropout off and makes batch norm use its running statistics.
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            y_val_pred = model(X_val_batch)
                        
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)
            
            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()
            
    # Means are taken over the number of batches in each loader.
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
                              
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

In [None]:
# Predict a class index for every test sample, collected as one flat list of ints.
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        # Predicted class = index of the largest logit in each row.
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        # extend() with the batch's predictions keeps the list flat for any batch size;
        # squeezing each batch separately only produced scalars when batch_size was 1.
        y_pred_list.extend(y_pred_tags.cpu().tolist())
y_pred_list[:5]

In [None]:
#11 epochs
# Accuracy, macro F1 over the labels present in y_test, and the notebook's label-binarized
# ROC AUC for the predictions currently held in y_pred_list.
acc = accuracy_score(y_test, y_pred_list)
f1 = f1_score(y_test, y_pred_list, average='macro', labels=np.unique(y_test))
roc_auc = multiclass_roc_auc_score(y_test, y_pred_list)
acc, f1, roc_auc

In [None]:
#300 epochs
# NOTE(review): this cell recomputes the same metrics from the same y_pred_list as the
# previous cell, and EPOCHS is currently set to 11. Retrain with EPOCHS = 300 and re-run
# the prediction cell before reading these numbers as the 300-epoch result.
acc = accuracy_score(y_test, y_pred_list)
f1 = f1_score(y_test, y_pred_list, average='macro', labels=np.unique(y_test))
roc_auc = multiclass_roc_auc_score(y_test, y_pred_list)
acc, f1, roc_auc

In [None]:
# Sanity check: which class codes occur in the test labels.
np.unique(y_test)

In [None]:
# Show the test labels as rebound by the PyTorch data-preparation cells.
y_test