In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize, LabelBinarizer, MinMaxScaler

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from tqdm.notebook import tqdm

In [3]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute a ROC AUC score for multiclass label predictions.

    Both label vectors are one-hot encoded with a ``LabelBinarizer`` fitted on
    ``y_test``, and the encoded arrays are handed to ``roc_auc_score``.
    A per-class ROC curve is also drawn on a scratch figure; that figure is
    always closed before returning, so nothing is displayed.

    The curve labels come from the module-level ``class_dict``; its keys are
    matched to the binarizer columns by position.

    Note: ``y_pred`` is binarized exactly like ``y_test``, i.e. the score is
    computed from 0/1 indicator columns rather than from class probabilities.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels.
        average: forwarded unchanged to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    fig, c_ax = plt.subplots(1, 1, figsize=(12, 8))
    try:
        target = list(class_dict.keys())
        lb = LabelBinarizer()
        lb.fit(y_test)
        y_test = lb.transform(y_test)
        y_pred = lb.transform(y_pred)

        for idx, c_label in enumerate(target):
            fpr, tpr, _ = roc_curve(y_test[:, idx].astype(int), y_pred[:, idx])
            c_ax.plot(fpr, tpr, label='%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr)))
        # Chance diagonal, drawn from fixed end points instead of whatever
        # `fpr` the last loop iteration happened to leave behind.
        c_ax.plot([0, 1], [0, 1], 'b-', label='Random Guessing')
    finally:
        # Close this specific figure even when an error occurs above, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
    return roc_auc_score(y_test, y_pred, average=average)

In [4]:
def get_class_distribution(obj):
    """Count how many items of ``obj`` equal 0, 1 and 2.

    Returns a dict with keys 'A', 'B' and 'C' holding the number of items
    equal to 0, 1 and 2 respectively. Items matching none of these are not
    counted; 'Check classes' is printed once for each of them.
    """
    code_to_name = ((0, 'A'), (1, 'B'), (2, 'C'))
    count_dict = {name: 0 for _, name in code_to_name}

    for item in obj:
        for code, name in code_to_name:
            if item == code:
                count_dict[name] += 1
                break
        else:
            # None of the known class codes matched this item.
            print('Check classes')

    return count_dict

In [5]:
class ClassifierDataset(Dataset):
    """Dataset that pairs a feature container with a label container.

    Both containers are stored exactly as passed in and are indexed with the
    same key when an item is requested.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The reported length is taken from the features only.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target

In [10]:
nn_df = pd.read_csv('data/dataset_10000.csv')

# Encode the class names as integer ids. The result is assigned back to the
# frame (instead of an in-place replace on a column selection, which is not
# guaranteed to write through to nn_df) and cast to int64 so the labels can be
# turned into torch tensors later. Values that are not keys of class_dict are
# passed through unchanged; the cast fails loudly if any of them is not an
# integer.
class_dict = {'A':0, 'B':1, 'C':2}
nn_df['label'] = nn_df['label'].map(lambda v: class_dict.get(v, v)).astype('int64')

# Features are every column but the last; the last column is the target.
X = nn_df.iloc[:, 0:-1]
y = nn_df.iloc[:, -1]

# Later cells use X_val / y_val (masking, saving, val_dataset, val_loader), so
# the validation split has to be created here: 80/20 train+val vs. test, then
# 90/10 train vs. val, both stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=42)

# Fit the scaler on the training features only and reuse it for the other
# splits so no validation/test statistics leak into the scaling.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)
X_train

array([[0.71428571, 0.07692308, 0.83673469],
       [0.85714286, 0.23076923, 0.33673469],
       [0.39285714, 0.69230769, 0.69387755],
       ...,
       [0.        , 0.30769231, 0.76530612],
       [0.5       , 0.30769231, 0.2244898 ],
       [0.        , 0.38461538, 0.44897959]])

In [11]:
# Randomly replace a share of the values in every column with 0.
def insert_zeros(arr, keep_frac=0.5, random_state=None):
    """Return a copy of ``arr`` in which part of every column is set to 0.

    For each column independently, a random sample of ``keep_frac`` of the
    rows keeps its value and every other entry becomes 0. NaNs already present
    in ``arr`` are replaced by 0 as well.

    Args:
        arr: 2-D data accepted by ``pd.DataFrame``.
        keep_frac: fraction of rows per column whose value is kept. The
            default of 0.5 is the fraction this function always used.
        random_state: ``None`` keeps pandas' default (unseeded) sampling. An
            integer seed or a ``np.random.RandomState`` makes the result
            reproducible.

    Returns:
        ``np.ndarray`` with the same shape as ``arr``.
    """
    if isinstance(random_state, (int, np.integer)):
        # Share one generator across columns: handing the same integer seed
        # to every column would zero the same rows in all of them.
        random_state = np.random.RandomState(random_state)

    # Work on a private copy so the caller's data is never modified.
    df = pd.DataFrame(arr, copy=True)
    for col in df.columns:
        # Assignment aligns on the index, so rows that were not sampled
        # become NaN and are turned into 0 below.
        df[col] = df[col].sample(frac=keep_frac, random_state=random_state)
    return np.array(df.fillna(0))

In [12]:
# Zero-mask the train, validation and test features (in that order) and
# rebind the same names to the masked arrays.
# NOTE(review): because the inputs are overwritten, running this cell again
# masks the already-masked arrays a second time.
X_train, X_val, X_test = map(insert_zeros, (X_train, X_val, X_test))
X_train

array([[0.        , 0.        , 0.        ],
       [0.85714286, 0.        , 0.        ],
       [0.        , 0.69230769, 0.69387755],
       ...,
       [0.        , 0.        , 0.76530612],
       [0.        , 0.        , 0.        ],
       [0.        , 0.38461538, 0.44897959]])

In [13]:
# Save the masked splits to text files.
# The feature matrices hold min-max scaled floats, so they are written with a
# float format; '%d' would truncate every value to its integer part.
np.savetxt('data/zeros/X_train.txt', X_train, fmt='%.8f')
np.savetxt('data/zeros/X_val.txt', X_val, fmt='%.8f')
np.savetxt('data/zeros/X_test.txt', X_test, fmt='%.8f')

# Labels are integer class ids, so an integer format is correct here.
np.savetxt('data/zeros/y_train.txt', y_train, fmt='%d')
np.savetxt('data/zeros/y_val.txt', y_val, fmt='%d')
np.savetxt('data/zeros/y_test.txt', y_test, fmt='%d')

In [9]:
# Wrap each split as float32 features + int64 labels.
# All three label vectors go through np.array first, so every label tensor is
# built from its own copy and array-likes other than ndarrays are accepted;
# previously y_val alone was passed to torch.from_numpy directly.
train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(np.array(y_train)).long())

val_dataset = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(np.array(y_val)).long())

test_dataset = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(np.array(y_test)).long())

In [10]:
# Gather the label of every training sample, in dataset order, into one tensor.
target_list = torch.tensor([label for _, label in train_dataset])

In [11]:
# Inverse-frequency weight per class, in the key order of the dict returned by
# get_class_distribution.
class_count = list(get_class_distribution(y_train).values())
class_weights = torch.tensor(class_count, dtype=torch.float).reciprocal()
print(class_weights)

tensor([0.0004, 0.0003, 0.0008])


In [12]:
# Per-sample weights: index class_weights with each training sample's label.
class_weights_all = class_weights[target_list]

In [13]:
# Sampler used by the training loader: draws len(class_weights_all) indices
# with replacement, weighting each training sample by class_weights_all.
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [14]:
# Training hyperparameters and model dimensions.
EPOCHS = 300
BATCH_SIZE = 16
LEARNING_RATE = 0.0007
NUM_FEATURES = len(X.columns)  # one model input per column of X
NUM_CLASSES = 3  # NOTE(review): hard-coded; equals the number of entries in class_dict

In [15]:
# Training batches of BATCH_SIZE samples, with indices supplied by weighted_sampler.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, sampler=weighted_sampler)

# Validation and test loaders yield one sample per batch (batch_size=1).
val_loader = DataLoader(dataset=val_dataset, batch_size=1)

test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [16]:
class MulticlassClassification(nn.Module):
    """Feed-forward classifier: three hidden Linear layers (512, 128, 64 units).

    Every hidden layer is followed by batch normalisation and ReLU; dropout
    (p=0.2) is applied after the second and third hidden blocks only. The
    output layer returns one raw score per class, with no softmax applied.
    """

    def __init__(self, num_feature, num_class):
        super().__init__()

        # Linear layers: num_feature -> 512 -> 128 -> 64 -> num_class.
        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class)

        # Activation / regularisation modules shared by the hidden blocks.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

        # One batch-norm module per hidden layer width.
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Map a batch of feature rows to per-class scores."""
        # Block 1: linear -> batch norm -> ReLU (no dropout here).
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))
        # Blocks 2 and 3: linear -> batch norm -> ReLU -> dropout.
        hidden = self.dropout(self.relu(self.batchnorm2(self.layer_2(hidden))))
        hidden = self.dropout(self.relu(self.batchnorm3(self.layer_3(hidden))))
        return self.layer_out(hidden)

In [17]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [18]:
# Build the network and move its parameters to the selected device.
model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

# Class-weighted cross-entropy loss, optimised with Adam.
# NOTE(review): class_weights also drives weighted_sampler for the training
# loader, so class imbalance is compensated both by the sampling and by the
# loss — confirm that applying both is intended.
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
model

MulticlassClassification(
  (layer_1): Linear(in_features=3, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [19]:
def multi_acc(y_pred, y_test):
    """Return batch accuracy as a whole-number percentage (0-dim tensor).

    ``y_pred`` holds one row of raw class scores per sample and ``y_test``
    the matching class indices. The predicted class is the arg-max of the
    log-softmax of each row; the fraction of matches is scaled to percent
    and rounded to the nearest integer.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted = torch.max(log_probs, dim=1).indices

    hits = (predicted == y_test).float()
    return torch.round(hits.sum() / len(hits) * 100)

In [20]:
# Per-epoch history of accuracy and loss; each split gets its own fresh list.
accuracy_stats = {split: [] for split in ('train', 'val')}

loss_stats = {split: [] for split in ('train', 'val')}

In [21]:
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):

    # ---- TRAINING: one optimisation pass over train_loader ----
    model.train()
    train_epoch_loss, train_epoch_acc = 0, 0
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)

        optimizer.zero_grad()
        y_train_pred = model(X_train_batch)

        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)

        train_loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss and accuracy for this epoch.
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

    # ---- VALIDATION: forward passes only, no gradient tracking ----
    model.eval()
    val_epoch_loss, val_epoch_acc = 0, 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # ---- BOOK-KEEPING: epoch figures are the mean of the per-batch values ----
    mean_train_loss = train_epoch_loss/len(train_loader)
    loss_stats['train'].append(mean_train_loss)
    mean_val_loss = val_epoch_loss/len(val_loader)
    loss_stats['val'].append(mean_val_loss)
    mean_train_acc = train_epoch_acc/len(train_loader)
    accuracy_stats['train'].append(mean_train_acc)
    mean_val_acc = val_epoch_acc/len(val_loader)
    accuracy_stats['val'].append(mean_val_acc)

    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

Begin training.


  0%|          | 0/300 [00:00<?, ?it/s]

Epoch 001: | Train Loss: 0.85879 | Val Loss: 1.04060 | Train Acc: 49.700| Val Acc: 42.125
Epoch 002: | Train Loss: 0.80408 | Val Loss: 1.02774 | Train Acc: 52.553| Val Acc: 44.250
Epoch 003: | Train Loss: 0.78021 | Val Loss: 0.99059 | Train Acc: 53.558| Val Acc: 46.500
Epoch 004: | Train Loss: 0.78467 | Val Loss: 0.99984 | Train Acc: 53.662| Val Acc: 43.875
Epoch 005: | Train Loss: 0.77324 | Val Loss: 1.01913 | Train Acc: 54.482| Val Acc: 43.750
Epoch 006: | Train Loss: 0.77668 | Val Loss: 0.93539 | Train Acc: 54.060| Val Acc: 46.000
Epoch 007: | Train Loss: 0.76765 | Val Loss: 0.97366 | Train Acc: 54.476| Val Acc: 45.875
Epoch 008: | Train Loss: 0.74609 | Val Loss: 1.00072 | Train Acc: 55.673| Val Acc: 45.375
Epoch 009: | Train Loss: 0.74669 | Val Loss: 0.93276 | Train Acc: 55.451| Val Acc: 46.875
Epoch 010: | Train Loss: 0.75164 | Val Loss: 0.95804 | Train Acc: 55.807| Val Acc: 47.750
Epoch 011: | Train Loss: 0.74224 | Val Loss: 0.96554 | Train Acc: 55.507| Val Acc: 46.250
Epoch 012:

Epoch 093: | Train Loss: 0.67268 | Val Loss: 0.88049 | Train Acc: 58.064| Val Acc: 50.250
Epoch 094: | Train Loss: 0.67802 | Val Loss: 0.91282 | Train Acc: 58.811| Val Acc: 48.500
Epoch 095: | Train Loss: 0.68423 | Val Loss: 0.90057 | Train Acc: 58.582| Val Acc: 49.000
Epoch 096: | Train Loss: 0.67548 | Val Loss: 0.89646 | Train Acc: 58.296| Val Acc: 48.875
Epoch 097: | Train Loss: 0.66432 | Val Loss: 0.91533 | Train Acc: 59.547| Val Acc: 49.625
Epoch 098: | Train Loss: 0.68135 | Val Loss: 0.90043 | Train Acc: 58.233| Val Acc: 49.375
Epoch 099: | Train Loss: 0.68076 | Val Loss: 0.88975 | Train Acc: 58.202| Val Acc: 49.250
Epoch 100: | Train Loss: 0.67271 | Val Loss: 0.90561 | Train Acc: 58.956| Val Acc: 50.125
Epoch 101: | Train Loss: 0.67430 | Val Loss: 0.88535 | Train Acc: 58.296| Val Acc: 49.750
Epoch 102: | Train Loss: 0.66216 | Val Loss: 0.91781 | Train Acc: 60.007| Val Acc: 48.875
Epoch 103: | Train Loss: 0.69146 | Val Loss: 0.92730 | Train Acc: 58.162| Val Acc: 49.125
Epoch 104:

Epoch 185: | Train Loss: 0.65357 | Val Loss: 0.87574 | Train Acc: 59.920| Val Acc: 50.375
Epoch 186: | Train Loss: 0.66437 | Val Loss: 0.97140 | Train Acc: 59.871| Val Acc: 46.750
Epoch 187: | Train Loss: 0.66391 | Val Loss: 0.89030 | Train Acc: 59.024| Val Acc: 48.875
Epoch 188: | Train Loss: 0.68344 | Val Loss: 0.90549 | Train Acc: 57.787| Val Acc: 48.375
Epoch 189: | Train Loss: 0.66226 | Val Loss: 0.95354 | Train Acc: 59.524| Val Acc: 46.625
Epoch 190: | Train Loss: 0.65906 | Val Loss: 0.90834 | Train Acc: 59.922| Val Acc: 49.250
Epoch 191: | Train Loss: 0.65095 | Val Loss: 0.90049 | Train Acc: 59.324| Val Acc: 50.125
Epoch 192: | Train Loss: 0.66736 | Val Loss: 0.93275 | Train Acc: 58.796| Val Acc: 49.500
Epoch 193: | Train Loss: 0.64491 | Val Loss: 0.90310 | Train Acc: 60.253| Val Acc: 49.750
Epoch 194: | Train Loss: 0.66277 | Val Loss: 0.89640 | Train Acc: 59.378| Val Acc: 50.625
Epoch 195: | Train Loss: 0.65675 | Val Loss: 0.93938 | Train Acc: 60.167| Val Acc: 49.500
Epoch 196:

Epoch 277: | Train Loss: 0.66067 | Val Loss: 0.90150 | Train Acc: 59.591| Val Acc: 48.375
Epoch 278: | Train Loss: 0.65100 | Val Loss: 0.90308 | Train Acc: 59.809| Val Acc: 48.375
Epoch 279: | Train Loss: 0.65059 | Val Loss: 0.92179 | Train Acc: 59.482| Val Acc: 48.625
Epoch 280: | Train Loss: 0.64721 | Val Loss: 0.90926 | Train Acc: 60.196| Val Acc: 49.750
Epoch 281: | Train Loss: 0.63391 | Val Loss: 0.92239 | Train Acc: 60.416| Val Acc: 49.750
Epoch 282: | Train Loss: 0.64890 | Val Loss: 0.94679 | Train Acc: 60.371| Val Acc: 48.125
Epoch 283: | Train Loss: 0.67303 | Val Loss: 0.91454 | Train Acc: 59.022| Val Acc: 49.375
Epoch 284: | Train Loss: 0.66537 | Val Loss: 0.91742 | Train Acc: 59.371| Val Acc: 48.000
Epoch 285: | Train Loss: 0.64777 | Val Loss: 0.95657 | Train Acc: 59.462| Val Acc: 47.500
Epoch 286: | Train Loss: 0.64310 | Val Loss: 0.90709 | Train Acc: 60.573| Val Acc: 49.250
Epoch 287: | Train Loss: 0.66075 | Val Loss: 0.93704 | Train Acc: 59.838| Val Acc: 48.000
Epoch 288:

In [22]:
# Predict a class for every test sample.
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        # Predicted class = index of the largest score in each row.
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        # Extend with plain ints so the result is one flat list whatever the
        # loader's batch size is (squeezing each batch only gave a flat list
        # when every batch held exactly one sample).
        y_pred_list.extend(y_pred_tags.cpu().tolist())
y_pred_list[:5]

[2, 2, 2, 0, 0]

In [24]:
# Display the true test labels for comparison with the predictions.
y_test

array([2, 0, 1, ..., 2, 1, 1], dtype=int64)

In [25]:
# Test-set metrics: accuracy, macro-averaged F1 and the ROC AUC from
# multiclass_roc_auc_score.
# NOTE(review): y_pred_list holds predicted class labels, not probabilities,
# so the ROC AUC here is computed from hard predictions.
acc = accuracy_score(y_test, y_pred_list)
f1 = f1_score(y_test, y_pred_list, average='macro', labels=np.unique(y_test))
roc_auc = multiclass_roc_auc_score(y_test, y_pred_list)
acc, f1, roc_auc

(0.4875, 0.47288755840699054, 0.6907433246633445)