In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
with open('training_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
type(data)

dict

In [4]:
data.keys()

dict_keys(['X', 'Y'])

In [5]:
X = data['X']

In [6]:
Y = data['Y']

In [7]:
type(X), type(Y)

(numpy.ndarray, numpy.ndarray)

In [8]:
X.shape, Y.shape

((6000, 502), (6000, 1))

In [9]:
Y.size

6000

In [10]:
np.unique(Y, return_counts=True)

(array([0, 1, 5], dtype=int64), array([2000, 2000, 2000], dtype=int64))

In [11]:
Y[Y > 1] = 2
Y = Y.flatten()
Y = pd.get_dummies(Y)
Y =  Y.to_numpy()

In [12]:
Y.shape

(6000, 3)

In [13]:
import string
from tqdm import tqdm
import random
import torch
import torch.nn as nn
from torch.nn import ReLU, Tanh,Sigmoid
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, IterableDataset, DataLoader

In [14]:
class ClassifierDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        return self.X[index], self.Y[index]
        
    def __len__ (self):
        return len(self.Y)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
#70% train, 20% validation, 10% test
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.20, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.125, random_state=42)

In [17]:
X_train[:5]

array([[0.8897189 , 0.89118385, 0.8985018 , ..., 0.6807071 , 0.676409  ,
        0.6761962 ],
       [0.8698481 , 0.8754975 , 0.8797764 , ..., 0.6197327 , 0.61862373,
        0.6150738 ],
       [0.76998335, 0.77002126, 0.76289666, ..., 0.5952492 , 0.5978094 ,
        0.595771  ],
       [0.70036554, 0.6966747 , 0.6959275 , ..., 0.4321368 , 0.4293894 ,
        0.42774132],
       [0.6525331 , 0.6589684 , 0.6559166 , ..., 0.2711771 , 0.2698329 ,
        0.2713656 ]], dtype=float32)

In [18]:
Y_train[:5]

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0]], dtype=uint8)

In [19]:
train_data = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(Y_train).long())
val_data = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(Y_val).long())
test_data = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(Y_test).long())

In [20]:
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.0001
NUM_CLASS = 3

In [21]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size = BATCH_SIZE)
val_loader = torch.utils.data.DataLoader(val_data, batch_size = BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = BATCH_SIZE)

In [22]:
class Net(nn.Module):
    def __init__(self, num_feature, num_class):
        super(Net, self).__init__()
        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class) 

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
      
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.layer_out(x)

        return x

model = Net(num_feature = X.shape[1], num_class=NUM_CLASS)
print(model)

Net(
  (layer_1): Linear(in_features=502, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [24]:
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model) 

Net(
  (layer_1): Linear(in_features=502, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [25]:
def prediction_acc(y_pred, y_true):
    '''Returns prediction accuracy for a batch of output'''
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    

    correct_pred = (y_pred_tags == y_true).float()
    acc = correct_pred.sum() / len(correct_pred)
    acc = acc * 100
    return acc

In [26]:
#Dictionary to store accuracy and loss stats
accuracy_stats = {
    'train': [],
    "val": []
}
loss_stats = {
    'train': [],
    "val": []
}

In [27]:
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    
    for i, data in enumerate(train_loader):
        X_train_batch, y_train_batch = data
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        labels = torch.max(y_train_batch, 1)[1]
        optimizer.zero_grad()

        y_train_pred = model(X_train_batch)

        train_loss = criterion(y_train_pred, labels)
        train_acc = prediction_acc(y_train_pred, labels)

        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
        #print(train_acc.item())


    # Validation    
    with torch.no_grad():
        
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            labels = torch.max(y_val_batch, 1)[1]
            
            y_val_pred = model(X_val_batch)
            #print(y_val_pred)
                        
            val_loss = criterion(y_val_pred, labels)
            val_acc = prediction_acc(y_val_pred, labels)
            
            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()
            #print(val_acc.item())
            
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(test_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(test_loader))
                              
    #print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f}' )
    print(f'Val Loss: {val_epoch_loss/len(test_loader):.5f}' ) 
    print(f'Train Acc: {train_epoch_acc/len(train_loader):.3f}' )
    print(f'Val Acc: {val_epoch_acc/len(test_loader):.3f}')
    
    
MODEL_PATH = 'model.pth'
torch.save(model, MODEL_PATH)

  0%|                                                                                                                                                                  | 0/10 [00:00<?, ?it/s]

Begin training.


 10%|███████████████▍                                                                                                                                          | 1/10 [00:01<00:13,  1.45s/it]

Epoch 001: | Train Loss: 0.50413
Val Loss: 0.62140
Train Acc: 91.832
Val Acc: 190.000


 20%|██████████████████████████████▊                                                                                                                           | 2/10 [00:01<00:08,  1.10s/it]

Epoch 002: | Train Loss: 0.24582
Val Loss: 0.26437
Train Acc: 99.929
Val Acc: 190.000


 30%|██████████████████████████████████████████████▏                                                                                                           | 3/10 [00:02<00:06,  1.16it/s]

Epoch 003: | Train Loss: 0.16896
Val Loss: 0.20115
Train Acc: 100.000
Val Acc: 190.000


 40%|█████████████████████████████████████████████████████████████▌                                                                                            | 4/10 [00:02<00:04,  1.46it/s]

Epoch 004: | Train Loss: 0.12687
Val Loss: 0.13996
Train Acc: 100.000
Val Acc: 190.000


 50%|█████████████████████████████████████████████████████████████████████████████                                                                             | 5/10 [00:02<00:02,  1.77it/s]

Epoch 005: | Train Loss: 0.10034
Val Loss: 0.10732
Train Acc: 100.000
Val Acc: 190.000


 60%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 6/10 [00:02<00:01,  2.07it/s]

Epoch 006: | Train Loss: 0.08203
Val Loss: 0.09256
Train Acc: 100.000
Val Acc: 190.000


 70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 7/10 [00:03<00:01,  2.37it/s]

Epoch 007: | Train Loss: 0.06803
Val Loss: 0.06606
Train Acc: 100.000
Val Acc: 190.000


 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 8/10 [00:03<00:00,  2.63it/s]

Epoch 008: | Train Loss: 0.05649
Val Loss: 0.06369
Train Acc: 100.000
Val Acc: 190.000


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 9/10 [00:03<00:00,  2.86it/s]

Epoch 009: | Train Loss: 0.04887
Val Loss: 0.05314
Train Acc: 100.000
Val Acc: 190.000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.49it/s]

Epoch 010: | Train Loss: 0.04131
Val Loss: 0.03894
Train Acc: 100.000
Val Acc: 190.000





In [61]:
def predict(model, test_data):
    '''Function to predict given test data'''
    return predict_batch(model, test_data)

In [78]:
def predict_batch(model, test_batch):
    '''Function to predict for a batch'''
    Y_test = np.zeros((test_batch.shape[0],1)) #Dummy label to pass to dataloader
    test_data = ClassifierDataset(torch.from_numpy(test_batch).float(), torch.from_numpy(Y_test).long())
    test_loader = torch.utils.data.DataLoader(test_data, batch_size = BATCH_SIZE)
    
    with torch.no_grad():
        model.eval()
        pred = []
        for x_batch,_ in test_loader:
            #x_test = x_batch.to(device)
            x_test = x_batch.to('cpu')
            y_test_pred = model(x_test)
            y_test_pred = torch.log_softmax(y_test_pred, dim = 1)
            _, prediction = torch.max(y_test_pred, dim = 1)
            prediction = prediction.detach().cpu().numpy()
            pred.append(prediction)
            
        pred= [i for sublist in pred for i in sublist]
        

    return np.array(pred)

In [42]:
Y_test_pred = predict_batch(model, X_test)

In [43]:
Y_test = np.argmax(Y_test, axis=-1) #True prediction

In [44]:
Y_test_pred.shape, Y_test.shape

((600,), (600,))

In [45]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [46]:
accuracy_score(Y_test_pred, Y_test)

1.0

In [47]:
Y_test_pred[441:450],Y_test[441:450]

(array([2, 0, 2, 2, 2, 1, 1, 1, 1], dtype=int64),
 array([2, 0, 2, 2, 2, 1, 1, 1, 1], dtype=int64))

In [64]:
x1 = np.random.rand(1,502)
predict(model, x1)

array([2], dtype=int64)

In [67]:
x1 = np.random.rand(10,502)
predict(model, x1)

array([2, 2, 2, 2, 1, 2, 2, 2, 2, 2], dtype=int64)

In [77]:
## RUN TIME on GPU

import timeit
model_cpu = model.to(device)

start = timeit.default_timer()
x1 = np.random.rand(10000,502)
predict(model, x1)

stop = timeit.default_timer()

print('Run Time: ', stop - start)  

Run Time:  0.2619105999999647


In [79]:
## RUN TIME on CPU

import timeit

model_cpu = model.to('cpu')
start = timeit.default_timer()


x1 = np.random.rand(10000,502)
predict_batch(model_cpu, x1)

stop = timeit.default_timer()

print('Run Time: ', stop - start)  

Run Time:  0.268890699999929


# Trying RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#70% train, 20% validation, 10% test
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.20, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.125, random_state=42)

In [None]:
#(n_estimators=500,  bootstrap = True, min_samples_split=10)
clf = RandomForestClassifier(n_estimators=500,  bootstrap = True, min_samples_split=10)
# Fit on training data
clf.fit(X_train, Y_train)

In [None]:
Y_val_pred=clf.predict(X_val)
print("\n\n\nRandom Forest Accuracy:",accuracy_score(Y_val_pred, Y_val))
print('Recall : ', recall_score(Y_val_pred, Y_val, average='macro'))
print('Prescision : ', precision_score(Y_val_pred, Y_val, average='macro'))

In [None]:
Y_test_pred=clf.predict(X_test)
print("\n\n\nRandom Forest Accuracy:",accuracy_score(Y_test_pred, Y_test))
print('Recall : ', recall_score(Y_test_pred, Y_test, average='macro'))
print('Prescision : ', precision_score(Y_test_pred, Y_test, average='macro'))