In [1]:
# Environment setup and module import
import torch
import time
from torch.utils import data
import torch.nn as nn
import pandas as pd
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
debug_encoding = True # if True, uses existing mini_train_encoded.csv file so this runs fast
save_data = False # default: False. This is for diagnostic purposes.  Remove later.

In [3]:
class Model(nn.Module):

    def __init__(self, numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        self.batch_norm_num = nn.BatchNorm1d(numerical_cols)
        all_layers = []
        input_size = numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_data):
        x_data = self.batch_norm_num(x_data)
        x = torch.cat([x_data], 1)
        x = self.layers(x)
        return x

In [4]:
# Import DF from CSV

if debug_encoding == True:
    dataset = 'data/mini_train_encoded.csv'
else:
    dataset = 'data/train_encoded.csv'

df = pd.read_csv(dataset)


In [5]:
# Remove hasdetections from the df.  Make it a new df for labels
label_df = pd.DataFrame()
label_df = df['HasDetections'].to_numpy()
df.drop(columns=['HasDetections'])

# Convert dtypes to be all identifical for pytorch
df.astype('float64').dtypes

# Iterate and collect all data values
cols = []
for c in df.columns:
    cols.append(df[c].values)
    
data = np.stack(cols, 1)
data = torch.tensor(data, dtype=torch.float)
output = torch.tensor(label_df)

print(data.shape)
print(output.shape)



torch.Size([312251, 150])
torch.Size([312251])


In [6]:
model = Model(data.shape[1], 2, [200,100,50], p=0.4)
print("Model details:")
print(model)

# Using CrossEntropyLoss because we effectively have an unbalanced training set
# In other words, not all inputs are normalized
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Model details:
Model(
  (batch_norm_num): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=150, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=2, bias=True)
  )
)


In [7]:
epochs = 300
aggregated_losses = []

print("Starting NN training.")
start = time.time()

for i in range(epochs):
    i += 1
    y_pred = model(data)
    single_loss = loss_function(y_pred, output)
    aggregated_losses.append(single_loss)

    if i%10 == 1:
        print(f'Training epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'Training epoch: {i:3} loss: {single_loss.item():10.10f}')
print("NN training complete.  Minutes elapsed:", (time.time()-start)/60)

Starting NN training.
Training epoch:   1 loss: 0.76705360
Training epoch:  11 loss: 0.33334520
Training epoch:  21 loss: 0.11907870
Training epoch:  31 loss: 0.05106500
Training epoch:  41 loss: 0.02711911
Training epoch:  51 loss: 0.01628671
Training epoch:  61 loss: 0.01104274
Training epoch:  71 loss: 0.00757197
Training epoch:  81 loss: 0.00564454
Training epoch:  91 loss: 0.00437213
Training epoch: 101 loss: 0.00335900
Training epoch: 111 loss: 0.00272951
Training epoch: 121 loss: 0.00229450
Training epoch: 131 loss: 0.00206917
Training epoch: 141 loss: 0.00184323
Training epoch: 151 loss: 0.00153360
Training epoch: 161 loss: 0.00137807
Training epoch: 171 loss: 0.00111983
Training epoch: 181 loss: 0.00105158
Training epoch: 191 loss: 0.00095014
Training epoch: 201 loss: 0.00090918
Training epoch: 211 loss: 0.00090220
Training epoch: 221 loss: 0.00080984
Training epoch: 231 loss: 0.00074440
Training epoch: 241 loss: 0.00071771
Training epoch: 251 loss: 0.00072337
Training epoch: 

In [57]:
# Import dev DF from CSV

if debug_encoding == True:
    dataset = 'data/mini_dev_encoded.csv'
else:
    dataset = 'data/dev_encoded.csv'

df_dev = pd.read_csv(dataset)


In [58]:
# Remove hasdetections from the df.  Make it a new df for labels
labels = df_dev['HasDetections'].to_numpy()
df_dev.drop(columns=['HasDetections'])

# Convert dtypes to be all identifical for pytorch
df_dev.astype('float64').dtypes

# Iterate and collect all data values
cols = []
for c in df_dev.columns:
    cols.append(df_dev[c].values)
    
data = np.stack(cols, 1)
data = torch.tensor(data, dtype=torch.float)

print(data.shape)
print(labels.shape)

torch.Size([66911, 150])
(66911,)


In [59]:
# make predictions
model.eval() # switch into eval mode
output = model(data)
predictions = np.argmax(output.cpu().data.numpy(), axis=1)
print(predictions.shape)

(66911,)


In [60]:
from sklearn import metrics
print("accuracy:", metrics.accuracy_score(y_true=labels, y_pred=predictions))
print(predictions[:10])
print(labels[:10])

accuracy: 0.9999551643227571
[0 1 1 0 1 0 1 1 1 0]
[0 1 1 0 1 0 1 1 1 0]


In [8]:
if save_data == True:
    print("To implement: predictions vs actual and NN weights, once we move to test data")