In [1]:
# Environment setup and module import
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn import metrics
from torch.utils import data

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# Notebook-wide switches.
# debug_encoding: when True, the load cells read the existing "mini" CSVs
#   (data/mini_train_encoded.csv, data/mini_test_encoded.csv) instead of the
#   full encoded files, so the notebook runs fast.
# save_data: diagnostic switch, default False. The final cell currently only
#   prints a placeholder message when it is True. Remove later.
debug_encoding = True
save_data = False

In [3]:
class Model(nn.Module):
    """Fully connected network over purely numerical features.

    The input is batch-normalised and then passed through one
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` group per entry in ``layers``,
    followed by a final ``Linear`` layer that emits ``output_size`` raw scores
    per sample (no softmax is applied here).
    """

    def __init__(self, numerical_cols, output_size, layers, p=0.4):
        """Build the network.

        Args:
            numerical_cols: Number of input features per sample.
            output_size: Number of values emitted per sample.
            layers: Hidden-layer widths, in order. May be empty, in which case
                the network is the input batch-norm followed by a single
                ``Linear(numerical_cols, output_size)``.
            p: Dropout probability used after every hidden group.
        """
        super().__init__()
        self.batch_norm_num = nn.BatchNorm1d(numerical_cols)

        all_layers = []
        input_size = numerical_cols
        for width in layers:
            all_layers.extend([
                nn.Linear(input_size, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ])
            input_size = width

        # Size the output layer from the running input_size rather than
        # layers[-1], so an empty `layers` no longer raises IndexError.
        all_layers.append(nn.Linear(input_size, output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_data):
        """Return the raw scores for a batch of feature rows.

        Args:
            x_data: Float tensor whose second dimension has ``numerical_cols``
                entries.

        Returns:
            The output of the final ``Linear`` layer.
        """
        x = self.batch_norm_num(x_data)
        return self.layers(x)

In [4]:
# Import DF from CSV.
# With debug_encoding set, read the pre-built "mini" training file; otherwise
# read the full encoded training file.
if debug_encoding:
    dataset = 'data/mini_train_encoded.csv'
else:
    dataset = 'data/train_encoded.csv'

df = pd.read_csv(dataset)


In [5]:
# Split the frame into the 'HasDetections' labels and the feature columns.
# The feature frame is derived without mutating `df`, so this cell can be
# re-run without hitting a KeyError on an already-dropped column.
label_df = df['HasDetections'].to_numpy()
features_df = df.drop(columns=['HasDetections'])

# Cast every feature column to float64 so the dtypes are identical for pytorch,
# then convert the whole frame to one (rows, columns) array in a single step.
data = torch.tensor(features_df.astype(np.float64).to_numpy(), dtype=torch.float)
output = torch.tensor(label_df)

print(data.shape)
print(output.shape)



torch.Size([312251, 150])
torch.Size([312251])


In [6]:
# One input per feature column, two output scores, four hidden layers.
model = Model(
    numerical_cols=data.shape[1],
    output_size=2,
    layers=[180, 150, 100, 20],
    p=0.4,
)
print("Model details:", model, sep="\n")

# CrossEntropyLoss consumes the two raw scores the model emits per sample.
# It is constructed without class weights, so the loss itself does nothing to
# compensate for any class imbalance in the labels.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Model details:
Model(
  (batch_norm_num): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=150, out_features=180, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=180, out_features=150, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=150, out_features=100, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=100, out_features=20, bias=True)
    (13): ReLU(inplace=True)
    (14): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dro

In [None]:
# Full-batch training: every epoch runs the whole `data` tensor through the
# model once and takes a single optimizer step.
epochs = 81
aggregated_losses = []

print("Starting NN training.")
start = time.time()

# Make sure dropout / batch-norm are in training mode even when this cell is
# re-run after the prediction cell has called model.eval().
model.train()

for i in range(1, epochs + 1):
    y_pred = model(data)
    single_loss = loss_function(y_pred, output)
    # Store a detached copy: appending the live loss tensor would keep each
    # epoch's autograd graph alive for as long as the list exists.
    aggregated_losses.append(single_loss.detach())

    # Report on epochs 1, 11, 21, ...
    if i % 10 == 1:
        print(f'Training epoch: {i:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

# Loss of the last forward pass (computed before the final optimizer step).
print(f'Training epoch: {i:3} loss: {single_loss.item():10.10f}')
print("NN training complete.  Minutes elapsed:", (time.time()-start)/60)

Starting NN training.
Training epoch:   1 loss: 0.78131336


In [None]:
# Import dev DF from CSV.
# With debug_encoding set, read the pre-built "mini" test file; otherwise read
# the full encoded test file.
if debug_encoding:
    dataset = 'data/mini_test_encoded.csv'
else:
    dataset = 'data/test_encoded.csv'

df_dev = pd.read_csv(dataset)


In [None]:
# Split the dev frame into the 'HasDetections' labels and the feature columns.
# The feature frame is derived without mutating `df_dev`, so this cell can be
# re-run without hitting a KeyError on an already-dropped column.
labels = df_dev['HasDetections'].to_numpy()
dev_features_df = df_dev.drop(columns=['HasDetections'])

# Cast every feature column to float64 so the dtypes are identical for pytorch,
# then convert the whole frame to one (rows, columns) array in a single step.
# NOTE: from here on `data` holds the dev features, not the training features.
data = torch.tensor(dev_features_df.astype(np.float64).to_numpy(), dtype=torch.float)

print(data.shape)
print(labels.shape)

In [None]:
# make predictions
model.eval() # switch into eval mode (dropout off, batch-norm uses running stats)
# Inference only: skip autograd bookkeeping so no graph is built for the
# forward pass over the whole dev tensor.
with torch.no_grad():
    output = model(data)
# The predicted class is the index of the larger of the two output scores.
predictions = np.argmax(output.cpu().numpy(), axis=1)
print(predictions.shape)

In [None]:
# Fraction of dev rows whose predicted class equals the true label.
# `metrics` is sklearn.metrics, imported with the other modules in the first cell.
print("accuracy:", metrics.accuracy_score(y_true=labels, y_pred=predictions))
# Eyeball the first 30 predictions against the first 30 true labels.
print(predictions[:30])
print(labels[:30])

In [None]:
# Placeholder for the diagnostic dump; nothing is written yet.
if save_data:
    print("To implement: predictions vs actual and NN weights, once we move to test data")