In [1]:
# Environment setup and module import
import torch
import time
import math
from torch.utils import data
import torch.nn as nn
import pandas as pd
import numpy as np



In [2]:
debug = False # if True, uses existing mini_train_encoded.csv file so this runs fast
save_data = False # default: False. This is for diagnostic purposes.  Remove later.

# parameters for NN training


if torch.cuda.is_available():
    device = torch.device('cuda')
    print(torch.cuda.get_device_properties(device))
    chunk_size = 1000000 if not debug else 5 # run in meaningful chunks so that GPU doesn't run out of memory
    epochs = 300 # can do this in 90 min
else:
    device = torch.device('cpu')
    print("running on CPU")
    chunk_size = 10000000 # load all the data at once
    epochs = 100 # run shorter because it takes a lot longer



_CudaDeviceProperties(name='GeForce GTX 1080', major=6, minor=1, total_memory=8192MB, multi_processor_count=20)


In [3]:
class Model(nn.Module):

    def __init__(self, numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        self.batch_norm_num = nn.BatchNorm1d(numerical_cols).to(device)
        all_layers = []
        input_size = numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)
        
    def forward(self, x_data):
        x_data = self.batch_norm_num(x_data)
        x = torch.cat([x_data], 1)
        x = self.layers(x)
        return x

In [4]:
# Import DF from CSV

if debug == True:
    dataset = 'data/mini_train_encoded.csv'
else:
    dataset = 'data/train_encoded.csv'

df = pd.read_csv(dataset)

# Remove hasdetections from the df.  Make it a new df for labels
labels = df['HasDetections'].to_numpy()
df.drop(columns=['HasDetections'], inplace=True)

# Iterate and convert all data values

cols = []
for c in df.columns:
    df[c] = df[c].astype(np.float64) # Convert dtypes to be all identical for pytorch

print(f"Tain data: {len(df)} rows, {len(df.columns)} columns")


Tain data: 6245038 rows, 150 columns


In [5]:
model = Model(len(df.columns), 2, [180,50,50,20], p=0.4).to(device)
print("Model details:")
print(model)


# Using CrossEntropyLoss because we effectively have an unbalanced training set
# In other words, not all inputs are normalized
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Model details:
Model(
  (batch_norm_num): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=150, out_features=180, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=180, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=20, bias=True)
    (13): ReLU(inplace=True)
    (14): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dropout(p

In [6]:
#
# train
#

print("Starting NN training")
start = time.time()

for i in range(epochs):
    
    rows_left = len(df)
    current = 0
    chunk_num = math.ceil(rows_left/chunk_size)

    #
    # do every epoch in chunks
    #
    
    model.zero_grad()
    while rows_left > 0:
        rows = (chunk_size if rows_left >= chunk_size else rows_left)
        chunk = df[current:current+rows]
        label_chunk = labels[current:current+rows]
        
        current += rows
        rows_left -= rows
        
        print(".", end="")
        
        # grab columns and labels, push into device (possibly GPU)
        cols = [chunk[col].values for col in chunk.columns]
        data = np.stack(cols, 1)
        data = torch.tensor(data, dtype=torch.float).to(device)
        output = torch.tensor(label_chunk).to(device)
        
        prediction = model(data)
        single_loss = loss_function(prediction, output)
        single_loss /= chunk_num
        single_loss.backward()

    # after done with all chunks, process the step and reset the gradients
    optimizer.step()
    model.zero_grad()
      
    print(f' Epoch: {i+1:3}, loss: {single_loss.item()*chunk_num:1.3f},', end=' ')
    print(f'elapsed: {time.time()-start:5.0f} s')



Starting NN training
....... Epoch:   1, loss: 0.782, elapsed:    18 s
....... Epoch:   2, loss: 0.761, elapsed:    36 s
....... Epoch:   3, loss: 0.749, elapsed:    54 s
....... Epoch:   4, loss: 0.736, elapsed:    72 s
....... Epoch:   5, loss: 0.729, elapsed:    90 s
....... Epoch:   6, loss: 0.723, elapsed:   108 s
....... Epoch:   7, loss: 0.717, elapsed:   125 s
....... Epoch:   8, loss: 0.712, elapsed:   143 s
....... Epoch:   9, loss: 0.708, elapsed:   161 s
....... Epoch:  10, loss: 0.704, elapsed:   179 s
....... Epoch:  11, loss: 0.703, elapsed:   197 s
....... Epoch:  12, loss: 0.698, elapsed:   215 s
....... Epoch:  13, loss: 0.698, elapsed:   233 s
....... Epoch:  14, loss: 0.693, elapsed:   251 s
....... Epoch:  15, loss: 0.690, elapsed:   269 s
....... Epoch:  16, loss: 0.690, elapsed:   287 s
....... Epoch:  17, loss: 0.685, elapsed:   305 s
....... Epoch:  18, loss: 0.683, elapsed:   323 s
....... Epoch:  19, loss: 0.679, elapsed:   340 s
....... Epoch:  20, loss: 0.6

....... Epoch: 164, loss: 0.634, elapsed:  2922 s
....... Epoch: 165, loss: 0.634, elapsed:  2939 s
....... Epoch: 166, loss: 0.635, elapsed:  2957 s
....... Epoch: 167, loss: 0.635, elapsed:  2975 s
....... Epoch: 168, loss: 0.634, elapsed:  2992 s
....... Epoch: 169, loss: 0.634, elapsed:  3010 s
....... Epoch: 170, loss: 0.634, elapsed:  3028 s
....... Epoch: 171, loss: 0.635, elapsed:  3046 s
....... Epoch: 172, loss: 0.634, elapsed:  3063 s
....... Epoch: 173, loss: 0.634, elapsed:  3081 s
....... Epoch: 174, loss: 0.634, elapsed:  3099 s
....... Epoch: 175, loss: 0.634, elapsed:  3116 s
....... Epoch: 176, loss: 0.633, elapsed:  3134 s
....... Epoch: 177, loss: 0.634, elapsed:  3152 s
....... Epoch: 178, loss: 0.634, elapsed:  3170 s
....... Epoch: 179, loss: 0.633, elapsed:  3187 s
....... Epoch: 180, loss: 0.634, elapsed:  3205 s
....... Epoch: 181, loss: 0.634, elapsed:  3223 s
....... Epoch: 182, loss: 0.634, elapsed:  3240 s
....... Epoch: 183, loss: 0.633, elapsed:  3258 s


In [7]:
print(f"NN training complete. Minutes elapsed: {(time.time()-start)/60:.0f}")


NN training complete. Minutes elapsed: 90


In [8]:
# Phase two - model evaluation

model.eval() # switch into eval mode
model.cpu() # get model back into CPU space


Model(
  (batch_norm_num): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=150, out_features=180, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(180, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=180, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=20, bias=True)
    (13): ReLU(inplace=True)
    (14): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dropout(p=0.4, inplace=F

In [9]:
# Import dev DF from CSV

if debug == True:
    dataset = 'data/mini_dev_encoded.csv'
else:
    dataset = 'data/dev_encoded.csv'

test = pd.read_csv(dataset)

if "HasDetections" in test.columns:
    # Remove hasdetections from the df.  Make it a new df for labels
    test_labels = test['HasDetections'].to_numpy()
    test.drop(columns=['HasDetections'], inplace=True)

# Iterate and convert all data values

cols = []
for c in test.columns:
    test[c] = test[c].astype(np.float64) # Convert dtypes to be all identical for pytorch

print(f"Test data: {len(test)} rows, {len(test.columns)} columns")



Test data: 1338222 rows, 150 columns


In [10]:

# grab columns and labels, push into device (possibly GPU)
cols = [test[col].values for col in test.columns]
data = np.stack(cols, 1)
data = torch.tensor(data, dtype=torch.float)
print(data.shape)

output = model(data)
print(output.shape)
predictions = np.argmax(output.cpu().data.numpy(), axis=1)


torch.Size([1338222, 150])
torch.Size([1338222, 2])


In [11]:
print(predictions.shape)
test_labels.shape

(1338222,)


(1338222,)

In [12]:
from sklearn import metrics
print("accuracy:", metrics.accuracy_score(y_true=test_labels, y_pred=predictions))
print(predictions[:30])
print(labels[:30])

accuracy: 0.6480158000690468
[1 1 0 0 1 0 0 1 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 1 1 0 1]
[0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1 1 0 1]
