In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from sklearn.metrics import roc_auc_score
from collections import OrderedDict 

In [2]:
# Seed random number generation
torch.manual_seed(66)
np.random.seed(66)

# Neural Net Model
### Fake Deep - CMS 155
I will first import the data.

In [3]:
df_train = pd.read_csv("./datasets/jeh_train_label-enc.csv")
# Rather than do real imputation,I just replace nans with zeros
df_train = df_train.fillna(-1)
df_train = df_train.drop(columns = ["STATE", "FIPS_CODE"])
df_train.head(5)

Unnamed: 0,id,LATITUDE,LONGITUDE,DISCOVERY_TIME,FIRE_SIZE,FIPS_NAME,SOURCE_REPORTING_UNIT_NAME,DATE,LABEL
0,0,38.205,-120.335,130.0,0.1,215,157,0,1
1,1,33.8131,-85.1043,1115.0,1.17,82,71,0,4
2,2,32.201,-82.4987,1600.0,0.07,130,71,0,2
3,3,32.5093,-81.7086,1215.0,4.4,19,71,0,4
4,4,33.663889,-116.171944,-1.0,0.2,215,14,0,2


In [4]:
df_test = pd.read_csv("./datasets/jeh_test_label-enc.csv")
# df_test["Unnamed: 0"] = df_test["Unnamed: 0"].values + 285382
df_test = df_test.fillna(-1)
df_test = df_test.drop(columns = ["STATE", "FIPS_CODE"])
df_test.head(5)

Unnamed: 0,id,LATITUDE,LONGITUDE,DISCOVERY_TIME,FIRE_SIZE,FIPS_NAME,SOURCE_REPORTING_UNIT_NAME,DATE
0,285382,34.346944,-117.442222,1605.0,0.2,158,145,0
1,285383,34.02039,-116.17997,1545.0,0.1,218,69,0
2,285384,38.068611,-120.276667,1200.0,0.1,196,170,0
3,285385,32.499971,-83.742573,-1.0,0.4,87,47,1
4,285386,32.92494,-114.99253,126.0,0.1,89,18,1


Here I separate the data randomly into training and testing sets, with a 75/25 split. 

In [5]:
D = len(df_train.columns[1:-1])
N_total = len(df_train)
N = int(0.75 * N_total)
N_test = N_total - N
X_predict = df_test.to_numpy(dtype = float)[:, 1:]
try:
    X_train = np.load("./jeh_checkpoints/x_train.npy")
    Y_train = np.load("./jeh_checkpoints/y_train.npy")
    X_test = np.load("./jeh_checkpoints/x_test.npy")
    Y_test = np.load("./jeh_checkpoints/y_test.npy")
except FileNotFoundError:
    test_indices = np.random.choice(list(range(N_total)), size=N_test, replace=False)
    X_train = np.zeros([N, D], dtype = float)
    Y_train = np.zeros(N, dtype = int)
    X_test = np.zeros([N_test, D], dtype = float)
    Y_test = np.zeros(N_test, dtype = int)
    j = 0
    k = 0
    for i in range(len(df_train)):
        if i not in test_indices:
            X_train[j, :] = df_train.iloc[i, 1:-1]
            Y_train[j] = df_train.iloc[i, -1]
            j += 1
        else:
            X_test[k, :] = df_train.iloc[i, 1:-1]
            Y_test[k] = df_train.iloc[i, -1]
            k += 1
    np.save("./jeh_checkpoints/x_train.npy", X_train)
    np.save("./jeh_checkpoints/y_train.npy", Y_train)
    np.save("./jeh_checkpoints/x_test.npy", X_test)
    np.save("./jeh_checkpoints/y_test.npy", Y_test)

Next I will apply a basic normalization to each column. I am careful to normalize everything using only the training data. 

In [6]:
for i in range(D):
    mu = np.mean(X_train[:, i])
    stddev = np.std(X_train[:, i])
    X_train[:, i] = (X_train[:, i] - mu ) / stddev
    X_test[:, i] = (X_test[:, i] - mu ) / stddev
    X_predict[:, i] = (X_predict[:, i] - mu ) / stddev

In [7]:
# We require Y_train and Y_test to be from 0-3, not 1-4
Y_train = Y_train - 1
Y_test = Y_test - 1

I actually need to onehot encode the labels to the data set. In effect, my neural net will have 4 output units and I want the labels to emulate this. 

In [8]:
C = len(np.unique(Y_train))
Y_train_oh = np.zeros([len(Y_train), C])
Y_test_oh = np.zeros([len(Y_test), C])
for i in range(len(Y_train)):
    y = Y_train[i]
    Y_train_oh[i, y] = 1
for i in range(len(Y_test)):
    y = Y_test[i]
    Y_test_oh[i, y] = 1

I need to write a dataset class in order to use this set with pytorch. This is totally barebones, but I don't need to worry about streaming the dataset off the hard drive to multiple cores, since I have the memory to just store the entire dataset on each core. 

In [9]:
class Dataset(torch.utils.data.Dataset):
    "Dataset object for pytorch."
    def __init__(self, X, Y):
        'Initialization'
        self.Y = Y.astype(float)
        self.X = X.astype(float)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.Y)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        x = self.X[index]
        y = self.Y[index]
        return x, y

Let's use this class to actually construct dataset objects. 

In [10]:
train_dataset = Dataset(X_train, Y_train_oh)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True)

I will use my GPU to try and speed up the neural net. 

In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')
# When you are on a CUDA machine, this should print a CUDA device:
print(device)

cuda


I will import some helper functions that I wrote in problem set 4. 

In [33]:
def train_model(model):
    
    model.to(device)
    model.train()
    loss_val = 1
    epoch = 0
    while loss_val > 0.06:
        epoch += 1
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            # Erase accumulated gradients
            optimizer.zero_grad()
            
            # Forward pass
            output = model(data.float())

            # Calculate loss
            loss = loss_fn(output, target.float())

            # Backward pass
            loss.backward()

            # Weight update
            optimizer.step()
        
        loss_val = loss.item()
        # Track loss each epoch
        print('Train Epoch: %d  Loss: %.4f' % (epoch,  loss_val))

def get_train_err(model):
    model.eval()
    correct = 0
    train_error = 0
    train_loss = 0
    # Turning off automatic differentiation
    with torch.no_grad():
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            output = model(data.float())
            train_loss += loss_fn(output, target.float()).item() * len(target) # Sum up batch loss
            pred = output.argmax(dim=1, keepdim=False).cpu().numpy()  # Get the index of the max class score
            
            # Convert the target back from onehot encoding
            target = target.cpu().numpy()
            target = target[:, 1] + target[:, 2] * 2 + target[:, 3] * 3
            
            # Determine the accuracy of the classification
            correct += np.sum(pred==target)
            temp = roc_auc_score(target, 
                                 output.cpu(), 
                                 multi_class='ovr',
                                 labels=[0, 1, 2, 3]) 
            train_error += temp * len(target)
            
    train_loss /= len(train_loader.dataset)
    train_error /= len(train_loader.dataset)
    print("Average Training ROC AUC: %.3f"%train_error)
    print('Training set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
          (train_loss, correct, len(train_loader.dataset),
           100. * correct / len(train_loader.dataset)))
    
def get_test_err(model):
    # Putting layers like Dropout into evaluation mode
    model.eval()

    test_loss = 0
    correct = 0
    test_error = 0
    
    # Turning off automatic differentiation
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data.float())
            test_loss += loss_fn(output, target.float()).item() * len(target)  # Sum up batch loss
            pred = output.argmax(dim=1, keepdim=False).cpu().numpy()  # Get the index of the max class score
            
            # Convert the target back from onehot encoding
            target = target.cpu().numpy()
            target = target[:, 1] + target[:, 2] * 2 + target[:, 3] * 3
            
            # Determine the accuracy of the classification
            correct += np.sum(pred==target)
            test_error += roc_auc_score(target, 
                                        output.cpu(), 
                                        multi_class='ovr',
                                        labels=[0, 1, 2, 3]) * len(target)
            
    test_loss /= len(test_loader.dataset)
    test_error /= len(test_loader.dataset)
    print("Average Testing ROC AUC: %.3f"%test_error)
    print('Test set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
          (test_loss, correct, len(test_loader.dataset),
           100. * correct / len(test_loader.dataset)))

I will now try constructing and testing an **even deeper** neural net. 

In [28]:
height = 1000
narrow = 100
drop = 0.05
depth = 10
ordict = OrderedDict()
ordict['start'] = nn.Linear(D, narrow)
ordict['expand1'] = nn.Linear(narrow, height)
ordict['expand3'] = nn.Dropout(drop)

# Construct the bulk of the net
for i in range(depth):
    ordict['1-%i'%i] = nn.Linear(height, height)
    ordict['2-%i'%i] = nn.ReLU()
    ordict['3-%i'%i] = nn.Dropout(drop)
    
# Narrow the net and bring it down to the last few nodes
ordict['narrow1'] = nn.Linear(height, narrow)
ordict['narrow2'] = nn.ReLU()
ordict['narrow3'] = nn.Dropout(drop)
ordict['final-1'] = nn.Linear(narrow, C)
ordict['final-2'] = nn.Softmax(dim=1)

# Pack all the layers into the model
model = nn.Sequential(ordict)

optimizer = torch.optim.Adam(model.parameters(), lr=float(1e-4))
loss_fn = nn.SmoothL1Loss()
print(model)

Sequential(
  (start): Linear(in_features=7, out_features=100, bias=True)
  (expand1): Linear(in_features=100, out_features=1000, bias=True)
  (expand3): Dropout(p=0.05, inplace=False)
  (1-0): Linear(in_features=1000, out_features=1000, bias=True)
  (2-0): ReLU()
  (3-0): Dropout(p=0.05, inplace=False)
  (1-1): Linear(in_features=1000, out_features=1000, bias=True)
  (2-1): ReLU()
  (3-1): Dropout(p=0.05, inplace=False)
  (1-2): Linear(in_features=1000, out_features=1000, bias=True)
  (2-2): ReLU()
  (3-2): Dropout(p=0.05, inplace=False)
  (1-3): Linear(in_features=1000, out_features=1000, bias=True)
  (2-3): ReLU()
  (3-3): Dropout(p=0.05, inplace=False)
  (1-4): Linear(in_features=1000, out_features=1000, bias=True)
  (2-4): ReLU()
  (3-4): Dropout(p=0.05, inplace=False)
  (1-5): Linear(in_features=1000, out_features=1000, bias=True)
  (2-5): ReLU()
  (3-5): Dropout(p=0.05, inplace=False)
  (1-6): Linear(in_features=1000, out_features=1000, bias=True)
  (2-6): ReLU()
  (3-6): Dropou

Now I will train the model and record the error. 

In [29]:
train_model(model)
get_train_err(model)

Train Epoch: 1  Loss: 0.0827
Train Epoch: 2  Loss: 0.0802
Train Epoch: 3  Loss: 0.0849
Train Epoch: 4  Loss: 0.0524
Train Epoch: 5  Loss: 0.0609
Train Epoch: 6  Loss: 0.0725
Train Epoch: 7  Loss: 0.0646
Train Epoch: 8  Loss: 0.0531
Train Epoch: 9  Loss: 0.0631
Train Epoch: 10  Loss: 0.0597
Train Epoch: 11  Loss: 0.0588
Train Epoch: 12  Loss: 0.0635
Train Epoch: 13  Loss: 0.0591
Train Epoch: 14  Loss: 0.0533
Train Epoch: 15  Loss: 0.0608
Train Epoch: 16  Loss: 0.0463
Average Training ROC AUC: 0.785
Training set: Average loss: 0.0617, Accuracy: 136308/214036 (63.6846)


In [30]:
test_dataset = Dataset(X_test, Y_test_oh)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=True)
get_test_err(model)

Average Testing ROC AUC: 0.779
Test set: Average loss: 0.0622, Accuracy: 45279/71346 (63.4640)


Let's predict the testing error like I will have to later. This line is mostly here because I was having trouble with it further down the line. 

In [31]:
model.eval()
pred_max = np.zeros(len(X_test), dtype=int)
pred_data = np.zeros([len(X_test), 5])
test_dataset = Dataset(X_test, np.zeros(len(X_test)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1024, shuffle=False)
i = 0
with torch.no_grad():
    for data, _ in test_loader:
        data = data.to(device)
        num = len(data)
        output = model(data.float())
        pred_data[i:i+num, 1:] = output.cpu().numpy()
        pred_max[i:i+num] = output.argmax(dim=1, keepdim=False).cpu().numpy()
        i += num

roc_auc_score(Y_test, pred_data[:, 1:], multi_class='ovr')

0.778895960705726

This model looks good. Let's train it on everything. 

In [34]:
Y_oh = np.zeros([N_total, C])
X = np.zeros([N_total, D])
Y_oh[:N, :] = Y_train_oh
Y_oh[N:, :] = Y_test_oh
X[:N, :] = X_train
X[N:, :] = X_test
train_dataset = Dataset(X, Y_oh)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1024, shuffle=True)

# Redefine the model
model = nn.Sequential(ordict)
optimizer = torch.optim.Adam(model.parameters(), lr=float(1e-4))
loss_fn = nn.SmoothL1Loss()

# Retrain the model on all avaliable data and get the training error. 
train_model(model)

Train Epoch: 1  Loss: 0.0588


In [35]:
get_train_err(model)

Average Training ROC AUC: 0.802
Training set: Average loss: 0.0598, Accuracy: 185120/285382 (64.8674)


Now that the model is well trained, I will predict the labels and submit to kaggle. 

In [36]:
model.eval()
pred_data = np.zeros([len(X_predict), 5])
test_dataset = Dataset(X_predict, np.zeros(len(X_predict)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1024, shuffle=False)
i = 0
with torch.no_grad():
    for data, target in test_loader:
        data, _ = data.to(device), target.to(device)
        num = len(data)
        output = model(data.float())
        pred_data[i:i+num, 1:] = output.cpu().numpy()
        i += num
# append the id to each of the predictions
pred_data[:, 0] = df_test["id"].values
pred_df = pd.DataFrame(pred_data, columns = ["id", "P1", "P2", "P3", "P4"])
pred_df = pred_df.astype({'id': 'int'})
pred_df

Unnamed: 0,id,P1,P2,P3,P4
0,285382,0.044244,0.527328,0.145882,0.282546
1,285383,0.269531,0.442274,0.062724,0.225471
2,285384,0.701664,0.201726,0.022038,0.074573
3,285385,0.021629,0.397439,0.534408,0.046525
4,285386,0.024907,0.435079,0.189782,0.350231
...,...,...,...,...,...
73030,358412,0.016956,0.798848,0.097582,0.086614
73031,358413,0.014137,0.315036,0.106180,0.564646
73032,358414,0.061608,0.433696,0.089376,0.415319
73033,358415,0.065398,0.449002,0.123342,0.362258


In [37]:
pred_df.to_csv("./FakeDeep_Attempt8_NeuralNet.csv", index=False)