In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from sklearn.metrics import roc_auc_score
from collections import OrderedDict 
import bokeh, bokeh.io, bokeh.plotting, bokeh.layouts
import os
import urllib.request
bokeh.io.output_notebook()

# Neural Net Model Construction
### Fake Deep - CMS 155
### John Heath, Julia Sloan, Ayooluwa Odemuyiwa, Randall Pulido

In this notebook, I will primarily experiment with different neural net architectures. Part of the reason I wanted to do this was for myself, so that I understood how to construct neural networks in PyTorch for custom datasets, and also so that I understood how to manage the data stored on the GPU. I hope this serves as a resource for myself and others illustrating how to efficiently run PyTorch neural nets on custom data. Notably, this is not the model that yielded the best performance, but rather the model with which I did the most interesting work. 

I will first download the data and then seed the random number generators and import the data. Each of the features were label encoded beforehand, and to finish the pre-processing I simply replace the NaNs with zeros. 

In [2]:
# Create the working directories. exist_ok=True makes this cell safe to
# re-run (a plain `mkdir` reports an error when the directory already exists).
os.makedirs("datasets", exist_ok=True)
os.makedirs("jeh_checkpoints", exist_ok=True)

In [3]:
# Root of the project repository that hosts the data and the cached results.
REPO_URL = 'https://raw.githubusercontent.com/heathjohn62/CS155-Fake-Deep/main/project1/'

# Every file has the same relative path on the server and on disk, so a single
# list drives all downloads (same files and same order as before).
REMOTE_FILES = [
    'datasets/jeh_test_label-enc.csv',
    'datasets/jeh_train_label-enc.csv',
    'jeh_checkpoints/x_test.npy',
    'jeh_checkpoints/y_test.npy',
    'jeh_checkpoints/x_train.npy',
    'jeh_checkpoints/y_train.npy',
    'jeh_checkpoints/test_acc.npy',
    'jeh_checkpoints/test_auc.npy',
    'jeh_checkpoints/test_loss.npy',
    'jeh_checkpoints/train_acc.npy',
    'jeh_checkpoints/train_auc.npy',
    'jeh_checkpoints/train_loss.npy',
]

for rel_path in REMOTE_FILES:
    urllib.request.urlretrieve(REPO_URL + rel_path, './' + rel_path)

('./jeh_checkpoints/train_loss.npy',
 <http.client.HTTPMessage at 0x7fd265a02668>)

In [4]:
# Fix the seeds of both random number generators used in this notebook.
np.random.seed(66)
torch.manual_seed(66)

# Read the label-encoded training table. Missing values are simply replaced
# with zeros rather than being properly imputed.
df_train = pd.read_csv("./datasets/jeh_train_label-enc.csv").fillna(0)
df_train.head(5)

Unnamed: 0,id,LATITUDE,LONGITUDE,STATE,DISCOVERY_TIME,FIRE_SIZE,FIPS_NAME,FIPS_CODE,SOURCE_REPORTING_UNIT_NAME,DATE,LABEL
0,0,38.205,-120.335,0,130.0,0.1,215,0.0,157,0,1
1,1,33.8131,-85.1043,1,1115.0,1.17,82,143.0,71,0,4
2,2,32.201,-82.4987,1,1600.0,0.07,130,209.0,71,0,2
3,3,32.5093,-81.7086,1,1215.0,4.4,19,31.0,71,0,4
4,4,33.663889,-116.171944,0,0.0,0.2,215,0.0,14,0,2


In [5]:
# Read the unlabelled table to predict on; NaNs become zeros, as for training.
df_test = pd.read_csv("./datasets/jeh_test_label-enc.csv").fillna(0)
df_test.head(5)

Unnamed: 0,id,LATITUDE,LONGITUDE,STATE,DISCOVERY_TIME,FIRE_SIZE,FIPS_NAME,FIPS_CODE,SOURCE_REPORTING_UNIT_NAME,DATE
0,285382,34.346944,-117.442222,0,1605.0,0.2,158,71.0,145,0
1,285383,34.02039,-116.17997,0,1545.0,0.1,218,0.0,69,0
2,285384,38.068611,-120.276667,0,1200.0,0.1,196,109.0,170,0
3,285385,32.499971,-83.742573,1,0.0,0.4,87,153.0,47,1
4,285386,32.92494,-114.99253,0,126.0,0.1,89,25.0,18,1


Here I separate the data randomly into training and testing sets, with a 75/25 split. This process takes a moment, and so I save the output matrices locally so that it does not need to re-run in the event I restart the kernel. 

In [6]:
# Problem dimensions: every column except the first and the last is a feature
# (the last column holds the label); 75% of the labelled rows go to training.
D = len(df_train.columns[1:-1])
N_total = len(df_train)
N = int(0.75 * N_total)
N_test = N_total - N
# Rows to predict on: all columns of the unlabelled table except the first.
X_predict = df_test.to_numpy(dtype = float)[:, 1:]
try:
    # Reuse a previously saved split when all four files are present.
    X_train = np.load("./jeh_checkpoints/x_train.npy")
    Y_train = np.load("./jeh_checkpoints/y_train.npy")
    X_test = np.load("./jeh_checkpoints/x_test.npy")
    Y_test = np.load("./jeh_checkpoints/y_test.npy")
except FileNotFoundError:
    # Random permutation of the row positions; the first N_test go to testing.
    indices = np.random.choice(N_total, size=N_total, replace=False)
    test_indices = indices[:N_test]
    train_indices = indices[N_test:]

    # Convert the table once and select rows with fancy indexing instead of
    # copying one row at a time through `.iloc` (same result, far faster).
    all_features = df_train.iloc[:, 1:-1].to_numpy(dtype=float)
    all_labels = df_train.iloc[:, -1].to_numpy(dtype=int)
    X_train, Y_train = all_features[train_indices], all_labels[train_indices]
    X_test, Y_test = all_features[test_indices], all_labels[test_indices]

    # Cache the split so a kernel restart does not have to redo it.
    np.save("./jeh_checkpoints/x_train.npy", X_train)
    np.save("./jeh_checkpoints/y_train.npy", Y_train)
    np.save("./jeh_checkpoints/x_test.npy", X_test)
    np.save("./jeh_checkpoints/y_test.npy", Y_test)

Next I will apply a basic normalization to each column. I am careful to normalize everything using only the training data. 

In [7]:
# Standardise each feature column in place. The mean and standard deviation
# come from the training split only and are reused for the other two arrays.
for i in range(D):
    mu = np.mean(X_train[:, i])
    stddev = np.std(X_train[:, i])
    if stddev == 0:
        # A constant training column would give 0/0 = NaN; dividing by 1
        # leaves the centred column at zero instead.
        stddev = 1.0
    X_train[:, i] = (X_train[:, i] - mu ) / stddev
    X_test[:, i] = (X_test[:, i] - mu ) / stddev
    X_predict[:, i] = (X_predict[:, i] - mu ) / stddev

The data labels are set to 1-4, but in order to feed them into the neural network we require labels 0-3. 

In [8]:
# Shift the labels down by one so they run over 0-3 instead of 1-4.
# Note: running this cell twice shifts the labels twice.
Y_train, Y_test = Y_train - 1, Y_test - 1

I also need to one-hot encode the labels. In effect, my neural net will have 4 output units, and I want the labels to have the same shape. 

In [9]:
# Number of distinct classes present in the training labels.
C = len(np.unique(Y_train))

# One row per sample and one column per class: write a 1 into the column
# given by each sample's (integer) label, leaving every other entry at 0.
Y_train_oh = np.zeros([len(Y_train), C])
Y_train_oh[np.arange(len(Y_train)), Y_train.astype(int)] = 1

Y_test_oh = np.zeros([len(Y_test), C])
Y_test_oh[np.arange(len(Y_test)), Y_test.astype(int)] = 1

Next, since I am using a custom dataset, I need to write a dataset class in order to use PyTorch. These classes can be written so that the entire dataset does not need to be held in memory at once, but I happen to have 8GB of graphics memory, which should be plenty for this data, and so I opted to keep the whole dataset in memory for speed (in the code below, the arrays live in the dataset object and each batch is sent to the GPU inside the training and evaluation loops). In the event a larger dataset is used, the `__getitem__()` method should read off data from a file, which will allow multiple cores of the computer to send data to the GPU concurrently. 

In [10]:
class Dataset(torch.utils.data.Dataset):
    """In-memory dataset pairing a feature array with a label array."""

    def __init__(self, X, Y):
        """Store float copies of the labels ``Y`` and the features ``X``."""
        self.Y = Y.astype(float)
        self.X = X.astype(float)

    def __len__(self):
        """Return the number of samples, taken from the label array."""
        return len(self.Y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.X[index], self.Y[index]

Let's use this class to actually construct dataset objects. 

In [11]:
# Wrap the training features and their one-hot labels, then serve them in
# reshuffled mini-batches of 512 samples.
train_dataset = Dataset(X_train, Y_train_oh)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=512,
    shuffle=True,
)

Let's set the device as the GPU.

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# On a CUDA machine this prints a CUDA device.
print(device)

cuda


We can train the neural nets exactly as was done in problem set 4, and I will construct a helper function for this process. 

In [13]:
def train_model(model, loss_fn, train_loader, verbose=True, epochs=10, optimizer=None):
    """Train a PyTorch model in place with mini-batch gradient steps.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train. It is moved to the notebook-level ``device`` and
        switched to training mode.
    loss_fn : callable
        Called as ``loss_fn(output, target)``; must return a scalar tensor.
    train_loader : iterable
        Yields ``(data, target)`` tensor batches. Both are cast to float32
        before use.
    verbose : bool, default True
        If true, print the loss of the final mini-batch after each epoch.
    epochs : int, default 10
        Number of passes over ``train_loader`` (previously fixed at 10).
    optimizer : torch.optim.Optimizer, optional
        Optimizer that updates ``model``. When omitted, the notebook-level
        global named ``optimizer`` is used, which was the original behaviour.
        Passing it explicitly avoids pairing a model with a stale optimizer.

    Raises
    ------
    NameError
        If no optimizer is passed and no global ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward-compatible fallback to the global that build_model's
        # callers conventionally bind.
        try:
            optimizer = globals()["optimizer"]
        except KeyError:
            raise NameError(
                "name 'optimizer' is not defined; pass optimizer=... to train_model"
            ) from None

    # Send the model to the selected device and enable training behaviour
    # (e.g. dropout).
    model.to(device)
    model.train()

    for epoch in range(1, epochs + 1):
        loss = None
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            # Erase gradients accumulated by the previous step
            optimizer.zero_grad()

            # Forward pass and loss
            output = model(data.float())
            loss = loss_fn(output, target.float())

            # Backward pass and weight update
            loss.backward()
            optimizer.step()

        if verbose:
            # Reported value is the loss of the last batch only; an empty
            # loader yields NaN instead of crashing on an unbound variable.
            loss_val = float('nan') if loss is None else loss.item()
            print('Train Epoch: %d  Loss: %.4f' % (epoch,  loss_val))

I would also like helper functions for calculating the training error, the testing error, and also the AUC, since ultimately the AUC is the metric we are most interested in optimizing. 

In [14]:
def get_err(model, loss_fn, data_loader):
    """Compute the average loss and the accuracy of ``model`` on a dataset.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is switched to evaluation mode.
    loss_fn : callable
        Called as ``loss_fn(output, target)``; assumed to return the mean
        loss of the batch.
    data_loader : torch.utils.data.DataLoader
        Yields ``(data, target)`` batches where ``target`` is one-hot encoded
        with one column per class. ``data_loader.dataset`` must support
        ``len``.

    Returns
    -------
    (float, float)
        Sample-weighted average loss, and accuracy in percent.
    """
    # Evaluation mode disables dropout
    model.eval()
    n_samples = len(data_loader.dataset)
    total_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data.float())

            # Weight each batch's mean loss by its size so that a smaller
            # final batch does not skew the average.
            total_loss += loss_fn(output, target.float()).item() * len(target)

            # Predicted class = index of the highest score. The true class is
            # recovered the same way from the one-hot row, which works for any
            # number of classes (previously hard-coded to exactly four).
            pred = output.argmax(dim=1)
            true = target.argmax(dim=1)
            correct += (pred == true).sum().item()

    return total_loss / n_samples, 100. * correct / n_samples

def get_predictions(model, data_loader):
    """Run ``model`` over every batch and collect its outputs on the CPU.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is switched to evaluation mode.
    data_loader : torch.utils.data.DataLoader
        Yields ``(data, y)`` batches where ``y`` is one-hot encoded with one
        column per class. ``data_loader.dataset`` must support ``len``.

    Returns
    -------
    pred_data : numpy.ndarray
        ``N x C`` matrix of model outputs, in loader order. ``C`` is taken
        from the model output (previously hard-coded to 4).
    true_classes : numpy.ndarray
        Length-``N`` float array holding the class index decoded from ``y``.
    """
    model.eval()

    # Results are gathered for the whole dataset on the CPU, so that metrics
    # such as the AUC can be computed on all samples at once rather than on
    # small batches.
    n_samples = len(data_loader.dataset)
    pred_data = None
    true_classes = np.zeros(n_samples)
    start = 0
    with torch.no_grad():
        for data, y in data_loader:
            data = data.to(device)
            stop = start + len(data)

            output = model(data.float()).cpu().numpy()
            if pred_data is None:
                # Allocate lazily, once the number of output columns is known
                pred_data = np.zeros([n_samples, output.shape[1]])
            pred_data[start:stop, :] = output

            # Decode the one-hot rows back to class indices for any number
            # of classes.
            true_classes[start:stop] = y.argmax(dim=1).cpu().numpy()

            start = stop

    if pred_data is None:
        # The loader produced no batches, so the class count is unknown
        pred_data = np.zeros([n_samples, 0])
    return pred_data, true_classes

def get_auc(model, data_loader):
    """Return the one-vs-rest ROC AUC of ``model`` over the data served by
    ``data_loader``, computed from all predictions at once."""
    scores, labels = get_predictions(model, data_loader)
    auc = roc_auc_score(y_true=labels, y_score=scores, multi_class='ovr')
    return auc

Most of the boring work is done, and we now almost have the framework to start building neural nets! I was really interested in whether a very tall or a very deep net architecture would learn better on this data, keeping the number of total nodes constant. Let's write a helper function that will make models to help answer this question. 

Clearly, the model must begin with a single layer that contains the same number of features as the data. I have typically found good performance if there is a medium size buffer between the data and very large layers, and so I included layers with 100 units before and after the bulk of the model. For all layers, I have included a constant but small dropout rate and a `ReLU()` activation function, which is a current industry favorite. Finally, I would like to interpret the last layer as a probability distribution over four classes, and so I have applied `Softmax()` to this layer in order to rescale the outputs to fit the correct form. 

In [15]:
def build_model(height, depth, drop=0.05, narrow=100, lr=1e-4,
                n_features=None, n_classes=None):
    """Construct a fully connected classifier and its Adam optimizer.

    Layer layout, in order:
      * ``start``: linear layer from the inputs to ``narrow`` units,
      * ``expand1..3``: linear layer to ``height`` units, dropout, ReLU,
      * ``depth`` repetitions of linear(``height`` -> ``height``), ReLU, dropout,
      * ``narrow1..3``: linear layer back to ``narrow`` units, ReLU, dropout,
      * ``final-1``/``final-2``: linear layer to the class scores, softmax
        over dimension 1.

    Parameters
    ----------
    height : int
        Width of the hidden layers in the bulk of the network.
    depth : int
        Number of ``height``-wide hidden blocks.
    drop : float, default 0.05
        Dropout probability used by every dropout layer.
    narrow : int, default 100
        Width of the buffer layers before and after the bulk.
    lr : float, default 1e-4
        Learning rate of the Adam optimizer (previously hard-coded).
    n_features : int, optional
        Number of inputs; defaults to the notebook-level ``D``.
    n_classes : int, optional
        Number of outputs; defaults to the notebook-level ``C``.

    Returns
    -------
    (torch.nn.Sequential, torch.optim.Adam)
        The model and an optimizer bound to its parameters.
    """
    if n_features is None:
        n_features = D
    if n_classes is None:
        n_classes = C

    ordict = OrderedDict()
    # NOTE(review): there is no activation between 'start' and 'expand1', so
    # the two compose to a single affine map. Kept as-is so the architecture
    # and the previously recorded results stay unchanged.
    ordict['start'] = nn.Linear(n_features, narrow)
    ordict['expand1'] = nn.Linear(narrow, height)
    ordict['expand2'] = nn.Dropout(drop)
    ordict['expand3'] = nn.ReLU()

    # Bulk of the net: `depth` equally wide hidden blocks
    for i in range(depth):
        ordict['1-%i'%i] = nn.Linear(height, height)
        ordict['2-%i'%i] = nn.ReLU()
        ordict['3-%i'%i] = nn.Dropout(drop)

    # Narrow the net again and map to one score per class
    ordict['narrow1'] = nn.Linear(height, narrow)
    ordict['narrow2'] = nn.ReLU()
    ordict['narrow3'] = nn.Dropout(drop)
    ordict['final-1'] = nn.Linear(narrow, n_classes)
    ordict['final-2'] = nn.Softmax(dim=1)

    # Pack all the layers into the model, preserving their names
    model = nn.Sequential(ordict)

    # The optimizer is tied to this specific model's parameters
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    return model, optimizer

I chose to use an Adam optimizer, as was done in the problem set. I also chose to use `SmoothL1Loss`, which gave me the best performance by far among several of the native PyTorch loss functions that I attempted. Given that this is a classification problem and not regression, L1 loss seemed more suitable than squared loss. `CrossEntropyLoss` was a good candidate, but yielded worse performance than did L1 loss; I am unsure as to exactly why. (One likely factor: `CrossEntropyLoss` expects raw, unnormalized scores, whereas this model already applies `Softmax()` to its outputs.) 

In [16]:
# Loss object passed to every training and evaluation call in this notebook.
loss_fn = nn.SmoothL1Loss()

Let's create an example net and test the performance. 

In [17]:
# Example architecture: five hidden blocks, each 1000 units wide.
model, optimizer = build_model(height=1000, depth=5)
print(model)

Sequential(
  (start): Linear(in_features=9, out_features=100, bias=True)
  (expand1): Linear(in_features=100, out_features=1000, bias=True)
  (expand2): Dropout(p=0.05, inplace=False)
  (expand3): ReLU()
  (1-0): Linear(in_features=1000, out_features=1000, bias=True)
  (2-0): ReLU()
  (3-0): Dropout(p=0.05, inplace=False)
  (1-1): Linear(in_features=1000, out_features=1000, bias=True)
  (2-1): ReLU()
  (3-1): Dropout(p=0.05, inplace=False)
  (1-2): Linear(in_features=1000, out_features=1000, bias=True)
  (2-2): ReLU()
  (3-2): Dropout(p=0.05, inplace=False)
  (1-3): Linear(in_features=1000, out_features=1000, bias=True)
  (2-3): ReLU()
  (3-3): Dropout(p=0.05, inplace=False)
  (1-4): Linear(in_features=1000, out_features=1000, bias=True)
  (2-4): ReLU()
  (3-4): Dropout(p=0.05, inplace=False)
  (narrow1): Linear(in_features=1000, out_features=100, bias=True)
  (narrow2): ReLU()
  (narrow3): Dropout(p=0.05, inplace=False)
  (final-1): Linear(in_features=100, out_features=4, bias=True)


Now I will train the model and report the error. 

In [18]:
# Fit the example network using the notebook-level optimizer.
train_model(model, loss_fn, train_loader)

# Evaluate on the training data itself: loss and accuracy first, then ROC AUC.
train_loss_and_acc = get_err(model, loss_fn, train_loader)
print("Average Training loss: %.4f \nAverage Training Accuracy: %.2f" % train_loss_and_acc)

train_roc_auc = get_auc(model, train_loader)
print("Average Training ROC AUC: %.3f" % train_roc_auc)

Train Epoch: 1  Loss: 0.0669
Train Epoch: 2  Loss: 0.0660
Train Epoch: 3  Loss: 0.0570
Train Epoch: 4  Loss: 0.0618
Train Epoch: 5  Loss: 0.0665
Train Epoch: 6  Loss: 0.0607
Train Epoch: 7  Loss: 0.0714
Train Epoch: 8  Loss: 0.0553
Train Epoch: 9  Loss: 0.0758
Train Epoch: 10  Loss: 0.0725
Average Training loss: 0.0628 
Average Training Accuracy: 62.87
Average Training ROC AUC: 0.779


In [19]:
# Held-out split: no shuffling is needed for evaluation.
test_dataset = Dataset(X_test, Y_test_oh)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False)
# Report the testing error. The accuracy label previously read "Training"
# although the value is computed on the test loader.
print("Average Testing loss: %.4f \nAverage Testing Accuracy: %.2f"
      %get_err(model, loss_fn, test_loader))
print("Average Testing ROC AUC: %.3f"
      %get_auc(model, test_loader))

Average Testing loss: 0.0630 
Average Training Accuracy: 62.86
Average Testing ROC AUC: 0.774


The network does appear to be learning! I'd like to visualize the performance of this network on both sets, and since I have latitude and longitude data I should be able to plot the data like a map. Of course, the latitude and longitude data has been normalized, and so the dimensions will be somewhat distorted. For the training and the testing sets, I will make two side-by-side plots, where the left plot reflects the real classifications and the right plot is the neural net predictions. 

After making this plot, I quickly realized that it would be best to plot one state at a time, and so I separated them.

In [20]:
def make_plot(X, Y, left_title, right_title):
    """Build two side-by-side scatter plots: true classes and predicted classes.

    Predictions come from the notebook-level ``model``. Points are placed by
    the first two feature columns, which this notebook treats as normalised
    latitude and longitude.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; column 0 is used as latitude, column 1 as longitude.
    Y : numpy.ndarray
        One-hot labels for the rows of ``X``.
    left_title, right_title : str
        Titles of the true-class and predicted-class panels.

    Returns
    -------
    A Bokeh row layout containing the two figures.
    """
    # A dedicated, unshuffled loader keeps predictions aligned with the rows of X.
    plot_dataset = Dataset(X, Y)
    plot_loader = torch.utils.data.DataLoader(plot_dataset, batch_size=512, shuffle=False)
    pred_prob, true_class = get_predictions(model, plot_loader)
    pred_class = pred_prob.argmax(axis=1)
    lat, lon = X[:, 0], X[:, 1]

    color_map = {0:"black", 1:"green", 2:"blue", 3:"orange"}
    true_color = [color_map[c] for c in true_class]
    pred_color = [color_map[c] for c in pred_class]

    panels = []
    for title, colors in ((left_title, true_color), (right_title, pred_color)):
        # `figure` is the supported constructor; the capitalised `Figure`
        # alias no longer exists in Bokeh 3.
        fig = bokeh.plotting.figure(height=400,
                                    width=400,
                                    title=title,
                                    x_axis_label="Longitude (normalized)",
                                    y_axis_label="Latitude (normalized)")
        # Longitude goes on x and latitude on y so the plot is oriented like a
        # map (the two were previously swapped, transposing the picture).
        fig.circle(lon, lat, color=colors, alpha = 0.5)
        panels.append(fig)
    return bokeh.layouts.row(panels)

# Cap the number of plotted points so the figures stay readable.
num_pts = 7000

# Feature column 2 separates the two states: rows with a negative value are
# plotted as CA, rows with a positive value as GA.
is_CA_train = X_train[:, 2] < 0
is_GA_train = X_train[:, 2] > 0

X_CA_train = X_train[is_CA_train][:num_pts]
Y_CA_train = Y_train_oh[is_CA_train][:num_pts]
top = make_plot(
    X_CA_train, Y_CA_train,
    "True Classes for CA Training Set",
    "Predicted Classes for CA Training Set",
)

X_GA_train = X_train[is_GA_train][:num_pts]
Y_GA_train = Y_train_oh[is_GA_train][:num_pts]
bottom = make_plot(
    X_GA_train, Y_GA_train,
    "True Classes for GA Training Set",
    "Predicted Classes for GA Training Set",
)

# Stack the CA panels above the GA panels.
bokeh.io.show(bokeh.layouts.column([top, bottom]))

Now let's do the same for the testing set:

In [21]:
# Same split by feature column 2 as for the training plots: negative values
# are plotted as CA, positive values as GA.
is_CA_test = X_test[:, 2] < 0
is_GA_test = X_test[:, 2] > 0

X_CA_test = X_test[is_CA_test][:num_pts]
Y_CA_test = Y_test_oh[is_CA_test][:num_pts]
top = make_plot(
    X_CA_test, Y_CA_test,
    "True Classes for CA Testing Set",
    "Predicted Classes for CA Testing Set",
)

X_GA_test = X_test[is_GA_test][:num_pts]
Y_GA_test = Y_test_oh[is_GA_test][:num_pts]
bottom = make_plot(
    X_GA_test, Y_GA_test,
    "True Classes for GA Testing Set",
    "Predicted Classes for GA Testing Set",
)

# Stack the CA panels above the GA panels.
bokeh.io.show(bokeh.layouts.column([top, bottom]))

These plots are very cool, and they suggest how the network is using location data, among other factors, to infer the cause of the fires. We can see that we are not overfitting the data, as the testing and training performance is quite similar. Let's try optimizing the network architecture. Keeping the total number of nodes constant, let's alter the height and the depth of the network.

In [22]:
# first I will generate a spectrum of different heights and depths,
# keeping the number of nodes constant. I explore the space logarithmically.
attempts = 10
nodes = 20000
heights = np.zeros(attempts, dtype=int)
depths = np.zeros(attempts, dtype=int)
h = 100
d = int(nodes / h)
for i in range(attempts):
    h = h * 1.5
    d = d / 1.5
    depths[i] = int(d)
    heights[i] = int(h)
train_loss, train_acc, train_auc, test_loss, test_acc, test_auc = [0]*6 
try:
    train_loss = np.load("./jeh_checkpoints/train_loss.npy")
    train_acc = np.load("./jeh_checkpoints/train_acc.npy")
    train_auc = np.load("./jeh_checkpoints/train_auc.npy")
    test_loss = np.load("./jeh_checkpoints/test_loss.npy")
    test_acc = np.load("./jeh_checkpoints/test_acc.npy")
    test_auc = np.load("./jeh_checkpoints/test_auc.npy")
except FileNotFoundError:
    # Now I will create and train models, recording the loss, accuracy, and auc
    train_loss = np.zeros(attempts)
    train_acc = np.zeros(attempts)
    train_auc = np.zeros(attempts)
    test_loss = np.zeros(attempts)
    test_acc = np.zeros(attempts)
    test_auc = np.zeros(attempts)
    for i in range(attempts):
        print("Training model with depth %i."%depths[i])
        # Construct model
        model, optimizer = build_model(heights[i], depths[i])
        # Train the model
        train_model(model, loss_fn, train_loader, verbose=True)
        # Record loss
        train_loss[i], train_acc[i] = get_err(model, loss_fn, train_loader)
        train_auc[i] = get_auc(model, train_loader)
        test_loss[i], test_acc[i] = get_err(model, loss_fn, test_loader)
        test_auc[i] = get_auc(model, test_loader)
        
    # Save the results so that I don't need to retrain all the neural nets again
    np.save("./jeh_checkpoints/train_loss.npy", train_loss)
    np.save("./jeh_checkpoints/train_acc.npy", train_acc)
    np.save("./jeh_checkpoints/train_auc.npy", train_auc)
    np.save("./jeh_checkpoints/test_loss.npy", test_loss)
    np.save("./jeh_checkpoints/test_acc.npy", test_acc)
    np.save("./jeh_checkpoints/test_auc.npy", test_auc)

Now we can plot the errors that resulted across all training. 

In [23]:
def _depth_plot(title, y_label, train_vals, test_vals):
    """Return a figure of one metric against model depth (log x axis), with a
    blue training curve and a green testing curve. Reads the notebook-level
    ``depths`` array for the x values."""
    # `figure` is the supported constructor; the capitalised `Figure` alias
    # no longer exists in Bokeh 3.
    fig = bokeh.plotting.figure(title=title,
                                x_axis_label = "Model Depth (layers)",
                                y_axis_label = y_label,
                                x_axis_type = 'log',
                                height = 400, width = 300)
    fig.line(depths, train_vals, legend_label = "Training", color = "blue")
    fig.line(depths, test_vals, legend_label = "Testing", color = "green")
    return fig

auc_plot = _depth_plot("ROC AUC vs. Model Depth", "ROC AUC Score",
                       train_auc, test_auc)
acc_plot = _depth_plot("Accuracy vs. Model Depth", "Model Accuracy",
                       train_acc, test_acc)
loss_plot = _depth_plot("Smooth L1 Loss vs. Model Depth", "L1 Loss",
                        train_loss, test_loss)
# Only the loss panel has its legend moved to the top-left corner.
loss_plot.legend.location = "top_left"
bokeh.io.show(bokeh.layouts.row([auc_plot, acc_plot, loss_plot]))

We can clearly observe that shallow, tall neural nets perform much better than short, deep nets, given that the number of nodes is held constant. However, in the limit of the extremely shallow network, the gains in accuracy quickly plateau. 

In [24]:
# The sweep divides the depth by 1.5 at every step, so the last entry of
# `depths` is the shallowest network.
print("The shallowest network tested had depth = %i."%depths[-1])

The shallowest network tested had depth = 3.


Let's see if we can improve the model accuracy by going even shallower. 

In [25]:
# Even shallower than the sweep: two hidden blocks, each 10000 units wide.
model, optimizer = build_model(height=10000, depth=2)
print(model)

Sequential(
  (start): Linear(in_features=9, out_features=100, bias=True)
  (expand1): Linear(in_features=100, out_features=10000, bias=True)
  (expand2): Dropout(p=0.05, inplace=False)
  (expand3): ReLU()
  (1-0): Linear(in_features=10000, out_features=10000, bias=True)
  (2-0): ReLU()
  (3-0): Dropout(p=0.05, inplace=False)
  (1-1): Linear(in_features=10000, out_features=10000, bias=True)
  (2-1): ReLU()
  (3-1): Dropout(p=0.05, inplace=False)
  (narrow1): Linear(in_features=10000, out_features=100, bias=True)
  (narrow2): ReLU()
  (narrow3): Dropout(p=0.05, inplace=False)
  (final-1): Linear(in_features=100, out_features=4, bias=True)
  (final-2): Softmax(dim=1)
)


In [26]:
# Fit the shallow network using the notebook-level optimizer.
train_model(model, loss_fn, train_loader)

# Evaluate on the training data itself: loss and accuracy first, then ROC AUC.
train_loss_and_acc = get_err(model, loss_fn, train_loader)
print("Average Training loss: %.4f \nAverage Training Accuracy: %.2f" % train_loss_and_acc)

train_roc_auc = get_auc(model, train_loader)
print("Average Training ROC AUC: %.3f" % train_roc_auc)

Train Epoch: 1  Loss: 0.0467
Train Epoch: 2  Loss: 0.0509
Train Epoch: 3  Loss: 0.0580
Train Epoch: 4  Loss: 0.0885
Train Epoch: 5  Loss: 0.0971
Train Epoch: 6  Loss: 0.0495
Train Epoch: 7  Loss: 0.0603
Train Epoch: 8  Loss: 0.0656
Train Epoch: 9  Loss: 0.0517
Train Epoch: 10  Loss: 0.0608
Average Training loss: 0.0618 
Average Training Accuracy: 63.70
Average Training ROC AUC: 0.786


In [27]:
# Report the testing error. The accuracy label previously read "Training"
# although the value is computed on the test loader.
print("Average Testing loss: %.4f \nAverage Testing Accuracy: %.2f"
      %get_err(model, loss_fn, test_loader))
print("Average Testing ROC AUC: %.3f"
      %get_auc(model, test_loader))

Average Testing loss: 0.0622 
Average Training Accuracy: 63.71
Average Testing ROC AUC: 0.781


The benefits we get for fine-tuning the depth appear pretty small, and a similar analysis on the other neural net parameters yielded small benefits as well. Clearly, our model is learning from the data, but achieving a ROC AUC score higher than around 0.78 likely requires superior data preprocessing techniques or a different model class. 