In [1]:
import sys
sys.path.insert(0, '../')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from tqdm.notebook import tqdm
from typing import List, Tuple, Union

import candle.functions as F
import candle.optimizer

## Load data

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

digits = datasets.load_digits()

# Flatten each image into a single feature vector (one row per image).
X = digits.images.reshape((len(digits.images), -1))
# Centre and rescale the pixels. scikit-learn documents the digits pixel
# intensities as integers in 0..16, so this maps them to [-1, 1] (not [0, 1]).
X = (X - 8) / 8

N = 16  # Training data is only the first N examples
X_train = X[:N]
y_train = digits.target[:N]

NUM_FEATURES = X_train.shape[1]
# Number of distinct labels present in the N training examples; for small N
# this can be fewer than the number of classes in the full dataset.
NUM_CLASSES = len(np.unique(y_train))

In [4]:
# Wrap the training arrays as candle tensors. The names are kept because the
# training function defined later reads X_train / y_train as globals.
# NOTE: this cell rebinds its own inputs, so re-running it without re-running
# the data-loading cell passes Tensors (not arrays) back into candle.Tensor.
X_train, y_train = candle.Tensor(X_train), candle.Tensor(y_train)

(X_train.shape, y_train.shape)

((16, 64), (16,))

## Define MLP of Varying Depths

In [5]:
class MLP(candle.Module):
    """Multi-layer perceptron: a stack of linear layers with ReLU in between.

    ReLU is applied after every linear layer except the last, so `forward`
    returns the final layer's raw outputs.
    """

    def __init__(self,
                 input_size: int,
                 hidden_sizes: List[int]):
        """Build one linear layer per entry of `hidden_sizes`.

        Args:
            input_size: Number of input features.
            hidden_sizes: Output size of each successive linear layer. Despite
                the name, the LAST entry is the size of the model's output.
                Any sequence of ints is accepted (list, tuple, ...).

        Raises:
            ValueError: If `hidden_sizes` is empty (there would be no output
                layer for `forward` to apply).
        """
        # NOTE(review): assumes candle.Module.__init__ takes no arguments and
        # is safe to call before attributes are assigned -- confirm.
        super().__init__()

        # Copy into a list so tuples (or other sequences) can be concatenated below.
        layer_sizes = list(hidden_sizes)
        if not layer_sizes:
            raise ValueError('hidden_sizes must contain at least one size (the output size).')

        # Layer k maps the previous layer's size to layer_sizes[k]; the first
        # layer reads the raw input.
        self.linear_layers = candle.ParameterList([
            candle.Linear(n_in, n_out)
            for (n_in, n_out) in zip([input_size] + layer_sizes, layer_sizes)
        ])

    def forward(self, x):
        """Apply every layer in order, with ReLU after all but the last one."""
        for linear_layer in self.linear_layers[:-1]:
            x = linear_layer(x)
            x = F.relu(x)

        # No activation on the final layer.
        x = self.linear_layers[-1](x)

        return x

In [6]:
def create_mlp(depth: int, width: int):
    """Build an MLP with `depth` linear layers for the digits task.

    The first `depth - 1` layers each have `width` outputs and the final layer
    has NUM_CLASSES outputs; the input size is NUM_FEATURES.

    Args:
        depth: Total number of linear layers (must be >= 1).
        width: Output size of every layer except the last.

    Raises:
        ValueError: If `depth` is less than 1. Without this check a
            non-positive depth would silently yield a one-layer model, because
            `n * [width]` is an empty list for any n <= 0.
    """
    if depth < 1:
        raise ValueError(f'depth must be at least 1, got {depth}.')

    layer_sizes = [width] * (depth - 1) + [NUM_CLASSES]
    return MLP(input_size=NUM_FEATURES, hidden_sizes=layer_sizes)

## Experiment 1: (Gradient Norm and Loss vs. Depth) by Iteration

> We investigate how the gradient norm and the loss depend on model depth, iteration by iteration, for our simple MLPs under Kaiming initialization.

In [7]:
def get_grad_norm_and_loss_by_iteration(model,
                                        iterations: int,
                                        learning_rate: float):
    """Train `model` with SGD and record the gradient norm and loss at every step.

    Each iteration runs a forward pass on the global (X_train, y_train),
    backpropagates the cross-entropy loss, records the metrics, and only then
    takes the optimizer step. Row `i` therefore describes the model as it was
    BEFORE its (i + 1)-th update.

    Args:
        model: Module to train; it is updated in place.
        iterations: Number of SGD steps to run.
        learning_rate: Learning rate passed to SGD (weight decay is fixed at 0).

    Returns:
        DataFrame with float columns ['grad_norm', 'loss'] and an integer index
        named 'iteration' running from 0 to iterations - 1.
    """
    optimizer = candle.optimizer.SGD(model.parameters(),
                                     learning_rate=learning_rate,
                                     weight_decay=0.0)
    model.train()

    # NOTE(review): gradients are never explicitly cleared between iterations.
    # This is only correct if candle's backward()/step() resets them -- confirm.

    # Collect plain tuples and build the frame once at the end; growing a
    # DataFrame row by row is slow and leaves the columns with object dtype.
    records = []
    for _ in range(iterations):
        output = model(X_train)
        loss = F.cross_entropy_loss(output, y_train)
        loss.backward()

        # L2 norm of the gradient over all parameters, flattened into one vector.
        grad = np.concatenate([p.grad.flatten() for p in model.parameters().values()])
        records.append((float(np.linalg.norm(grad)), float(loss.data)))

        optimizer.step()

    return pd.DataFrame(records,
                        columns=['grad_norm', 'loss'],
                        index=pd.RangeIndex(iterations, name='iteration'),
                        dtype=float)

In [8]:
# Experiment 1 configuration.
# NOTE(review): no random seed is set in any visible cell, so results will
# differ from run to run -- consider seeding whatever RNG candle uses for
# parameter initialization.

# Number of repeated runs per depth (a new model is built for each run).
TRIALS = 100
# Model depths to compare: 2, 4, ..., 30.
DEPTHS = list(range(2, 32, 2))
# Output size of every layer except the last, shared by all depths.
WIDTH = 128

# Learning rate passed to SGD.
LEARNING_RATE = 3e-3
# Training iterations recorded per run.
ITERATIONS = 500

In [9]:
# Run every (trial, depth) combination and stack the per-iteration metrics
# into one long DataFrame indexed by iteration.
trial_frames = []
for trial in range(TRIALS):
    # Coarse progress indicator: one line every 10 trials.
    if trial % 10 == 0:
        print(trial)

    for depth in DEPTHS:
        # A new model is built for every (trial, depth) pair.
        model = create_mlp(depth, WIDTH)
        run_df = get_grad_norm_and_loss_by_iteration(model,
                                                     iterations=ITERATIONS,
                                                     learning_rate=LEARNING_RATE)

        # Tag the run without mutating the frame returned by the helper.
        trial_frames.append(run_df.assign(trial=trial, depth=depth))

all_trial_results = (
    pd.concat(trial_frames)
    # Guarantee numeric dtypes so the arithmetic and the later groupby
    # aggregations do not run on Python-object columns.
    .astype({'grad_norm': float, 'loss': float})
    .assign(grad_norm_squared=lambda df: df['grad_norm'] ** 2)
)

0
10
20
30
40
50
60
70
80
90


In [19]:
# Scratch inputs for the summary cell that follows: which iteration label to
# slice out of all_trial_results, and which metric column to summarise.
iteration = 1
metric_to_plot = 'grad_norm_squared'

In [20]:
# Summarise a single iteration across trials: the per-depth mean, plus a band
# of (per-depth std / sqrt(TRIALS)) on either side of it.
by_depth = all_trial_results.loc[iteration].groupby(['depth'])
mean = by_depth.mean()
std = by_depth.std() / np.sqrt(TRIALS)  # despite the name, already divided by sqrt(TRIALS)

lower = mean[metric_to_plot] - std[metric_to_plot]
upper = mean[metric_to_plot] + std[metric_to_plot]
data_to_plot = mean[[metric_to_plot]].assign(min=lower, max=upper).reset_index()

In [45]:
def plot(iterations_to_plot: List[int],
         metric_to_plot: str,
         metric_label: str):
    """Display one chart per iteration: mean metric vs. model depth, with error bars.

    Reads the global `all_trial_results` (indexed by iteration, with a 'depth'
    column). For each requested iteration the metric is averaged over all rows
    at each depth, and the error bars span one standard error of that mean on
    either side.

    Args:
        iterations_to_plot: Iteration labels to chart; any iterable of ints
            works (e.g. a range).
        metric_to_plot: Name of the column in `all_trial_results` to summarise.
        metric_label: Human-readable metric name for the title and y-axis.
    """
    for iteration in iterations_to_plot:
        # Only the grouping key and the requested metric are needed; the float
        # cast keeps the aggregation numeric even if the column is object-typed.
        at_iteration = (all_trial_results.loc[iteration][['depth', metric_to_plot]]
                        .astype({metric_to_plot: float}))
        by_depth = at_iteration.groupby('depth')[metric_to_plot]

        mean = by_depth.mean()
        # Standard error from the actual number of rows per depth, rather than
        # from the global TRIALS constant (which can be stale in a notebook).
        sem = by_depth.sem()

        data_to_plot = pd.DataFrame({
            metric_to_plot: mean,
            'min': mean - sem,
            'max': mean + sem,
        }).reset_index()

        base = alt.Chart(data_to_plot).properties(
            height=400,
            width=600,
            title=f'Iteration {iteration}: {metric_label} vs. Model Depth')

        # Both layers use the same axis titles; layers with differing titles
        # end up with a combined axis label in the layered chart.
        points = (base.mark_point(filled=True, size=50, color='black')
                  .encode(alt.X('depth:Q', title='Model Depth'),
                          alt.Y(f'{metric_to_plot}:Q', title=metric_label)))

        errorbars = (base.mark_errorbar(ticks=True)
                     .encode(alt.X('depth:Q', title='Model Depth'),
                             alt.Y('min:Q', title=metric_label),
                             alt.Y2('max:Q'),
                             color=alt.value('#4682b4')))

        chart = points + errorbars

        display(chart)

In [46]:
# Squared gradient norm vs. depth, one chart for each of iterations 0, 10, ..., 90.
plot(iterations_to_plot=range(0, 100, 10),
     metric_to_plot='grad_norm_squared',
     metric_label='Squared Gradient Norm')

In [47]:
# Loss vs. depth, one chart for each of iterations 0, 10, ..., 90.
plot(iterations_to_plot=range(0, 100, 10),
     metric_to_plot='loss',
     metric_label='Loss')