# Weight initialisation techniques in PyTorch

Data used for this notebook is from a Kaggle competition  
Link to the competition: https://www.kaggle.com/c/santander-customer-transaction-prediction  
Type of Problem: Classification  

## Import libraries

In [1]:
import torch
from torch import nn

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

## Get training data

In [2]:
df_train = pd.read_csv('data/train.csv')

print(df_train.shape)
df_train.head(2)

(200000, 202)


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518


In [3]:
df_train.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


## Pre-processing data

In [4]:
var_columns = [c for c in df_train.columns if c not in ('ID_code','target')]
len(var_columns)

200

In [5]:
scaler = MinMaxScaler()

df_train[var_columns] = scaler.fit_transform(df_train[var_columns])
df_train.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,0.515985,0.527761,0.498848,0.516818,0.517698,0.551997,0.501877,0.501123,0.52233,...,0.532601,0.52195,0.470032,0.502746,0.483477,0.536917,0.507605,0.488022,0.483899,0.52746
std,0.300653,0.152716,0.159324,0.153221,0.154463,0.139968,0.157852,0.142057,0.152988,0.161333,...,0.140158,0.155773,0.121015,0.132779,0.162998,0.149925,0.167666,0.152592,0.152223,0.154974
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.40416,0.405322,0.383234,0.400217,0.414637,0.428839,0.396761,0.384659,0.396368,...,0.431376,0.404422,0.385063,0.411373,0.360573,0.429027,0.376952,0.379516,0.380712,0.410436
50%,0.0,0.508191,0.52853,0.491004,0.51897,0.520277,0.556658,0.497967,0.497138,0.527633,...,0.531653,0.517279,0.46786,0.504894,0.481614,0.533706,0.51085,0.484718,0.4871,0.534987
75%,0.0,0.620387,0.645236,0.603369,0.632294,0.619692,0.672246,0.599256,0.615573,0.650798,...,0.630091,0.628818,0.553659,0.59834,0.608396,0.638836,0.638353,0.60146,0.59482,0.648661
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Split training data into dependent and independent variables

In [6]:
X_np = df_train.loc[:, var_columns].to_numpy()
y_np = df_train.loc[:, 'target'].to_numpy()

In [7]:
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32).reshape(-1,1)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

(torch.Size([160000, 200]),
 torch.Size([40000, 200]),
 torch.Size([160000, 1]),
 torch.Size([40000, 1]))

## 1. Default weight initializations

In [14]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(200, 10)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(10, 1)
        self.act2 = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        return x

In [15]:
def train_model(model):
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

    batch_size = 1000
    n_epochs = 15

    for epoch in range(n_epochs):
        for b in range(0, X_train.shape[0], batch_size):

            # Get data in batches
            X_train_batch = X_train[b:b+batch_size]
            y_train_batch = y_train[b:b+batch_size]

            # Make predictions
            y_train_batch_pred = model(X_train_batch)
            y_val_pred = model(X_val)

            # Calculate loss
            loss_train = loss_fn(y_train_batch_pred, y_train_batch)
            loss_val = loss_fn(y_val_pred, y_val)

            # Backpropagation
            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()
        
        print(f'Epoch {epoch}, training loss {loss_train}, validation loss {loss_val}')

    return model

In [16]:
def print_auc(model, X_val, y_val):
    y_val_pred = model(X_val)

    fpr, tpr, _ = roc_curve(y_val.detach().numpy(), y_val_pred.detach().numpy())
    roc_auc = auc(fpr, tpr)

    print(f"\nROC AUC: {roc_auc}")

In [17]:
model = Model()
model = train_model(model)

print_auc(model, X_val, y_val)

Epoch 0, training loss 0.20981259644031525, validation loss 0.27239277958869934
Epoch 1, training loss 0.18198689818382263, validation loss 0.2407095730304718
Epoch 2, training loss 0.18049003183841705, validation loss 0.2396184504032135
Epoch 3, training loss 0.181327223777771, validation loss 0.2433619350194931
Epoch 4, training loss 0.18051311373710632, validation loss 0.24122726917266846
Epoch 5, training loss 0.18088243901729584, validation loss 0.2419077605009079
Epoch 6, training loss 0.18101944029331207, validation loss 0.24195095896720886
Epoch 7, training loss 0.1809610277414322, validation loss 0.24128766357898712
Epoch 8, training loss 0.18088066577911377, validation loss 0.24092021584510803
Epoch 9, training loss 0.18076667189598083, validation loss 0.2405151128768921
Epoch 10, training loss 0.18068315088748932, validation loss 0.2401895970106125
Epoch 11, training loss 0.18060536682605743, validation loss 0.23987551033496857
Epoch 12, training loss 0.1805875599384308, val

## 2. Zero Initialization

- Setting all weights to zero
- While simple, it's generally not recommended the network will not be able to learn any meaningful patterns
- Also, if all weights are set to the same non-zero value, the neurons in the same layer would learn the same features and may not break symmetry during training

https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.zeros_

In [20]:
class ModelZero(nn.Module):
    def __init__(self):
        super(ModelZero, self).__init__()
        self.fc1 = nn.Linear(200, 10)
        nn.init.zeros_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        self.act1 = nn.ReLU()
        
        self.fc2 = nn.Linear(10, 1)
        nn.init.zeros_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)
        self.act2 = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        return x

In [21]:
model = ModelZero()

In [22]:
print("Initial Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Initial Weights:
fc1.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
fc1.bias tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
fc2.weight tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
fc2.bias tensor([0.])


In [23]:
model = train_model(model)

print_auc(model, X_val, y_val)

Epoch 0, training loss 0.27075910568237305, validation loss 0.33037933707237244
Epoch 1, training loss 0.270715594291687, validation loss 0.3303798735141754
Epoch 2, training loss 0.2708769142627716, validation loss 0.33037886023521423
Epoch 3, training loss 0.27125558257102966, validation loss 0.33038830757141113
Epoch 4, training loss 0.27178576588630676, validation loss 0.33042722940444946
Epoch 5, training loss 0.27236074209213257, validation loss 0.33049848675727844
Epoch 6, training loss 0.2728714942932129, validation loss 0.33058375120162964
Epoch 7, training loss 0.2732606828212738, validation loss 0.3306611478328705
Epoch 8, training loss 0.27352845668792725, validation loss 0.3307199776172638
Epoch 9, training loss 0.27370473742485046, validation loss 0.3307611048221588
Epoch 10, training loss 0.27381816506385803, validation loss 0.3307884633541107
Epoch 11, training loss 0.2738843858242035, validation loss 0.33080488443374634
Epoch 12, training loss 0.27390938997268677, vali

In [24]:
print("Trained Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)


ROC AUC: 0.5


## 3. Random Initialization (Uniform or Normal)

- Initializing weights randomly from a uniform or normal distribution
- This is a common practice to break symmetry
- The weights can be drawn from a distribution with a mean of 0 and a small variance

https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.uniform_
https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.normal_

In [37]:
class ModelNormal(nn.Module):
    def __init__(self):
        super(ModelNormal, self).__init__()
        self.fc1 = nn.Linear(200, 10)
        nn.init.normal_(self.fc1.weight, mean=0, std=0.01)
        nn.init.normal_(self.fc1.bias, mean=0, std=0.01)
        self.act1 = nn.ReLU()
        
        self.fc2 = nn.Linear(10, 1)
        nn.init.normal_(self.fc2.weight, mean=0, std=0.01)
        nn.init.normal_(self.fc2.bias, mean=0, std=0.01)
        self.act2 = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        return x

In [38]:
model = ModelNormal()

In [39]:
print("Initial Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Initial Weights and Bias:
fc1.weight tensor([[ 2.7015e-03, -1.1805e-02,  7.5538e-03,  ..., -6.7114e-03,
         -3.1334e-03, -8.3359e-03],
        [ 4.6629e-03,  2.2604e-03,  5.6639e-04,  ..., -3.3982e-03,
          5.0068e-05,  2.5671e-03],
        [ 1.0387e-02,  1.0421e-02, -8.8279e-03,  ..., -2.1859e-02,
         -4.2260e-03,  2.2272e-03],
        ...,
        [-6.8930e-04, -8.0790e-03, -1.3315e-02,  ..., -7.1349e-04,
          1.5262e-02,  6.5075e-03],
        [-4.8207e-03,  2.4344e-03, -2.3114e-02,  ...,  7.2144e-03,
          2.1872e-02,  3.3376e-03],
        [ 4.0601e-03, -2.9588e-04, -1.3732e-02,  ...,  8.7980e-03,
         -6.6848e-03, -5.3351e-05]])
fc1.bias tensor([-0.0119,  0.0024,  0.0031, -0.0046, -0.0095,  0.0017,  0.0149, -0.0175,
        -0.0043,  0.0082])
fc2.weight tensor([[ 0.0056,  0.0063, -0.0021, -0.0097, -0.0024, -0.0222,  0.0103, -0.0109,
         -0.0061,  0.0064]])
fc2.bias tensor([-0.0036])


In [40]:
model = train_model(model)

print_auc(model, X_val, y_val)

Epoch 0, training loss 0.19017332792282104, validation loss 0.25739043951034546
Epoch 1, training loss 0.18578745424747467, validation loss 0.2525328993797302
Epoch 2, training loss 0.18091945350170135, validation loss 0.23968720436096191
Epoch 3, training loss 0.1808839589357376, validation loss 0.23908965289592743
Epoch 4, training loss 0.18148185312747955, validation loss 0.24301943182945251
Epoch 5, training loss 0.1813228577375412, validation loss 0.24326050281524658
Epoch 6, training loss 0.18113715946674347, validation loss 0.24277807772159576
Epoch 7, training loss 0.18091465532779694, validation loss 0.24212539196014404
Epoch 8, training loss 0.1807674765586853, validation loss 0.2413400560617447
Epoch 9, training loss 0.1807074397802353, validation loss 0.24100083112716675
Epoch 10, training loss 0.1806938797235489, validation loss 0.24066118896007538
Epoch 11, training loss 0.18066376447677612, validation loss 0.24011307954788208
Epoch 12, training loss 0.18063747882843018, 

In [41]:
print("Trained Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Trained Weights and Bias:
fc1.weight tensor([[-0.5978, -0.6123, -0.5930,  ..., -0.6072, -0.6037, -0.6089],
        [-0.5959, -0.5983, -0.6000,  ..., -0.6039, -0.6005, -0.5980],
        [-0.2468, -0.2678, -0.2771,  ..., -0.2523, -0.2361, -0.2607],
        ...,
        [-1.0152, -0.9671, -1.1459,  ...,  0.6120,  0.9139, -0.6406],
        [-0.1110, -0.1356, -0.1514,  ..., -0.0550, -0.0527, -0.1155],
        [-0.5965, -0.6008, -0.6143,  ..., -0.5917, -0.6072, -0.6006]])
fc1.bias tensor([-0.6124, -0.5982, -0.2501, -0.1897, -0.2216,  1.4588, -0.5857, -0.0634,
        -0.1068, -0.5924])
fc2.weight tensor([[-0.5949, -0.5942,  0.2637,  0.0727,  0.2300, -0.4897, -0.5903, -0.4697,
          0.2174, -0.5941]])
fc2.bias tensor([1.3593])


## 4. Xavier Initialization

- This initialization is designed to work well with activation functions like tanh or sigmoid
- The weights are initialized by drawing from a distribution with a mean of 0 and a variance of $\frac{1}{number of input units}$

https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_
https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_normal_

In [65]:
class ModelXavier(nn.Module):
    def __init__(self):
        super(ModelXavier, self).__init__()
        self.fc1 = nn.Linear(200, 10)
        nn.init.xavier_normal_(self.fc1.weight, gain=0.01)
        nn.init.normal_(self.fc1.bias, mean=0, std=0.01)
        self.act1 = nn.ReLU()
        
        self.fc2 = nn.Linear(10, 1)
        nn.init.xavier_normal_(self.fc2.weight, gain=0.1)
        nn.init.normal_(self.fc1.bias, mean=0, std=0.01)
        self.act2 = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        return x

In [66]:
model = ModelXavier()

In [67]:
print("Initial Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Initial Weights and Bias:
fc1.weight tensor([[-2.5966e-04, -1.0212e-03, -5.9727e-04,  ..., -4.7122e-04,
          1.1819e-03, -5.7711e-06],
        [-4.1296e-05, -5.4345e-04,  4.6347e-04,  ..., -1.7259e-03,
         -1.0085e-03,  5.8473e-04],
        [-6.5564e-04,  6.8766e-04,  5.6653e-04,  ..., -1.7195e-03,
          6.2336e-04, -2.7550e-04],
        ...,
        [ 1.3343e-03, -4.9700e-04,  6.7129e-04,  ...,  5.7118e-04,
         -1.7764e-03,  1.5043e-04],
        [ 4.7460e-04,  1.4797e-03,  1.2245e-03,  ..., -1.1826e-03,
         -7.4882e-04,  4.4720e-04],
        [-8.8914e-04,  1.6558e-03,  1.9643e-03,  ...,  7.2793e-04,
         -1.2370e-04, -9.4301e-04]])
fc1.bias tensor([ 0.0079, -0.0061, -0.0105, -0.0165, -0.0117, -0.0026, -0.0103,  0.0005,
        -0.0043, -0.0228])
fc2.weight tensor([[ 0.0212, -0.0477,  0.0423,  0.0379, -0.0147,  0.0505,  0.0691, -0.0131,
         -0.0659,  0.0159]])
fc2.bias tensor([0.1183])


In [68]:
model = train_model(model)

print_auc(model, X_val, y_val)

Epoch 0, training loss 0.19058926403522491, validation loss 0.2590586543083191
Epoch 1, training loss 0.18031957745552063, validation loss 0.24043747782707214
Epoch 2, training loss 0.18091513216495514, validation loss 0.23802395164966583
Epoch 3, training loss 0.18106231093406677, validation loss 0.24094155430793762
Epoch 4, training loss 0.18134278059005737, validation loss 0.24259433150291443
Epoch 5, training loss 0.18132498860359192, validation loss 0.24312929809093475
Epoch 6, training loss 0.18106289207935333, validation loss 0.24250295758247375
Epoch 7, training loss 0.18086721003055573, validation loss 0.24153940379619598
Epoch 8, training loss 0.18079720437526703, validation loss 0.2412882298231125
Epoch 9, training loss 0.18072886765003204, validation loss 0.24089324474334717
Epoch 10, training loss 0.1806761920452118, validation loss 0.24055491387844086
Epoch 11, training loss 0.1806531697511673, validation loss 0.24002587795257568
Epoch 12, training loss 0.1806255429983139

In [69]:
print("Trained Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Trained Weights and Bias:
fc1.weight tensor([[-6.0081e-01, -6.0157e-01, -6.0114e-01,  ..., -6.0102e-01,
         -5.9937e-01, -6.0055e-01],
        [-4.1296e-05, -5.4345e-04,  4.6347e-04,  ..., -1.7259e-03,
         -1.0085e-03,  5.8473e-04],
        [-6.5564e-04,  6.8766e-04,  5.6653e-04,  ..., -1.7195e-03,
          6.2336e-04, -2.7550e-04],
        ...,
        [-2.9917e-01, -3.6219e-01, -3.3916e-01,  ..., -1.7079e-01,
         -1.9885e-01, -3.1959e-01],
        [-2.2273e+00, -1.9612e+00, -2.3008e+00,  ...,  1.7770e+00,
          2.5582e+00, -9.8628e-01],
        [-6.0082e-01, -5.9848e-01, -5.9813e-01,  ..., -5.9934e-01,
         -6.0007e-01, -6.0102e-01]])
fc1.bias tensor([-0.5926, -0.0061, -0.0105, -0.0165, -0.8813, -0.6031, -0.6108, -0.2727,
         1.4858, -0.6231])
fc2.weight tensor([[-0.5794, -0.0477,  0.0423,  0.0379,  0.9200, -0.5500, -0.5314,  0.2287,
         -0.4831, -0.5781]])
fc2.bias tensor([1.3658])


## 5. He Initialization

- This initialization is designed for ReLU and its variants
- The weights are initialized by drawing from a distribution with a mean of 0 and a variance of $\frac{2}{number of input units}$

https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_uniform_
https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_

In [70]:
class ModelHe(nn.Module):
    def __init__(self):
        super(ModelHe, self).__init__()
        self.fc1 = nn.Linear(200, 10)
        nn.init.kaiming_normal_(self.fc1.weight, mode='fan_in', nonlinearity='relu')
        nn.init.normal_(self.fc1.bias, mean=0, std=0.01)
        self.act1 = nn.ReLU()
        
        self.fc2 = nn.Linear(10, 1)
        nn.init.xavier_normal_(self.fc2.weight, gain=0.1)
        nn.init.normal_(self.fc1.bias, mean=0, std=0.01)
        self.act2 = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        x = self.act2(x)
        return x

In [71]:
model = ModelHe()

In [72]:
print("Initial Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Initial Weights and Bias:
fc1.weight tensor([[-3.1251e-02,  1.0257e-01,  2.6924e-02,  ...,  1.1328e-01,
          6.5151e-02,  7.2830e-02],
        [ 1.6555e-01,  2.9367e-01, -3.3813e-02,  ...,  1.5567e-01,
          1.8278e-01,  1.9932e-02],
        [ 2.4959e-04, -7.1816e-02, -1.0658e-01,  ..., -5.7668e-02,
          1.9273e-02,  1.2826e-02],
        ...,
        [-8.7776e-02,  7.0288e-02,  9.0008e-02,  ...,  4.0887e-02,
         -2.0022e-02, -1.2602e-02],
        [-1.0190e-01,  1.3763e-02,  5.2266e-02,  ..., -7.9132e-02,
          1.0709e-01,  3.1396e-02],
        [ 1.8228e-01,  7.0387e-03,  3.3341e-02,  ...,  4.2306e-02,
          2.6777e-02,  4.1782e-02]])
fc1.bias tensor([ 1.0474e-02,  1.3299e-02, -2.0440e-02,  4.4785e-03, -2.7267e-02,
        -4.9259e-03,  5.9543e-04,  1.3017e-02, -3.7155e-05, -1.0386e-02])
fc2.weight tensor([[ 0.0227, -0.0470, -0.0505, -0.0272, -0.0105, -0.1039,  0.0655, -0.0221,
         -0.0168,  0.0188]])
fc2.bias tensor([-0.2996])


In [73]:
model = train_model(model)

print_auc(model, X_val, y_val)

Epoch 0, training loss 0.18820826709270477, validation loss 0.24117988348007202
Epoch 1, training loss 0.18081921339035034, validation loss 0.23843751847743988
Epoch 2, training loss 0.18089459836483002, validation loss 0.2381526380777359
Epoch 3, training loss 0.1811688095331192, validation loss 0.24073772132396698
Epoch 4, training loss 0.18142426013946533, validation loss 0.24262553453445435
Epoch 5, training loss 0.18137603998184204, validation loss 0.24316850304603577
Epoch 6, training loss 0.18109843134880066, validation loss 0.2425648421049118
Epoch 7, training loss 0.18090561032295227, validation loss 0.241700679063797
Epoch 8, training loss 0.18082751333713531, validation loss 0.24138084053993225
Epoch 9, training loss 0.18075741827487946, validation loss 0.24106821417808533
Epoch 10, training loss 0.1807030439376831, validation loss 0.24072396755218506
Epoch 11, training loss 0.18067608773708344, validation loss 0.240186870098114
Epoch 12, training loss 0.1806517392396927, va

In [74]:
print("Trained Weights and Bias:")
for name, param in model.named_parameters():
    print(name, param.data)

Trained Weights and Bias:
fc1.weight tensor([[-0.6318, -0.4980, -0.5736,  ..., -0.4873, -0.5354, -0.5277],
        [-0.2023, -0.1256, -0.4141,  ..., -0.1273, -0.0853, -0.3556],
        [-0.5871, -0.6607, -0.6977,  ..., -0.6359, -0.5577, -0.5733],
        ...,
        [-0.5932, -0.4417, -0.4105,  ..., -0.4394, -0.5163, -0.5137],
        [-0.3495, -0.2556, -0.2155,  ..., -0.2802, -0.0954, -0.2243],
        [-0.4182, -0.5935, -0.5672,  ..., -0.5582, -0.5737, -0.5587]])
fc1.bias tensor([-0.5901, -0.3308, -0.6020, -0.3135, -0.5553,  1.4424, -0.6000, -0.4846,
        -0.2364, -0.6109])
fc2.weight tensor([[-0.5778, -0.1988,  0.5601, -0.0260,  0.5166, -0.4950, -0.5350,  0.3654,
          0.1805, -0.5818]])
fc2.bias tensor([1.3718])
