# MLP

In [192]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim # optimizers
from torchsummary import summary

**Model**

In [193]:
model = nn.Sequential(
    nn.Linear(1, 1),
    nn.Linear(1, 1),
    nn.Sigmoid()
)

In [194]:
print(model)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
  (1): Linear(in_features=1, out_features=1, bias=True)
  (2): Sigmoid()
)


Before we start, let's define several terms that will be used in this notebook:

- `torch.cuda.is_available()`: This function checks whether or not CUDA (NVIDIA GPU) is available on computer.
- `torch.device(...)`: This function returns a device object representing the device that you want to run the tensor on. The conditional expression is used to determine whether to use GPU or CPU.
- `.to(device)`: This method moves all parameters and buffers of the model to selected device.

In [195]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 1))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 1]               2
            Linear-2                 [-1, 1, 1]               2
           Sigmoid-3                 [-1, 1, 1]               0
Total params: 4
Trainable params: 4
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [196]:
# '.children()' returns an iterator covering all the layers in the model
# '.state_dict()' returns the parameters of the layer
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.2248]], device='cuda:0')), ('bias', tensor([-0.5435], device='cuda:0'))])
OrderedDict([('weight', tensor([[0.2021]], device='cuda:0')), ('bias', tensor([-0.2123], device='cuda:0'))])
OrderedDict()


In [197]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Linear(2, 2),
    nn.Sigmoid()
)

In [198]:
print(model)

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
  (2): Sigmoid()
)


In [199]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
            Linear-2                 [-1, 1, 2]               6
           Sigmoid-3                 [-1, 1, 2]               0
Total params: 12
Trainable params: 12
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [200]:
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[ 0.7063, -0.4279],
        [ 0.6013,  0.4395]], device='cuda:0')), ('bias', tensor([-0.2925,  0.0813], device='cuda:0'))])
OrderedDict([('weight', tensor([[ 0.0709, -0.2933],
        [-0.5074,  0.2684]], device='cuda:0')), ('bias', tensor([ 0.5376, -0.3059], device='cuda:0'))])
OrderedDict()


**Sample**

In [201]:
x = torch.tensor([1.0, 2.0])
y = torch.tensor([0.0])

In [202]:
x, y

(tensor([1., 2.]), tensor([0.]))

### BCELoss

In [203]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [204]:
print(model)

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=1, bias=True)
  (2): Sigmoid()
)


In [205]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
            Linear-2                 [-1, 1, 1]               3
           Sigmoid-3                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [206]:
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[-0.1995, -0.4116],
        [-0.1752, -0.4569]], device='cuda:0')), ('bias', tensor([0.4787, 0.4670], device='cuda:0'))])
OrderedDict([('weight', tensor([[-0.2510,  0.6850]], device='cuda:0')), ('bias', tensor([0.0052], device='cuda:0'))])
OrderedDict()


In [207]:
# `.parameters()` returns an iterator over all the parameters of the model
# `.init.constant_()` initializes the parameters to a constant value
for p in model.parameters():
    nn.init.constant_(p, 0.1) # initialize all parameters to 0.1

In [208]:
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.1000, 0.1000],
        [0.1000, 0.1000]], device='cuda:0')), ('bias', tensor([0.1000, 0.1000], device='cuda:0'))])
OrderedDict([('weight', tensor([[0.1000, 0.1000]], device='cuda:0')), ('bias', tensor([0.1000], device='cuda:0'))])
OrderedDict()


In [209]:
x = x.to(device)
y_pred = model(x)
y_pred

tensor([0.5449], device='cuda:0', grad_fn=<SigmoidBackward0>)

**Activation**

In [210]:
import torch.nn as nn

act = nn.Sigmoid()
input = torch.tensor([0.18, -0.18])
act(input)

tensor([0.5449, 0.4551])

In [211]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [212]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
           Sigmoid-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [213]:
import torch.nn as nn

act = nn.Tanh()
input = torch.tensor([0.18, -0.18])
act(input)

tensor([ 0.1781, -0.1781])

In [214]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Tanh(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [215]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              Tanh-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [216]:
import torch.nn as nn

act = nn.ReLU()
input = torch.tensor([0.18, -0.18])
act(input)

tensor([0.1800, 0.0000])

In [217]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [218]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              ReLU-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


**Loss**

In [219]:
import torch.nn as nn
loss_fn = nn.BCELoss() # Binary Cross Entropy Loss

In [220]:
y_pred

tensor([0.5449], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [221]:
y

tensor([0.])

In [222]:
y = y.to(device)
loss = loss_fn(y_pred, y)
loss

tensor(0.7872, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)

In [223]:
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[-0.5667, -0.0692],
        [ 0.2444,  0.1418]], device='cuda:0')), ('bias', tensor([0.4790, 0.4418], device='cuda:0'))])
OrderedDict()
OrderedDict([('weight', tensor([[0.6743, 0.0602]], device='cuda:0')), ('bias', tensor([-0.5425], device='cuda:0'))])
OrderedDict()


**SGD**

In [224]:
learning_rate = 0.1
optimizer = optim.SGD(model.parameters(), learning_rate)

In [225]:
loss.backward() # compute gradients of all parameters

In [226]:
optimizer.step() # update all parameters

In [227]:
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[-0.5667, -0.0692],
        [ 0.2444,  0.1418]], device='cuda:0')), ('bias', tensor([0.4790, 0.4418], device='cuda:0'))])
OrderedDict()
OrderedDict([('weight', tensor([[0.6743, 0.0602]], device='cuda:0')), ('bias', tensor([-0.5425], device='cuda:0'))])
OrderedDict()


### CrossEntropyLoss

In [228]:
x = torch.tensor([1.0, 2.0])
y = torch.tensor(0)

In [229]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 2),
)

In [230]:
print(model)

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): ReLU()
  (2): Linear(in_features=2, out_features=2, bias=True)
)


In [231]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              ReLU-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 2]               6
Total params: 12
Trainable params: 12
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [232]:
x = x.to(device)
y_pred = model(x)
y_pred

tensor([0.0556, 1.5183], device='cuda:0', grad_fn=<ViewBackward0>)

In [233]:
import torch.nn as nn
loss_fn = nn.CrossEntropyLoss()

In [234]:
y = torch.tensor(0)
y

tensor(0)

In [235]:
y_pred

tensor([0.0556, 1.5183], device='cuda:0', grad_fn=<ViewBackward0>)

In [236]:
y = y.to(device)
loss_fn(y_pred, y)

tensor(1.6710, device='cuda:0', grad_fn=<NllLossBackward0>)

## Classification

In [237]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [238]:
data = load_iris()

In [239]:
data.data.shape

(150, 4)

In [240]:
data.target.shape

(150,)

In [241]:
X_train, X_test, Y_train, Y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3
)

In [242]:
from sklearn.preprocessing import StandardScaler

In [243]:
scaler = StandardScaler()

In [244]:
scaler.fit(X_train)

In [245]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [246]:
X_train

array([[ 0.47596424,  0.59846315,  0.48376368,  0.50168462],
       [ 0.85172548, -0.29286494,  0.42571204,  0.09751415],
       [ 0.22545674, -0.29286494,  0.48376368,  0.23223764],
       [ 0.60121798, -0.51569697,  1.00622846,  1.31002556],
       [ 0.22545674, -0.96136101,  1.00622846,  0.23223764],
       [ 0.72647173, -0.51569697,  0.42571204,  0.36696113],
       [-0.02505075, -0.07003292,  0.19350547,  0.36696113],
       [-0.1503045 ,  2.1582873 , -1.54804379, -1.38444425],
       [ 0.100203  , -0.73852899,  0.71597025,  0.50168462],
       [-0.52606574, -1.18419303,  0.07740219,  0.09751415],
       [ 2.22951669, -0.96136101,  1.76089981,  1.44474905],
       [-0.90182698,  2.38111932, -1.37388886, -1.51916774],
       [ 1.35274046,  0.37563112,  0.48376368,  0.23223764],
       [-1.15233447, -1.62985708, -0.32895931, -0.30665633],
       [-0.1503045 , -0.73852899,  0.13545383, -0.30665633],
       [-0.90182698,  1.04412719, -1.37388886, -1.38444425],
       [ 1.60324796, -0.

In [247]:
X_test

array([[-0.40081199, -0.07003292,  0.13545383,  0.09751415],
       [ 0.100203  ,  0.82129517,  0.3676604 ,  0.50168462],
       [-1.15233447,  0.59846315, -1.43194051, -1.38444425],
       [-0.52606574, -1.62985708,  0.07740219,  0.09751415],
       [-0.52606574, -1.40702505, -0.09675274, -0.30665633],
       [-1.65334946,  1.26695921, -1.66414707, -1.38444425],
       [-1.15233447,  1.04412719, -1.48999215, -1.24972076],
       [-1.27758822,  1.26695921, -1.43194051, -1.51916774],
       [-1.15233447, -2.29835314, -0.21285602, -0.30665633],
       [-1.02708072,  1.71262326, -1.37388886, -1.24972076],
       [ 0.60121798,  0.37563112,  0.3676604 ,  0.36696113],
       [ 0.35071049,  0.82129517,  0.89012518,  1.44474905],
       [ 0.47596424,  0.82129517,  1.00622846,  1.57947254],
       [-1.15233447,  0.82129517, -1.31583722, -1.11499727],
       [ 0.60121798, -0.73852899,  0.83207354,  0.90585509],
       [ 0.72647173, -0.07003292,  1.12233175,  1.31002556],
       [ 0.35071049, -0.

In [248]:
X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train)
X_test = torch.tensor(X_test, dtype=torch.float32)
Y_test = torch.tensor(Y_test)

In [249]:
X_train

tensor([[ 0.4760,  0.5985,  0.4838,  0.5017],
        [ 0.8517, -0.2929,  0.4257,  0.0975],
        [ 0.2255, -0.2929,  0.4838,  0.2322],
        [ 0.6012, -0.5157,  1.0062,  1.3100],
        [ 0.2255, -0.9614,  1.0062,  0.2322],
        [ 0.7265, -0.5157,  0.4257,  0.3670],
        [-0.0251, -0.0700,  0.1935,  0.3670],
        [-0.1503,  2.1583, -1.5480, -1.3844],
        [ 0.1002, -0.7385,  0.7160,  0.5017],
        [-0.5261, -1.1842,  0.0774,  0.0975],
        [ 2.2295, -0.9614,  1.7609,  1.4447],
        [-0.9018,  2.3811, -1.3739, -1.5192],
        [ 1.3527,  0.3756,  0.4838,  0.2322],
        [-1.1523, -1.6299, -0.3290, -0.3067],
        [-0.1503, -0.7385,  0.1355, -0.3067],
        [-0.9018,  1.0441, -1.3739, -1.3844],
        [ 1.6032, -0.0700,  1.1223,  0.5017],
        [ 1.2275,  0.1528,  0.5999,  0.3670],
        [-1.2776,  0.1528, -1.3739, -1.3844],
        [ 1.2275,  0.1528,  0.8901,  1.1753],
        [ 0.9770, -1.1842,  1.1223,  0.7711],
        [ 1.6032,  0.3756,  1.2384

In [250]:
Y_train

tensor([1, 1, 1, 2, 2, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 0, 2, 1, 0, 2, 2, 2, 0, 1,
        2, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 1, 0, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 0, 2, 1, 1,
        0, 2, 1, 1, 2, 0, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 0, 0, 0, 2,
        1, 2, 1, 2, 2, 1, 0, 2, 2])

In [251]:
model_classifier = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 3)
)

In [252]:
model_classifier

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=3, bias=True)
)

In [253]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_classifier.to(device)
summary(model_classifier, (1, 4))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 8]              40
              ReLU-2                 [-1, 1, 8]               0
            Linear-3                 [-1, 1, 3]              27
Total params: 67
Trainable params: 67
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [254]:
X_train[0]

tensor([0.4760, 0.5985, 0.4838, 0.5017])

In [255]:
X_train = X_train.to(device)
y_pred = model_classifier(X_train[0])
y_pred

tensor([-0.2962, -0.0119, -0.1632], device='cuda:0', grad_fn=<ViewBackward0>)

In [256]:
loss_fn = nn.CrossEntropyLoss()

In [257]:
loss = loss_fn(y_pred, Y_train[0].to(device))
loss

tensor(0.9601, device='cuda:0', grad_fn=<NllLossBackward0>)

In [258]:
loss.item() # get the value of the loss

0.9601325988769531

In [259]:
learning_rate = 0.01

optimizer = optim.SGD(
    model_classifier.parameters(),
    learning_rate
)

In [260]:
num_epochs = 20
losses = []
for epoch in range(num_epochs):
    epoch_loss = []
    for x_train, y_train in zip(X_train, Y_train):
        y_pred = model_classifier(x_train)
        y_train = y_train.to(device)
        loss = loss_fn(y_pred, y_train)
        epoch_loss.append(loss.item())

        optimizer.zero_grad() # clear the gradients of all optimized variables if we have updated the parameters in the previous section
        loss.backward()
        optimizer.step()
    losses.append(sum(epoch_loss)/len(epoch_loss))

In [261]:
losses

[0.904187036270187,
 0.7107491010001727,
 0.5910129173525742,
 0.5117832063209443,
 0.4535236190649725,
 0.4069728232387986,
 0.3682657722915922,
 0.33549302740111236,
 0.30784283644918886,
 0.2846665474275748,
 0.2652486963330635,
 0.24882603074823106,
 0.23482427697717434,
 0.22232241018763965,
 0.21136617539894012,
 0.20163773409578772,
 0.19278983989220466,
 0.18436377856858252,
 0.1766133730310858,
 0.1695677328372507]

In [262]:
# `torch.no_grad()` is a context manager that disables gradient calculation
with torch.no_grad():
    X_test = X_test.to(device)
    Y_pred = model_classifier(X_test)

In [263]:
Y_pred

tensor([[-1.0963,  1.5614, -0.9895],
        [-2.1194,  0.7168,  0.6181],
        [ 4.7627,  0.1898, -5.0888],
        [-1.6927,  2.3681, -1.1949],
        [-0.5753,  2.8401, -2.5014],
        [ 6.0964, -0.9717, -5.3072],
        [ 5.0336, -0.3957, -4.8429],
        [ 5.4586, -0.6178, -5.0506],
        [-0.5218,  3.3761, -2.9874],
        [ 5.1631, -1.0240, -4.4397],
        [-2.2655,  1.0153,  0.4578],
        [-4.0731, -0.1395,  2.8201],
        [-4.4146, -0.2458,  3.1632],
        [ 4.4992, -0.0673, -4.6188],
        [-4.2861,  0.5373,  2.3922],
        [-4.7584, -0.0575,  3.2406],
        [-3.5483,  0.7160,  1.6965],
        [-4.2253,  0.1123,  2.7076],
        [ 4.8310,  0.3315, -5.2399],
        [-0.4896,  1.7785, -1.6750],
        [-2.1847,  1.2028,  0.1813],
        [-4.7691, -0.3561,  3.5196],
        [-4.1114,  0.4197,  2.3519],
        [-4.6119, -0.2812,  3.3385],
        [ 4.9594,  0.8566, -5.7676],
        [-5.4098,  0.1226,  3.5620],
        [ 4.4148, -0.3923, -4.2575],
 

In [264]:
Y_pred = torch.argmax(Y_pred, dim=1)

In [265]:
Y_pred

tensor([1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2, 2, 0, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2,
        0, 2, 0, 1, 1, 0, 2, 0, 0, 2, 0, 2, 0, 1, 0, 0, 1, 2, 0, 1, 0],
       device='cuda:0')

In [268]:
Y_test = Y_test.to(device)
sum(Y_pred == Y_test)/len(Y_test)

tensor(0.9778, device='cuda:0')