<h2> Numpy </h2>

In [2]:
import numpy as np

In [6]:
# Fully connected network with a single hidden layer: sizes, data and weights

# N     : batch size (number of samples, i.e. rows of x and y)
# D_in  : input dimension
# H     : number of nodes in the hidden layer
# D_out : output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, one row per sample
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialized weights: input -> hidden, then hidden -> output
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [9]:
learning_rate = 1e-6
epochs = 500
for epoch in range(epochs):

    # Forward pass: linear -> ReLU -> linear
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss: sum of squared errors over every sample and output
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(f'Epoch: {epoch}, Loss: {loss}')

    # Backward pass: chain rule applied by hand, output layer first
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative
    negative_input = h < 0
    grad_h = grad_h_relu.copy()
    grad_h[negative_input] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step (rebinds w1/w2 to new arrays)
    w1 = w1 - learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2

print()
print("Trainging Completed!")

Epoch: 0, Loss: 29226601.696511146
Epoch: 1, Loss: 27775230.500787932
Epoch: 2, Loss: 32901877.885324012
Epoch: 3, Loss: 39359838.92943065
Epoch: 4, Loss: 40644664.84219085
Epoch: 5, Loss: 31936544.962200645
Epoch: 6, Loss: 18575853.506086703
Epoch: 7, Loss: 8502076.799694864
Epoch: 8, Loss: 3760055.349203526
Epoch: 9, Loss: 1915213.3585328672
Epoch: 10, Loss: 1208211.6219417704
Epoch: 11, Loss: 893834.0018445372
Epoch: 12, Loss: 720143.9245465131
Epoch: 13, Loss: 603542.5081034406
Epoch: 14, Loss: 515443.9639568868
Epoch: 15, Loss: 444762.2620040415
Epoch: 16, Loss: 386480.6724243282
Epoch: 17, Loss: 337636.38506803394
Epoch: 18, Loss: 296337.26443553646
Epoch: 19, Loss: 261147.31038847662
Epoch: 20, Loss: 230979.2963074111
Epoch: 21, Loss: 204996.64103241754
Epoch: 22, Loss: 182530.0721639802
Epoch: 23, Loss: 163000.59182206675
Epoch: 24, Loss: 145972.37122345876
Epoch: 25, Loss: 131074.5313573772
Epoch: 26, Loss: 117986.3323500847
Epoch: 27, Loss: 106454.59454623495
Epoch: 28, Loss:

Epoch: 243, Loss: 0.8289529182243793
Epoch: 244, Loss: 0.7944271599077452
Epoch: 245, Loss: 0.7613712186160564
Epoch: 246, Loss: 0.729715333463439
Epoch: 247, Loss: 0.6994165501277179
Epoch: 248, Loss: 0.6703832398199424
Epoch: 249, Loss: 0.6425836700181473
Epoch: 250, Loss: 0.6159533986347185
Epoch: 251, Loss: 0.590451616973203
Epoch: 252, Loss: 0.566016653611197
Epoch: 253, Loss: 0.542617261529568
Epoch: 254, Loss: 0.5202015526477977
Epoch: 255, Loss: 0.49872603316035274
Epoch: 256, Loss: 0.4781570053515941
Epoch: 257, Loss: 0.45845268091571756
Epoch: 258, Loss: 0.4395714738660234
Epoch: 259, Loss: 0.4214985724494855
Epoch: 260, Loss: 0.40416561059112577
Epoch: 261, Loss: 0.38755712875159726
Epoch: 262, Loss: 0.37164754177049397
Epoch: 263, Loss: 0.3564009431432592
Epoch: 264, Loss: 0.34178772329695384
Epoch: 265, Loss: 0.32778363530547516
Epoch: 266, Loss: 0.3143624893240956
Epoch: 267, Loss: 0.30150176809017076
Epoch: 268, Loss: 0.28917633141506716
Epoch: 269, Loss: 0.2773611638203

Epoch: 489, Loss: 4.3529340061007974e-05
Epoch: 490, Loss: 4.1873078149851645e-05
Epoch: 491, Loss: 4.0279957626067255e-05
Epoch: 492, Loss: 3.874858737443376e-05
Epoch: 493, Loss: 3.7276119435256946e-05
Epoch: 494, Loss: 3.5858513545025435e-05
Epoch: 495, Loss: 3.4494881633981314e-05
Epoch: 496, Loss: 3.3183281085572966e-05
Epoch: 497, Loss: 3.1921805010899e-05
Epoch: 498, Loss: 3.070827813639126e-05
Epoch: 499, Loss: 2.9541063889293284e-05

Trainging Completed!


<h2> Pytorch </h2>

In [10]:
import torch

In [12]:
# Same two-layer network as the NumPy version, with torch tensors and a
# hand-written backward pass (autograd is not used here).
dtype = torch.float
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch size, input dimension, hidden nodes, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialized weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
epochs = 500

for epoch in range(epochs):

    # Forward pass: linear -> ReLU -> linear
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, pulled out as a Python float for printing
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(f'Epoch: {epoch}, Loss: {loss}')

    # Backward pass: gradients derived by hand, output layer first
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative
    negative_input = h < 0
    grad_h = grad_h_relu.clone()
    grad_h[negative_input] = 0
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step (rebinds w1/w2 to new tensors)
    w1 = w1 - learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2

print()
print("Trainging Completed!")

Epoch: 0, Loss: 30065432.0
Epoch: 1, Loss: 24946660.0
Epoch: 2, Loss: 22583212.0
Epoch: 3, Loss: 19880720.0
Epoch: 4, Loss: 16071007.0
Epoch: 5, Loss: 11673027.0
Epoch: 6, Loss: 7813670.5
Epoch: 7, Loss: 4992822.5
Epoch: 8, Loss: 3195684.75
Epoch: 9, Loss: 2114841.0
Epoch: 10, Loss: 1477107.125
Epoch: 11, Loss: 1091089.375
Epoch: 12, Loss: 846850.4375
Epoch: 13, Loss: 682799.75
Epoch: 14, Loss: 565926.875
Epoch: 15, Loss: 478260.09375
Epoch: 16, Loss: 409849.0
Epoch: 17, Loss: 354675.25
Epoch: 18, Loss: 309195.65625
Epoch: 19, Loss: 271094.71875
Epoch: 20, Loss: 238801.359375
Epoch: 21, Loss: 211180.796875
Epoch: 22, Loss: 187394.15625
Epoch: 23, Loss: 166797.4375
Epoch: 24, Loss: 148877.828125
Epoch: 25, Loss: 133222.78125
Epoch: 26, Loss: 119481.8828125
Epoch: 27, Loss: 107394.4375
Epoch: 28, Loss: 96718.015625
Epoch: 29, Loss: 87265.1796875
Epoch: 30, Loss: 78892.8203125
Epoch: 31, Loss: 71460.0
Epoch: 32, Loss: 64831.4296875
Epoch: 33, Loss: 58903.3671875
Epoch: 34, Loss: 53590.867

Epoch: 302, Loss: 0.012874260544776917
Epoch: 303, Loss: 0.01229324284940958
Epoch: 304, Loss: 0.011737623251974583
Epoch: 305, Loss: 0.011210412718355656
Epoch: 306, Loss: 0.010706169530749321
Epoch: 307, Loss: 0.010224826633930206
Epoch: 308, Loss: 0.009771803393959999
Epoch: 309, Loss: 0.009332502260804176
Epoch: 310, Loss: 0.008913283236324787
Epoch: 311, Loss: 0.008516935631632805
Epoch: 312, Loss: 0.008139193058013916
Epoch: 313, Loss: 0.007786821573972702
Epoch: 314, Loss: 0.007447198033332825
Epoch: 315, Loss: 0.007119754794985056
Epoch: 316, Loss: 0.006801928393542767
Epoch: 317, Loss: 0.006504198536276817
Epoch: 318, Loss: 0.00622045760974288
Epoch: 319, Loss: 0.005949618294835091
Epoch: 320, Loss: 0.005696217529475689
Epoch: 321, Loss: 0.005445469170808792
Epoch: 322, Loss: 0.005211887415498495
Epoch: 323, Loss: 0.004988144151866436
Epoch: 324, Loss: 0.004773580469191074
Epoch: 325, Loss: 0.004569935146719217
Epoch: 326, Loss: 0.004375319927930832
Epoch: 327, Loss: 0.0041891

<h2> Autograd </h2>

In [13]:
# Two-layer network trained with autograd: only the forward pass is written
# out, and loss.backward() fills in w1.grad and w2.grad.
dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden nodes, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (no gradients needed for these)
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so autograd tracks them
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
epochs = 500
for epoch in range(epochs):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear
    hidden = x.mm(w1).clamp(min=0)
    y_pred = hidden.mm(w2)

    # Sum of squared errors, kept as a tensor so it can be differentiated
    loss = (y_pred - y).pow(2).sum()
    print(f'Epoch: {epoch}, Loss: {loss}')

    loss.backward()

    # Update each weight in place with gradient tracking switched off, then
    # zero its gradient before the next epoch's backward().
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

Epoch: 0, Loss: 31235812.0
Epoch: 1, Loss: 26913306.0
Epoch: 2, Loss: 24943110.0
Epoch: 3, Loss: 21899448.0
Epoch: 4, Loss: 17133396.0
Epoch: 5, Loss: 11782619.0
Epoch: 6, Loss: 7369305.0
Epoch: 7, Loss: 4426737.0
Epoch: 8, Loss: 2703447.0
Epoch: 9, Loss: 1744786.25
Epoch: 10, Loss: 1208051.125
Epoch: 11, Loss: 894194.25
Epoch: 12, Loss: 697828.5625
Epoch: 13, Loss: 565597.0
Epoch: 14, Loss: 470377.8125
Epoch: 15, Loss: 398110.90625
Epoch: 16, Loss: 341076.5
Epoch: 17, Loss: 294913.4375
Epoch: 18, Loss: 256801.765625
Epoch: 19, Loss: 224878.53125
Epoch: 20, Loss: 197898.296875
Epoch: 21, Loss: 174912.71875
Epoch: 22, Loss: 155183.265625
Epoch: 23, Loss: 138141.078125
Epoch: 24, Loss: 123324.59375
Epoch: 25, Loss: 110422.7734375
Epoch: 26, Loss: 99133.25
Epoch: 27, Loss: 89219.609375
Epoch: 28, Loss: 80486.4609375
Epoch: 29, Loss: 72765.4609375
Epoch: 30, Loss: 65922.359375
Epoch: 31, Loss: 59837.7265625
Epoch: 32, Loss: 54412.5234375
Epoch: 33, Loss: 49565.515625
Epoch: 34, Loss: 45227

Epoch: 264, Loss: 0.20005318522453308
Epoch: 265, Loss: 0.1910528838634491
Epoch: 266, Loss: 0.1825190931558609
Epoch: 267, Loss: 0.17434556782245636
Epoch: 268, Loss: 0.16649800539016724
Epoch: 269, Loss: 0.1590399444103241
Epoch: 270, Loss: 0.15191136300563812
Epoch: 271, Loss: 0.14507660269737244
Epoch: 272, Loss: 0.13858206570148468
Epoch: 273, Loss: 0.13239049911499023
Epoch: 274, Loss: 0.12647323310375214
Epoch: 275, Loss: 0.12082572281360626
Epoch: 276, Loss: 0.11542566865682602
Epoch: 277, Loss: 0.110251285135746
Epoch: 278, Loss: 0.10533180832862854
Epoch: 279, Loss: 0.10061535984277725
Epoch: 280, Loss: 0.09611997753381729
Epoch: 281, Loss: 0.09183663129806519
Epoch: 282, Loss: 0.08771681785583496
Epoch: 283, Loss: 0.08378297835588455
Epoch: 284, Loss: 0.08001967519521713
Epoch: 285, Loss: 0.07646447420120239
Epoch: 286, Loss: 0.07305697351694107
Epoch: 287, Loss: 0.0698089599609375
Epoch: 288, Loss: 0.06668955087661743
Epoch: 289, Loss: 0.06372467428445816
Epoch: 290, Loss: 

<h2> Defining Custom Autograd Functions </h2>

In [15]:
class CustomReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function with an explicit backward."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at 0.

        ``ctx`` is the context object shared with ``backward``; the input is
        saved on it because the backward pass needs to know its sign.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        ``grad_output`` is the gradient of the loss with respect to the
        forward output. It is passed through unchanged, except that it is set
        to 0 wherever the saved input was negative.
        """
        saved_input, = ctx.saved_tensors
        negative_input = saved_input < 0
        grad_input = grad_output.clone()
        grad_input[negative_input] = 0
        return grad_input

# Train the two-layer network with CustomReLU as the hidden activation.
dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden nodes, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so loss.backward() fills .grad
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
epochs = 500

# A custom Function is called through its .apply attribute. The alias is the
# same on every iteration, so bind it once instead of once per epoch.
relu = CustomReLU.apply

for epoch in range(epochs):
    # Forward pass: linear -> CustomReLU -> linear
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum of squared errors
    loss = (y_pred - y).pow(2).sum()
    print(f'Epoch: {epoch}, Loss: {loss}')

    loss.backward()

    # Update the weights in place with gradient tracking switched off
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Zero the gradients before the next epoch's backward()
        w1.grad.zero_()
        w2.grad.zero_()

# Alternatives to the manual update above (neither is set up in this cell;
# both need a `model` / `optimizer` object to be created first):
#
#   * with the weights held by an nn.Module:
#         with torch.no_grad():
#             for param in model.parameters():
#                 param -= learning_rate * param.grad
#
#   * with an optimizer from torch.optim (there is no `nn.optim`):
#         optimizer.step()
        

Epoch: 0, Loss: 27258074.0
Epoch: 1, Loss: 20576538.0
Epoch: 2, Loss: 17832498.0
Epoch: 3, Loss: 16163392.0
Epoch: 4, Loss: 14350665.0
Epoch: 5, Loss: 12047126.0
Epoch: 6, Loss: 9442508.0
Epoch: 7, Loss: 6960259.0
Epoch: 8, Loss: 4910156.5
Epoch: 9, Loss: 3392632.75
Epoch: 10, Loss: 2344530.0
Epoch: 11, Loss: 1647910.375
Epoch: 12, Loss: 1190799.5
Epoch: 13, Loss: 889748.0
Epoch: 14, Loss: 687722.875
Epoch: 15, Loss: 548196.625
Epoch: 16, Loss: 448501.9375
Epoch: 17, Loss: 374679.5625
Epoch: 18, Loss: 318117.75
Epoch: 19, Loss: 273523.15625
Epoch: 20, Loss: 237490.359375
Epoch: 21, Loss: 207735.40625
Epoch: 22, Loss: 182765.703125
Epoch: 23, Loss: 161584.09375
Epoch: 24, Loss: 143412.765625
Epoch: 25, Loss: 127694.7734375
Epoch: 26, Loss: 114030.2109375
Epoch: 27, Loss: 102091.828125
Epoch: 28, Loss: 91599.421875
Epoch: 29, Loss: 82349.0
Epoch: 30, Loss: 74167.2890625
Epoch: 31, Loss: 66908.8125
Epoch: 32, Loss: 60455.609375
Epoch: 33, Loss: 54704.98046875
Epoch: 34, Loss: 49571.175781

Epoch: 257, Loss: 0.029976539313793182
Epoch: 258, Loss: 0.028365271165966988
Epoch: 259, Loss: 0.026823660358786583
Epoch: 260, Loss: 0.025384051725268364
Epoch: 261, Loss: 0.024018622934818268
Epoch: 262, Loss: 0.022726718336343765
Epoch: 263, Loss: 0.02151150442659855
Epoch: 264, Loss: 0.02034764364361763
Epoch: 265, Loss: 0.019266806542873383
Epoch: 266, Loss: 0.01823907159268856
Epoch: 267, Loss: 0.01726582646369934
Epoch: 268, Loss: 0.016346994787454605
Epoch: 269, Loss: 0.015473298728466034
Epoch: 270, Loss: 0.014658837579190731
Epoch: 271, Loss: 0.013873693533241749
Epoch: 272, Loss: 0.013134714215993881
Epoch: 273, Loss: 0.01243154052644968
Epoch: 274, Loss: 0.011777171865105629
Epoch: 275, Loss: 0.011159149929881096
Epoch: 276, Loss: 0.010571870021522045
Epoch: 277, Loss: 0.010015539824962616
Epoch: 278, Loss: 0.00948293786495924
Epoch: 279, Loss: 0.008994445204734802
Epoch: 280, Loss: 0.008525308221578598
Epoch: 281, Loss: 0.008082233369350433
Epoch: 282, Loss: 0.00765796704

Epoch: 471, Loss: 3.3638192689977586e-05
Epoch: 472, Loss: 3.324737190268934e-05
Epoch: 473, Loss: 3.295196802355349e-05
Epoch: 474, Loss: 3.249936708016321e-05
Epoch: 475, Loss: 3.2207037293119356e-05
Epoch: 476, Loss: 3.193011798430234e-05
Epoch: 477, Loss: 3.15823926939629e-05
Epoch: 478, Loss: 3.121673580608331e-05
Epoch: 479, Loss: 3.07527334371116e-05
Epoch: 480, Loss: 3.04875684378203e-05
Epoch: 481, Loss: 3.0420123948715627e-05
Epoch: 482, Loss: 3.0011771741556004e-05
Epoch: 483, Loss: 2.9677661586902104e-05
Epoch: 484, Loss: 2.927231980720535e-05
Epoch: 485, Loss: 2.8924987418577075e-05
Epoch: 486, Loss: 2.857997787941713e-05
Epoch: 487, Loss: 2.828560354828369e-05
Epoch: 488, Loss: 2.7960633815382607e-05
Epoch: 489, Loss: 2.775582470349036e-05
Epoch: 490, Loss: 2.7568728910409845e-05
Epoch: 491, Loss: 2.730672713369131e-05
Epoch: 492, Loss: 2.695326111279428e-05
Epoch: 493, Loss: 2.6647006961866282e-05
Epoch: 494, Loss: 2.6458346837898716e-05
Epoch: 495, Loss: 2.6116176741197