In [22]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim

In [2]:
df = pd.read_csv("Prodigy University Dataset.csv")
df.head()

Unnamed: 0,sat_sum,hs_gpa,fy_gpa
0,508,3.4,3.18
1,488,4.0,3.33
2,464,3.75,3.25
3,380,3.75,2.42
4,428,4.0,2.63


In [3]:
X = df[["sat_sum","hs_gpa"]].values
y = df["fy_gpa"].values.reshape(-1,1)

In [4]:
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [6]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype= torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [12]:
def create_model():
    model = nn.Sequential(nn.Linear(2,2),
                      nn.Sigmoid(),
                      nn.Linear(2,1))
    return model
model = create_model()
criteria = nn.MSELoss()
preds = model(X_train_tensor)
initial_loss = criteria(preds,y_train_tensor)
print(initial_loss)

tensor(5.3856, grad_fn=<MseLossBackward0>)


In [13]:
model[0].weight
model[2].weight

Parameter containing:
tensor([[-0.1314, -0.2272]], requires_grad=True)

### Single epoch of SGD optimizer manually

In [14]:
model = create_model()
optimizer = optim.SGD(model.parameters(),lr = 0.001)
initial_loss.backward()
optimizer.step()

model[0].weight
model[2].weight

Parameter containing:
tensor([[-0.3356, -0.4660]], requires_grad=True)

## Model without training

In [23]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
## performance of train and test before model training
train_loss = criteria(model(X_train_tensor), y_train_tensor)
test_loss = criteria(model(X_test_tensor),y_test_tensor)
print(f"Model without training: \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Model without training: 
 Train Loss : 5.8944 Test Loss : 6.1484


## Stochastic Gradient Descent

In [24]:
train_loader = DataLoader(train_data, batch_size=1, shuffle = True)
for epoch in range(10):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    ## performance of train and test before model training
    train_loss = criteria(model(X_train_tensor), y_train_tensor)
    test_loss = criteria(model(X_test_tensor),y_test_tensor)
    print(f"Epoch : {epoch} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")
    
    

Epoch : 0 
 Train Loss : 0.5482 Test Loss : 0.6130
Epoch : 1 
 Train Loss : 0.4784 Test Loss : 0.5306
Epoch : 2 
 Train Loss : 0.4395 Test Loss : 0.4888
Epoch : 3 
 Train Loss : 0.4119 Test Loss : 0.4622
Epoch : 4 
 Train Loss : 0.3919 Test Loss : 0.4404
Epoch : 5 
 Train Loss : 0.3785 Test Loss : 0.4284
Epoch : 6 
 Train Loss : 0.3692 Test Loss : 0.4176
Epoch : 7 
 Train Loss : 0.3634 Test Loss : 0.4114
Epoch : 8 
 Train Loss : 0.3586 Test Loss : 0.4084
Epoch : 9 
 Train Loss : 0.3557 Test Loss : 0.4081


## Batch Gradient Descent

In [26]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
train_loader = DataLoader(train_data, batch_size = 800, shuffle=True)#800 is no of train samples

for epoch in range(1000):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 100 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 0 
 Train Loss : 5.2699 Test Loss : 5.5114
Epoch : 100 
 Train Loss : 3.1497 Test Loss : 3.3442
Epoch : 200 
 Train Loss : 2.0168 Test Loss : 2.1771
Epoch : 300 
 Train Loss : 1.4075 Test Loss : 1.5427
Epoch : 400 
 Train Loss : 1.0785 Test Loss : 1.1953
Epoch : 500 
 Train Loss : 0.8998 Test Loss : 1.0029
Epoch : 600 
 Train Loss : 0.8014 Test Loss : 0.8945
Epoch : 700 
 Train Loss : 0.7460 Test Loss : 0.8316
Epoch : 800 
 Train Loss : 0.7135 Test Loss : 0.7936
Epoch : 900 
 Train Loss : 0.6932 Test Loss : 0.7692


## Mini batch gradient Descent

In [28]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.5167 Test Loss : 0.5904
Epoch : 100 
 Train Loss : 0.4342 Test Loss : 0.4893
Epoch : 150 
 Train Loss : 0.4112 Test Loss : 0.4639
Epoch : 200 
 Train Loss : 0.3947 Test Loss : 0.4473
Epoch : 250 
 Train Loss : 0.3826 Test Loss : 0.4354
Epoch : 300 
 Train Loss : 0.3737 Test Loss : 0.4268
Epoch : 350 
 Train Loss : 0.3670 Test Loss : 0.4206
Epoch : 400 
 Train Loss : 0.3621 Test Loss : 0.4162
Epoch : 450 
 Train Loss : 0.3585 Test Loss : 0.4128
Epoch : 500 
 Train Loss : 0.3557 Test Loss : 0.4104


## Stochastic Gradient Descent with momentum
Approach : Finding moving average of gradients
* Accelerates SGD
* Dampens the turbulence
* Special used when a loss function has multiple local minima but one global minima
* Helps in arriving global minima
* w_new = w_old - learning_rate * exponential moving average

In [29]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
# Added Momentum of 0.9
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9)
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.3579 Test Loss : 0.4170
Epoch : 100 
 Train Loss : 0.3483 Test Loss : 0.4077
Epoch : 150 
 Train Loss : 0.3463 Test Loss : 0.4054
Epoch : 200 
 Train Loss : 0.3451 Test Loss : 0.4042
Epoch : 250 
 Train Loss : 0.3443 Test Loss : 0.4032
Epoch : 300 
 Train Loss : 0.3436 Test Loss : 0.4026
Epoch : 350 
 Train Loss : 0.3431 Test Loss : 0.4025
Epoch : 400 
 Train Loss : 0.3426 Test Loss : 0.4023
Epoch : 450 
 Train Loss : 0.3423 Test Loss : 0.4019
Epoch : 500 
 Train Loss : 0.3420 Test Loss : 0.4021


## Nesterov Momentum

Refines momentum by looking at gradient of future steps
Set nestrov = True in optimizer along with momentum of 0.9

In [30]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9, nesterov=True)
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.3619 Test Loss : 0.4183
Epoch : 100 
 Train Loss : 0.3493 Test Loss : 0.4071
Epoch : 150 
 Train Loss : 0.3470 Test Loss : 0.4051
Epoch : 200 
 Train Loss : 0.3457 Test Loss : 0.4039
Epoch : 250 
 Train Loss : 0.3448 Test Loss : 0.4028
Epoch : 300 
 Train Loss : 0.3440 Test Loss : 0.4027
Epoch : 350 
 Train Loss : 0.3434 Test Loss : 0.4019
Epoch : 400 
 Train Loss : 0.3429 Test Loss : 0.4015
Epoch : 450 
 Train Loss : 0.3424 Test Loss : 0.4013
Epoch : 500 
 Train Loss : 0.3421 Test Loss : 0.4011


## AdaGrad : 
Adaptive Gradient Descent uses different learning rates for each iteration in updating weights based on historical gradients.
* Parameters with infrequent updates --> Bigger updates
* Parameters with frequent updates --> Smaller updates
* Used in Sparse Datasets like Image and text datasets
* Problem: It my reduce learning rates aggressively resulting in sub-optimal training.

In [31]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
# Added Momentum of 0.9
optimizer = optim.Adagrad(model.parameters())
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.8502 Test Loss : 0.9670
Epoch : 100 
 Train Loss : 0.4995 Test Loss : 0.5887
Epoch : 150 
 Train Loss : 0.3973 Test Loss : 0.4714
Epoch : 200 
 Train Loss : 0.3662 Test Loss : 0.4319
Epoch : 250 
 Train Loss : 0.3564 Test Loss : 0.4176
Epoch : 300 
 Train Loss : 0.3531 Test Loss : 0.4120
Epoch : 350 
 Train Loss : 0.3518 Test Loss : 0.4094
Epoch : 400 
 Train Loss : 0.3512 Test Loss : 0.4081
Epoch : 450 
 Train Loss : 0.3509 Test Loss : 0.4074
Epoch : 500 
 Train Loss : 0.3506 Test Loss : 0.4071


## RMSProp- Root Mean Square Propagation
Accelerates the optimization process by reducing the number of updates needed to reach the minima. jUST like Momentum. But, if a parameter loss function oscillates the prediction we should penalize the model.

In [32]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
# Added Momentum of 0.9
optimizer = optim.RMSprop(model.parameters())
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.3420 Test Loss : 0.4021
Epoch : 100 
 Train Loss : 0.3402 Test Loss : 0.3992
Epoch : 150 
 Train Loss : 0.3397 Test Loss : 0.4012
Epoch : 200 
 Train Loss : 0.3404 Test Loss : 0.3994
Epoch : 250 
 Train Loss : 0.3389 Test Loss : 0.3999
Epoch : 300 
 Train Loss : 0.3396 Test Loss : 0.4038
Epoch : 350 
 Train Loss : 0.3388 Test Loss : 0.4006
Epoch : 400 
 Train Loss : 0.3387 Test Loss : 0.4005
Epoch : 450 
 Train Loss : 0.3387 Test Loss : 0.4017
Epoch : 500 
 Train Loss : 0.3390 Test Loss : 0.4000


## ADAM Optimizer
Adaptive movement Estimation is a combination of RMSProp and Momentum.

In [33]:
train_data = TensorDataset(X_train_tensor,y_train_tensor)
model = create_model()
# Added Momentum of 0.9
optimizer = optim.Adam(model.parameters())
train_loader = DataLoader(train_data, batch_size = 64, shuffle=True)#800 is no of train samples

for epoch in range(500):
    for X_batch,y_batch in train_loader:
        #forward pass
        pred = model(X_batch)
        loss = criteria(pred,y_batch)
        
        #backward pass and optimization
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 50 == 0:
        ## performance of train and test before model training
        train_loss = criteria(model(X_train_tensor), y_train_tensor)
        test_loss = criteria(model(X_test_tensor),y_test_tensor)
        print(f"Epoch : {epoch+1} \n Train Loss : {train_loss:0.4f} Test Loss : {test_loss:0.4f}")

Epoch : 50 
 Train Loss : 0.9142 Test Loss : 1.0298
Epoch : 100 
 Train Loss : 0.3704 Test Loss : 0.4317
Epoch : 150 
 Train Loss : 0.3569 Test Loss : 0.4103
Epoch : 200 
 Train Loss : 0.3554 Test Loss : 0.4090
Epoch : 250 
 Train Loss : 0.3536 Test Loss : 0.4078
Epoch : 300 
 Train Loss : 0.3517 Test Loss : 0.4063
Epoch : 350 
 Train Loss : 0.3496 Test Loss : 0.4054
Epoch : 400 
 Train Loss : 0.3477 Test Loss : 0.4044
Epoch : 450 
 Train Loss : 0.3461 Test Loss : 0.4031
Epoch : 500 
 Train Loss : 0.3448 Test Loss : 0.4027
