# Fundamentals of PyTorch

In [None]:
import torch

In [None]:
X = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
X

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [None]:
X.shape

torch.Size([2, 3])

In [None]:
X.dtype

torch.float32

In [None]:
X[0, 2]

tensor(3.)

In [None]:
print(X.mean())
print(X.sum())
print(X.std())
print(X.abs())
print(X.argmax())
print(X.exp())
print(X.sqrt())

tensor(3.5000)
tensor(21.)
tensor(1.8708)
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor(5)
tensor([[  2.7183,   7.3891,  20.0855],
        [ 54.5981, 148.4132, 403.4288]])
tensor([[1.0000, 1.4142, 1.7321],
        [2.0000, 2.2361, 2.4495]])


In [None]:
10 * (X + 3)

tensor([[40., 50., 60.],
        [70., 80., 90.]])

In [None]:
X.T @ X

tensor([[17., 22., 27.],
        [22., 29., 36.],
        [27., 36., 45.]])

### Converting Between PyTorch Tensors and NumPy Arrays

In [None]:
import numpy as np
X.numpy()

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

In [None]:
Y = np.array([[77, 88, 99], [10, 11, 12]])
Y

array([[77, 88, 99],
       [10, 11, 12]])

In [None]:
torch.tensor(Y) # the dtype is inherited from the NumPy array: Y holds integers, so this is an integer tensor (a NumPy float array would give float64)

tensor([[77, 88, 99],
        [10, 11, 12]])

In [None]:
torch.FloatTensor(Y) # specify float32

tensor([[77., 88., 99.],
        [10., 11., 12.]])

### Inplace Operations

In [None]:
X.relu_()

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [None]:
X.sqrt_()

tensor([[1.0000, 1.4142, 1.7321],
        [2.0000, 2.2361, 2.4495]])

In [None]:
X.zero_()

tensor([[0., 0., 0.],
        [0., 0., 0.]])

### Hardware Acceleration

In [None]:
# Pick the fastest available backend: NVIDIA CUDA first, then Apple MPS,
# and fall back to the CPU when neither accelerator is present.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

print(f'Using device: {device}')

Using device: cuda


Move a tensor from the CPU to the selected device (the GPU, when one is available)

In [None]:
M = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
M = M.to(device)
M.device

device(type='cuda', index=0)

In [None]:
# Directly create tensor on GPU
M = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32, device=device)
M.device

device(type='cuda', index=0)

In [None]:
R = M @ M.T
R # the result tensor is also on the GPU

tensor([[14., 32.],
        [32., 77.]], device='cuda:0')

In [None]:
K = torch.randn((1000, 1000))
%timeit K @ K.T


15.5 ms ± 3.25 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# run this on gpu or google colab with gpu enabled
K = torch.randn((1000, 1000), device='cuda')
%timeit K @ K.T


647 µs ± 12.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### AutoGrad

In [None]:
z = torch.tensor(5.0, requires_grad=True)
f = z**3 + z**2 + z
f

tensor(155., grad_fn=<AddBackward0>)

In [None]:
f.backward()
z.grad

tensor(86.)

In [None]:
# for sin function + exponent
y = torch.tensor(2.0, requires_grad=True)
f = 2 * torch.sin(y) + 10 * torch.exp(y)
f

tensor(75.7092, grad_fn=<AddBackward0>)

In [None]:
f.backward()
# f was built from y, so its gradient is stored on y (z.grad still holds the
# result of the previous example). Analytically: 2*cos(y) + 10*exp(y).
y.grad

tensor(86.)

In [None]:
x = torch.tensor(5.0, requires_grad=True)
f = x ** 2
f


tensor(25., grad_fn=<PowBackward0>)

In [None]:
f.backward()
x.grad

tensor(10.)

In [None]:
learning_rate = 0.1
with torch.no_grad():
    # Update in place so that x remains the same leaf tensor with
    # requires_grad=True. Writing `x = x - ...` here would rebind x to a brand
    # new tensor that no longer tracks gradients, making further steps impossible.
    x -= learning_rate * x.grad
x

tensor(4.)

In [None]:
x = torch.tensor(5.0, requires_grad=True)
f = x ** 2
f.backward()
# detach() returns a tensor that shares x's storage but is cut off from the
# autograd graph, so an in-place update on it performs the gradient step on x
# itself (an out-of-place `x_detached = x_detached - ...` would leave x untouched).
x_detached = x.detach()
x_detached -= learning_rate * x.grad
x_detached

tensor(4.)

In [None]:
learning_rate = 0.1
x = torch.tensor(5.0, requires_grad=True)

# Ten steps of plain gradient descent on f(x) = x**2, starting from x = 5.
for i in range(10):
    f = x.pow(2)
    f.backward()  # fills x.grad with df/dx
    with torch.no_grad():
        # In-place step, hidden from autograd so x stays a leaf tensor
        x.sub_(learning_rate * x.grad)
        # Gradients accumulate across backward() calls, so clear them each step
        x.grad.zero_()
    print(f'Iteration {i+1}: x = {x.item()}, f(x) = {f.item()}')

x

Iteration 1: x = 4.0, f(x) = 25.0
Iteration 2: x = 3.200000047683716, f(x) = 16.0
Iteration 3: x = 2.559999942779541, f(x) = 10.24000072479248
Iteration 4: x = 2.047999858856201, f(x) = 6.553599834442139
Iteration 5: x = 1.6383998394012451, f(x) = 4.194303512573242
Iteration 6: x = 1.3107198476791382, f(x) = 2.684354066848755
Iteration 7: x = 1.0485758781433105, f(x) = 1.7179864645004272
Iteration 8: x = 0.8388606905937195, f(x) = 1.0995113849639893
Iteration 9: x = 0.6710885763168335, f(x) = 0.7036872506141663
Iteration 10: x = 0.5368708372116089, f(x) = 0.45035988092422485


tensor(0.5369, requires_grad=True)

### Linear Regression Using Tensor and AutoGrad - Low Level API

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test =  train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

In [None]:
# Turn the three NumPy feature matrices into float32 tensors
X_train, X_valid, X_test = (
    torch.FloatTensor(split) for split in (X_train, X_valid, X_test)
)

# Per-feature mean and standard deviation, taken from the training set only
means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)

# Standardize every split with the training-set statistics
X_train, X_valid, X_test = (
    (split - means) / stds for split in (X_train, X_valid, X_test)
)

In [None]:
# Targets become float32 column vectors (one row per sample, a single column)
y_train, y_test, y_valid = (
    torch.FloatTensor(target.reshape(-1, 1))
    for target in (y_train, y_test, y_valid)
)

In [None]:
torch.manual_seed(42)
n_features = X_train.shape[1]
# initiazlize the weights and bias
w = torch.randn((n_features,1), requires_grad=True)
b = torch.tensor(0., requires_grad=True)


In [None]:
# Full-batch gradient descent on the mean squared error
learning_rate = 0.4
n_epochs = 20

for epoch in range(n_epochs):
    # Forward pass over the whole training set, then the MSE loss
    y_pred = X_train @ w + b
    loss = (y_pred - y_train).pow(2).mean()
    loss.backward()
    with torch.no_grad():
        for param in (b, w):
            param -= learning_rate * param.grad  # in-place gradient step
            param.grad.zero_()                   # gradients accumulate; reset them
    print(f'Epoch {epoch + 1} / {n_epochs}, Loss: {loss.item()} ')

Epoch 1 / 20, Loss: 16.006189346313477 
Epoch 2 / 20, Loss: 4.656647682189941 
Epoch 3 / 20, Loss: 2.1048688888549805 
Epoch 4 / 20, Loss: 1.2392672300338745 
Epoch 5 / 20, Loss: 0.9124190211296082 
Epoch 6 / 20, Loss: 0.777962327003479 
Epoch 7 / 20, Loss: 0.7152509689331055 
Epoch 8 / 20, Loss: 0.6805717349052429 
Epoch 9 / 20, Loss: 0.6576955318450928 
Epoch 10 / 20, Loss: 0.6404300928115845 
Epoch 11 / 20, Loss: 0.6263095140457153 
Epoch 12 / 20, Loss: 0.6142741441726685 
Epoch 13 / 20, Loss: 0.6038088798522949 
Epoch 14 / 20, Loss: 0.5946188569068909 
Epoch 15 / 20, Loss: 0.5865060687065125 
Epoch 16 / 20, Loss: 0.5793203711509705 
Epoch 17 / 20, Loss: 0.5729406476020813 
Epoch 18 / 20, Loss: 0.5672647356987 
Epoch 19 / 20, Loss: 0.5622054934501648 
Epoch 20 / 20, Loss: 0.5576879382133484 


In [None]:
# make predications
X_new = X_test[:3]
with torch.no_grad():
    y_pred = X_new @ w + b

y_pred

tensor([[0.9118],
        [1.6231],
        [2.6630]])

Implement Linear Regression through High Level API

In [None]:
import torch.nn as nn

torch.manual_seed(42)
model = nn.Linear(in_features=n_features, out_features=1)

In [None]:
# Model Bias
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [None]:
# Model Weight
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [None]:
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [None]:
# named_parameters() yields (name, parameter) pairs
for name, param in model.named_parameters():
    print(name, ' => ', param)

weight  =>  Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
bias  =>  Parameter containing:
tensor([0.3117], requires_grad=True)


In [None]:
# Call this model as normal function
model(X_train[:3])

tensor([[ 0.4296],
        [ 1.1455],
        [-0.2709]], grad_fn=<AddmmBackward0>)

In [None]:
# optimizers
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()

In [None]:
# train model function
def train_model(model, optimizer, criterion, X_train, y_train, n_epochs):
    """Fit `model` with full-batch gradient descent.

    Every epoch runs one forward pass over all of `X_train`, evaluates
    `criterion(predictions, y_train)`, back-propagates, applies a single
    optimizer step, clears the gradients and prints the epoch's loss.
    Returns nothing; the model's parameters are updated in place.
    """
    for epoch in range(n_epochs):
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()
        # Clear the gradients so the next backward() starts from zero
        optimizer.zero_grad()
        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}")

In [None]:
# call the train model function
train_model(model, optimizer,mse, X_train, y_train, n_epochs)

Epoch 1/20, Loss: 4.272577285766602
Epoch 2/20, Loss: 0.767361581325531
Epoch 3/20, Loss: 0.6151816248893738
Epoch 4/20, Loss: 0.5950987935066223
Epoch 5/20, Loss: 0.5839646458625793
Epoch 6/20, Loss: 0.5752211213111877
Epoch 7/20, Loss: 0.567899227142334
Epoch 8/20, Loss: 0.5616247057914734
Epoch 9/20, Loss: 0.5561909675598145
Epoch 10/20, Loss: 0.5514596700668335
Epoch 11/20, Loss: 0.5473267436027527
Epoch 12/20, Loss: 0.5437079668045044
Epoch 13/20, Loss: 0.5405330061912537
Epoch 14/20, Loss: 0.5377424359321594
Epoch 15/20, Loss: 0.535285234451294
Epoch 16/20, Loss: 0.5331177115440369
Epoch 17/20, Loss: 0.5312021970748901
Epoch 18/20, Loss: 0.529506504535675
Epoch 19/20, Loss: 0.5280026197433472
Epoch 20/20, Loss: 0.5266665816307068


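NOTE: The `nn.Linear` module initializes its parameters slightly
 differently from our manual version: it draws both the weights and the
 bias term from a uniform random distribution ranging from
 $-\frac{\sqrt{2}}{4}$ to $+\frac{\sqrt{2}}{4}$
 (that is, $\pm 1/\sqrt{n}$ with $n = 8$ input features here).
 We will discuss initialization methods later.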

In [None]:
# make predications
X_new = X_test[:3]
with torch.no_grad():
    y_pred = model(X_new)

y_pred

tensor([[0.8226],
        [1.6903],
        [2.6812]])

### Implementing a Regression MLP

In [None]:
import torch.nn as nn

torch.manual_seed(42)

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

In [None]:
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()
train_model(model, optimizer, mse, X_train, y_train, n_epochs)


Epoch 1/20, Loss: 4.974284648895264
Epoch 2/20, Loss: 2.04504132270813
Epoch 3/20, Loss: 1.0027456283569336
Epoch 4/20, Loss: 0.8672141432762146
Epoch 5/20, Loss: 0.7867145538330078
Epoch 6/20, Loss: 0.7360551953315735
Epoch 7/20, Loss: 0.7031640410423279
Epoch 8/20, Loss: 0.6811584234237671
Epoch 9/20, Loss: 0.6656913757324219
Epoch 10/20, Loss: 0.6541036367416382
Epoch 11/20, Loss: 0.6448375582695007
Epoch 12/20, Loss: 0.6369345784187317
Epoch 13/20, Loss: 0.6298699975013733
Epoch 14/20, Loss: 0.6233502626419067
Epoch 15/20, Loss: 0.6172212362289429
Epoch 16/20, Loss: 0.6113643050193787
Epoch 17/20, Loss: 0.6057479381561279
Epoch 18/20, Loss: 0.6003261208534241
Epoch 19/20, Loss: 0.5950582027435303
Epoch 20/20, Loss: 0.5899455547332764


In [None]:
with torch.no_grad():
    y_pred = model(X_new)

y_pred

tensor([[0.9459],
        [1.1934],
        [2.5791]])

### Implement Mini Batch Gradient Descent using DataLoader

In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)
model = model.to(device)

In [None]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
def train(model, optimizer_arg, criterion, train_loader, n_epochs):
  """Train `model` with mini-batch gradient descent.

  Args:
    model: the nn.Module to train; it is switched to training mode.
    optimizer_arg: the optimizer that performs the parameter updates. It is
      the optimizer actually used here (not any global `optimizer`).
    criterion: loss function, called as `criterion(y_pred, y_batch)`.
    train_loader: sized iterable yielding `(X_batch, y_batch)` tensor pairs.
    n_epochs: number of passes over `train_loader`.

  After each epoch, prints the mean of that epoch's per-batch losses.
  """
  model.train()
  # Send every batch to the device the model's parameters live on, so the
  # function does not depend on a notebook-level `device` variable.
  model_device = next(model.parameters()).device
  for epoch in range(n_epochs):
    total_loss = 0.
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.to(model_device)
      y_batch = y_batch.to(model_device)

      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)

      total_loss += loss.item()
      loss.backward()
      optimizer_arg.step()
      # Gradients accumulate across backward() calls; clear them per batch
      optimizer_arg.zero_grad()

    # Average of the per-batch losses (one value per batch in the loader)
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {mean_loss:.4f}")

In [None]:
train(model, optimizer, mse, train_loader, n_epochs)

Epoch 1/20, Loss: 0.5313
Epoch 2/20, Loss: 0.4240
Epoch 3/20, Loss: 0.4293
Epoch 4/20, Loss: 0.3916
Epoch 5/20, Loss: 0.3756
Epoch 6/20, Loss: 0.3721
Epoch 7/20, Loss: 0.3458
Epoch 8/20, Loss: 0.3423
Epoch 9/20, Loss: 0.3329
Epoch 10/20, Loss: 0.3306
Epoch 11/20, Loss: 0.3225
Epoch 12/20, Loss: 0.3207
Epoch 13/20, Loss: 0.3124
Epoch 14/20, Loss: 0.3119
Epoch 15/20, Loss: 0.3077
Epoch 16/20, Loss: 0.3027
Epoch 17/20, Loss: 0.3022
Epoch 18/20, Loss: 0.3026
Epoch 19/20, Loss: 0.2977
Epoch 20/20, Loss: 0.2949


In [None]:
# make the predictions
with torch.no_grad():
  y_pred = model(X_new.to(device))

y_pred

tensor([[0.4416],
        [1.6804],
        [4.7236]], device='cuda:0')

### Model Evaluation

METHOD 1: Implement by low level API

In [None]:
def evaluate(model, data_loader, metric_fn, aggregate_fn=torch.mean):
  """Compute a metric batch by batch and aggregate the per-batch values.

  Args:
    model: the nn.Module to evaluate; it is switched to evaluation mode.
    data_loader: iterable yielding `(X_batch, y_batch)` tensor pairs.
    metric_fn: called as `metric_fn(y_pred, y_batch)`; must return a tensor
      of the same shape for every batch so the results can be stacked.
    aggregate_fn: reduces the stacked per-batch metrics to the final value
      (defaults to `torch.mean`).

  Returns:
    Whatever `aggregate_fn` returns for the stacked per-batch metrics.

  Raises:
    ValueError: if `data_loader` yields no batches.
  """
  model.eval()
  # Send every batch to the device the model's parameters live on, so the
  # function does not depend on a notebook-level `device` variable.
  model_device = next(model.parameters()).device
  metrics = []

  with torch.no_grad():  # no gradients are needed for evaluation
    for X_batch, y_batch in data_loader:
      X_batch = X_batch.to(model_device)
      y_batch = y_batch.to(model_device)

      y_pred = model(X_batch)
      metrics.append(metric_fn(y_pred, y_batch))

  if not metrics:
    # torch.stack() fails with an opaque message on an empty list
    raise ValueError("data_loader yielded no batches; nothing to evaluate")

  return aggregate_fn(torch.stack(metrics))


In [None]:
valid_dataset = TensorDataset(X_valid,y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32)

valid_mse = evaluate(model, valid_loader, metric_fn=mse)
valid_mse

tensor(0.3180, device='cuda:0')

In [None]:
def rmse(y_pred, y_true):
  """Root mean squared error between predictions and targets (scalar tensor)."""
  residuals = y_pred - y_true
  mean_squared_error = residuals.pow(2).mean()
  return mean_squared_error.sqrt()

evaluate(model, valid_loader, rmse)

tensor(0.5518, device='cuda:0')

In [None]:
valid_mse.sqrt()

tensor(0.5639, device='cuda:0')

Why is it different?
The reason is that instead of calculating the RMSE over the
 whole validation set, we computed it over each batch and
 then computed the mean of all these batch RMSEs. That's
 not mathematically equivalent to computing the RMSE over
 the whole validation set. To solve this, we can use the MSE
 as our `metric_fn`, and use the `aggregate_fn` to compute
 the square root of the mean MSE:

In [None]:
evaluate(model, valid_loader, metric_fn=mse, aggregate_fn= lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.5639, device='cuda:0')

METHOD 2: Implement by torchmetrics

In [None]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
  """Evaluate `model` with a streaming metric object.

  Args:
    model: the nn.Module to evaluate; it is switched to evaluation mode.
    data_loader: iterable yielding `(X_batch, y_batch)` tensor pairs.
    metric: object exposing `reset()`, `update(y_pred, y_batch)` and
      `compute()`. It is reset first, so state from a previous evaluation
      is discarded. Presumably it must already be on the same device as the
      model (the caller moves it with `.to(device)`).

  Returns:
    The value of `metric.compute()` after all batches have been seen.
  """
  model.eval()
  metric.reset()
  # Send every batch to the device the model's parameters live on, so the
  # function does not depend on a notebook-level `device` variable.
  model_device = next(model.parameters()).device

  with torch.no_grad():  # no gradients are needed for evaluation
    for X_batch, y_batch in data_loader:
      X_batch = X_batch.to(model_device)
      y_batch = y_batch.to(model_device)

      y_pred = model(X_batch)
      metric.update(y_pred, y_batch)

  return metric.compute()


In [None]:
# Use a distinct name so the metric object does not shadow the rmse() function
# defined earlier in the notebook (re-running that cell would otherwise pick
# up this stateful object instead of the function).
rmse_metric = torchmetrics.MeanSquaredError(squared=False).to(device)
evaluate_tm(model, valid_loader, rmse_metric)

tensor(0.5639, device='cuda:0')

# Nonsequential Models Wide & Deep Neural Network

In [None]:
class WideAndDeep(nn.Module):
    """Wide & Deep regressor: the raw inputs are concatenated with the output
    of a small MLP (the "deep" path) before the final linear layer.

    Args:
        n_features: number of input features (columns of X).
    """

    def __init__(self, n_features):
        # nn.Module must be initialized before any submodule is assigned,
        # otherwise the attribute assignments below raise an AttributeError.
        super().__init__()
        self.deep_stack = nn.Sequential(
            nn.Linear(n_features, 50), nn.ReLU(),
            nn.Linear(50, 40), nn.ReLU()
        )
        # The output layer sees the 40 deep activations plus the raw inputs
        self.output_layer = nn.Linear(40 + n_features, 1)

    def forward(self, X):
        """Return one prediction per row of X, shape (batch, 1)."""
        deep_output = self.deep_stack(X)
        # Wide path: the inputs skip the hidden layers entirely
        wide_and_deep = torch.concat([X, deep_output], dim=1)
        return self.output_layer(wide_and_deep)

Wide and Deep to send subset of features

In [None]:
class WideAndDeepV2(nn.Module):
    """Wide & Deep regressor that sends a different feature subset down each path.

    The wide path receives the first five columns of the input and the deep
    path receives every column from index 2 onwards (the subsets overlap on
    columns 2-4). This is the same `[:, :5]` / `[:, 2:]` split that the
    two-input data loaders in this notebook use.

    Args:
        n_features: total number of input features (columns of X).
    """

    N_WIDE = 5      # number of leading columns fed to the wide path
    DEEP_START = 2  # index of the first column fed to the deep path

    def __init__(self, n_features):
        # nn.Module must be initialized before any submodule is assigned
        super().__init__()
        # The deep path only sees the columns from DEEP_START onwards
        self.deep_stack = nn.Sequential(
            nn.Linear(n_features - self.DEEP_START, 50), nn.ReLU(),
            nn.Linear(50, 40), nn.ReLU()
        )

        # 40 deep activations concatenated with the N_WIDE wide features
        self.output_layer = nn.Linear(40 + self.N_WIDE, 1)

    def forward(self, X):
        """Return one prediction per row of X, shape (batch, 1)."""
        X_wide = X[:, :self.N_WIDE]
        X_deep = X[:, self.DEEP_START:]

        deep_output = self.deep_stack(X_deep)
        wide_and_deep = torch.concat([X_wide, deep_output], dim=1)

        return self.output_layer(wide_and_deep)

Deep and Wide Model with multiple inputs

In [None]:
class WideAndDeepV3(nn.Module):
    """Wide & Deep regressor that takes the wide and deep inputs separately.

    The layer sizes match the two-input loaders built in this notebook from
    `X_train[:, :5]` (wide, five features) and `X_train[:, 2:]` (deep,
    `n_features - 2` features).

    Args:
        n_features: total number of features in the dataset, from which the
            wide and deep subsets are sliced.
    """

    N_WIDE = 5      # number of features expected on the wide input
    DEEP_START = 2  # the deep input holds the dataset columns from this index on

    def __init__(self, n_features):
        # nn.Module must be initialized before any submodule is assigned
        super().__init__()
        self.deep_stack = nn.Sequential(
            nn.Linear(n_features - self.DEEP_START, 50), nn.ReLU(),
            nn.Linear(50, 40), nn.ReLU()
        )

        # 40 deep activations concatenated with the N_WIDE wide features
        self.output_layer = nn.Linear(40 + self.N_WIDE, 1)

    def forward(self, X_wide, X_deep):
        """Return one prediction per sample, shape (batch, 1)."""
        deep_output = self.deep_stack(X_deep)
        wide_and_deep = torch.concat([X_wide, deep_output], dim=1)

        return self.output_layer(wide_and_deep)

Giving the Data: Method 1

In [None]:
train_data_wd = TensorDataset(X_train[:,:5], X_train[:,2:], y_train)
train_loader_wd = DataLoader(train_data_wd, batch_size=32, shuffle=True)
# rest same for the validation set and test set

In [None]:
# Illustrative fragment: how the inner training loop changes for a two-input model.
# NOTE(review): `model` must accept two inputs here (e.g. a WideAndDeepV3 instance);
# the nn.Sequential `model` created earlier takes a single tensor — confirm that
# `model` is re-created before running this cell.
for X_batch_wide, X_batch_deep, y_batch in train_loader_wd:
 # each batch now carries two input tensors plus the targets
 X_batch_wide = X_batch_wide.to(device)
 X_batch_deep = X_batch_deep.to(device)
 y_batch = y_batch.to(device)
 y_pred = model(X_batch_wide, X_batch_deep)
# [...]  # the rest of the function is unchanged

In [None]:
# Same loop, written so it works for any number of model inputs: the starred
# target collects every tensor except the last one (the targets) into a list.
# NOTE(review): `model` must accept as many inputs as the loader yields —
# confirm it is a multi-input model rather than the earlier nn.Sequential.
for *X_batch_inputs, y_batch in train_loader_wd:
 X_batch_inputs = [X.to(device) for X in X_batch_inputs]
 y_batch = y_batch.to(device)
 # unpack the list into positional arguments of the model's forward()
 y_pred = model(*X_batch_inputs)
# [...]  # the rest of the function is unchanged

Method 2

In [None]:
class WideAndDeepDataset(torch.utils.data.Dataset):
    """Dataset whose items are `(inputs, target)` pairs, where `inputs` is a
    dict holding the sample's wide features under "X_wide" and its deep
    features under "X_deep"."""

    def __init__(self, X_wide, X_deep, y):
        self.X_wide, self.X_deep, self.y = X_wide, X_deep, y

    def __len__(self):
        # One sample per target entry
        return len(self.y)

    def __getitem__(self, idx):
        # Named inputs let the training loop pass them to the model by keyword
        return (
            {"X_wide": self.X_wide[idx], "X_deep": self.X_deep[idx]},
            self.y[idx],
        )
        

In [None]:
train_data_named = WideAndDeepDataset(X_wide= X_train[:, :5], X_deep= X_train[:, 2:], y=y_train)
train_loader_named = DataLoader(train_data_named, batch_size=32, shuffle=True)
# same for validation and test


In [None]:
# Illustrative fragment: each batch's inputs arrive as a dict keyed by input name.
# NOTE(review): `model` must accept `X_wide` and `X_deep` keyword arguments
# (e.g. a WideAndDeepV3 instance) — confirm it is not the earlier nn.Sequential.
for inputs, y_batch in train_loader_named:
    # move every named input tensor to the selected device
    inputs = {name:X.to(device) for name, X in inputs.items()}
    y_batch = y_batch.to(device)
    y_pred = model(X_wide=inputs['X_wide'], X_deep=inputs['X_deep'])
    # [...]  # the rest of the function is unchanged