## IDRIS

# What is CUDA?

CUDA (Compute Unified Device Architecture) is a parallel computing platform and application programming interface (API) model created by NVIDIA. It enables developers to harness the computational power of NVIDIA GPUs (Graphics Processing Units) for general-purpose processing tasks beyond graphics rendering.

# How do I know if I have CUDA available?

In [None]:
import torch
# True when this PyTorch installation can currently use a CUDA device
torch.cuda.is_available()

True

# Using GPU and CUDA


In [None]:
# Get the index of the currently selected CUDA device
torch.cuda.current_device()

0

In [None]:
# Get the name of the CUDA device whose index is 0
torch.cuda.get_device_name(0)

'Tesla T4'

In [None]:
# Returns the GPU memory currently occupied by tensors, in bytes.
# No device argument is passed here, so the current device is queried.
torch.cuda.memory_allocated()

17107456

In [None]:
# Returns the GPU memory currently managed by the caching allocator,
# in bytes. No device argument is passed, so the current device is queried.
# `memory_reserved()` is the current name of this query; the previously
# used `memory_cached()` is a deprecated alias for it.
torch.cuda.memory_reserved()



23068672

# Using CUDA instead of CPU

In [None]:
# CPU: no device is specified, so the tensor is created in CPU memory.
# torch.tensor(...) with an explicit dtype replaces the legacy
# torch.FloatTensor(...) constructor and yields the same float32 tensor.
a = torch.tensor([1., 2.], dtype=torch.float32)

In [None]:
# Display the tensor's contents
a

tensor([1., 2.])

In [None]:
# Show which device holds the tensor
a.device

device(type='cpu')

In [None]:
# GPU: build the float32 tensor, then copy it to the current CUDA device.
# torch.tensor(...) with an explicit dtype replaces the legacy
# torch.FloatTensor(...) constructor; .cuda() performs the host-to-device copy.
a = torch.tensor([1., 2.], dtype=torch.float32).cuda()

In [None]:
# Show which device holds the tensor after the .cuda() call
a.device

device(type='cuda', index=0)

In [None]:
# GPU memory (in bytes) currently occupied by tensors on the current device
torch.cuda.memory_allocated()

17107456

## Defining our Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
class Model(nn.Module):
    """Small fully connected network with two ReLU hidden layers.

    Args:
        in_features: number of values in each input sample.
        h1: width of the first hidden layer.
        h2: width of the second hidden layer.
        out_features: number of output scores per sample.
    """

    def __init__(self, in_features=4, h1=8, h2=9, out_features=3):
        super().__init__()
        # Layers are created in network order: in_features -> h1 -> h2 -> out_features
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        """Return the raw output scores for ``x`` (no activation on the last layer)."""
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.out(x)

In [None]:
# Instantiate the network with its default layer sizes
model = Model()

In [None]:
# Check whether the model's first parameter tensor is stored on a CUDA device.
next(model.parameters()).is_cuda

False

We will use the simple MNIST dataset to demonstrate the difference between training on the CPU and training on the GPU.

In [None]:
# Transform applied to every image on load: convert it to a tensor
transform = transforms.ToTensor()

In [None]:
# MNIST is stored under ../Data (download=True requests a download);
# train=True selects the training split and train=False the test split.
# Both splits use the same tensor-conversion transform.
train_data = datasets.MNIST(root='../Data', train=True, download=True, transform= transform)
test_data = datasets.MNIST(root='../Data', train=False, download=True, transform=transform)

In [None]:
# Serve both splits in mini-batches of 10 images.
# Only the training data is shuffled; the test data keeps its order.
train_loader = DataLoader(train_data, batch_size= 10, shuffle=True)
test_loader = DataLoader(test_data, batch_size= 10, shuffle=False)

# Defining the CNN model

In [None]:
class ConvolutionalNetwork(nn.Module):
    """CNN with two convolution stages and three linear layers, 10 outputs.

    ``forward`` flattens the convolutional output to 5*5*16 values per
    sample, which is what 1-channel 28x28 images produce
    (28 -> 26 -> 13 -> 11 -> 5 pixels per side).
    """

    def __init__(self):
        super().__init__()
        # Conv2d arguments: in_channels, out_channels, kernel_size, stride
        self.conv1 = nn.Conv2d(1, 6, 3, 1)
        self.conv2 = nn.Conv2d(6, 16, 3, 1)
        # Fully connected head: 400 -> 120 -> 84 -> 10
        self.fc1 = nn.Linear(5*5*16, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, X):
        """Return per-class log-probabilities for the image batch ``X``."""
        # Each stage: convolution -> ReLU -> 2x2 max-pooling with stride 2
        for conv in (self.conv1, self.conv2):
            X = F.max_pool2d(F.relu(conv(X)), 2, 2)
        # Flatten every sample to a 400-element vector for the linear layers
        flat = X.view(-1, 5*5*16)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return F.log_softmax(self.fc3(hidden), dim=1)

In [None]:
# Build the CNN (rebinding `model`) and display its layer summary
model = ConvolutionalNetwork()
model

ConvolutionalNetwork(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [None]:
# Check whether the model's first parameter tensor is stored on a CUDA device.
next(model.parameters()).is_cuda

False

In [None]:
# Define the loss criterion and the optimizer.
# NOTE(review): ConvolutionalNetwork.forward already ends in log_softmax,
# while CrossEntropyLoss expects unnormalized scores and applies log-softmax
# itself; nn.NLLLoss would be the direct counterpart — confirm the pairing
# is intended.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training our Model

In [None]:
import time

# Train and evaluate `model` on the CPU for a fixed number of epochs,
# recording per-epoch losses and correct-prediction counts, and report
# the total wall-clock duration.
start_time = time.time()

epochs = 2
train_losses = []
test_losses = []
train_correct = []
test_correct = []

# Take the progress-report figures from the loader instead of hard-coding them
batch_size = train_loader.batch_size
n_train = len(train_loader.dataset)

for i in range(epochs):
    trn_corr = 0
    tst_corr = 0

    # Run the training batches (b counts batches starting from 1)
    for b, (X_train, y_train) in enumerate(train_loader, start=1):

        # Apply the model (model and data both stay on the CPU here)
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)

        # Tally the number of correct predictions
        predicted = torch.max(y_pred, 1)[1]  # index of the highest score per sample
        trn_corr += (predicted == y_train).sum().item()

        # Update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print interim results
        if b % 600 == 0:
            print(f'epoch: {i+1:2}  batch: {b:4} [{batch_size * b:6}/{n_train}]  loss: {loss.item():10.8f}  accuracy: {trn_corr * 100 / (batch_size * b):7.3f}%')

    # Record the loss of the last training batch as a plain float so the
    # list does not keep the tensor (and its autograd graph) alive.
    train_losses.append(loss.item())
    train_correct.append(trn_corr)

    # Run the testing batches
    with torch.no_grad():
        for X_test, y_test in test_loader:

            # Apply the model
            y_val = model(X_test)

            # Tally the number of correct predictions
            predicted = torch.max(y_val, 1)[1]
            tst_corr += (predicted == y_test).sum().item()

        # Loss of the final test batch only (y_val / y_test hold the last batch)
        loss = criterion(y_val, y_test)

    test_losses.append(loss.item())
    test_correct.append(tst_corr)

print(f'\nDuration: {time.time() - start_time:.0f} seconds')  # Print the time elapsed


epoch:  1  batch:  600 [  6000/60000]  loss: 0.37270087  accuracy:  73.500%
epoch:  1  batch: 1200 [ 12000/60000]  loss: 0.15206459  accuracy:  82.833%
epoch:  1  batch: 1800 [ 18000/60000]  loss: 0.01752903  accuracy:  86.661%
epoch:  1  batch: 2400 [ 24000/60000]  loss: 0.05838161  accuracy:  88.821%
epoch:  1  batch: 3000 [ 30000/60000]  loss: 0.02061462  accuracy:  90.173%
epoch:  1  batch: 3600 [ 36000/60000]  loss: 0.01586456  accuracy:  91.228%
epoch:  1  batch: 4200 [ 42000/60000]  loss: 0.01075920  accuracy:  92.019%
epoch:  1  batch: 4800 [ 48000/60000]  loss: 0.13801089  accuracy:  92.627%
epoch:  1  batch: 5400 [ 54000/60000]  loss: 0.01024230  accuracy:  93.133%
epoch:  1  batch: 6000 [ 60000/60000]  loss: 0.13038364  accuracy:  93.560%
epoch:  2  batch:  600 [  6000/60000]  loss: 0.03631741  accuracy:  97.717%
epoch:  2  batch: 1200 [ 12000/60000]  loss: 0.01364149  accuracy:  97.675%
epoch:  2  batch: 1800 [ 18000/60000]  loss: 0.02211459  accuracy:  97.678%
epoch:  2  b

# Now using CUDA on MNIST

In [None]:
# Build a fresh CNN and move its parameters to the current CUDA device
gpumodel = ConvolutionalNetwork().cuda()

In [None]:
# Check whether the model's first parameter tensor is stored on a CUDA device.
next(gpumodel.parameters()).is_cuda

True

In [None]:
# Define the loss criterion and the optimizer.
# The optimizer is built from gpumodel's parameters, i.e. after the
# model was moved with .cuda().
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gpumodel.parameters(), lr=0.001)

In [None]:
import time

# Train and evaluate `gpumodel` on the GPU for a fixed number of epochs,
# recording per-epoch losses and correct-prediction counts, and report
# the total wall-clock duration.
start_time = time.time()

epochs = 2
train_losses = []
test_losses = []
train_correct = []
test_correct = []

# Take the progress-report figures from the loader instead of hard-coding them
batch_size = train_loader.batch_size
n_train = len(train_loader.dataset)

for i in range(epochs):
    trn_corr = 0
    tst_corr = 0

    # Run the training batches (b counts batches starting from 1)
    for b, (X_train, y_train) in enumerate(train_loader, start=1):

        # Move data to GPU so it sits on the same device as the model
        X_train = X_train.cuda()
        y_train = y_train.cuda()

        # Apply the model on GPU
        y_pred = gpumodel(X_train)
        loss = criterion(y_pred, y_train)

        # Tally the number of correct predictions
        predicted = torch.max(y_pred, 1)[1]  # index of the highest score per sample
        trn_corr += (predicted == y_train).sum().item()

        # Update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print interim results
        if b % 600 == 0:
            print(f'epoch: {i+1:2}  batch: {b:4} [{batch_size * b:6}/{n_train}]  loss: {loss.item():10.8f}  accuracy: {trn_corr * 100 / (batch_size * b):7.3f}%')

    # Record the loss of the last training batch as a plain float so the
    # list does not keep the CUDA tensor (and its autograd graph) alive.
    train_losses.append(loss.item())
    train_correct.append(trn_corr)

    # Run the testing batches
    with torch.no_grad():
        for X_test, y_test in test_loader:
            # Move data to GPU
            X_test = X_test.cuda()
            y_test = y_test.cuda()

            # Apply the model
            y_val = gpumodel(X_test)

            # Tally the number of correct predictions
            predicted = torch.max(y_val, 1)[1]
            tst_corr += (predicted == y_test).sum().item()

        # Loss of the final test batch only (y_val / y_test hold the last batch)
        loss = criterion(y_val, y_test)

    test_losses.append(loss.item())
    test_correct.append(tst_corr)

# CUDA kernels are launched asynchronously; wait for all queued GPU work
# to finish so the measured duration covers the complete computation.
torch.cuda.synchronize()
print(f'\nDuration: {time.time() - start_time:.0f} seconds')  # Print the time elapsed


epoch:  1  batch:  600 [  6000/60000]  loss: 0.02661047  accuracy:  80.267%
epoch:  1  batch: 1200 [ 12000/60000]  loss: 0.03233250  accuracy:  87.067%
epoch:  1  batch: 1800 [ 18000/60000]  loss: 0.07894491  accuracy:  89.961%
epoch:  1  batch: 2400 [ 24000/60000]  loss: 0.02530557  accuracy:  91.504%
epoch:  1  batch: 3000 [ 30000/60000]  loss: 0.23879036  accuracy:  92.460%
epoch:  1  batch: 3600 [ 36000/60000]  loss: 0.00357001  accuracy:  93.189%
epoch:  1  batch: 4200 [ 42000/60000]  loss: 0.03443167  accuracy:  93.762%
epoch:  1  batch: 4800 [ 48000/60000]  loss: 0.22139096  accuracy:  94.188%
epoch:  1  batch: 5400 [ 54000/60000]  loss: 0.01990363  accuracy:  94.594%
epoch:  1  batch: 6000 [ 60000/60000]  loss: 0.02439185  accuracy:  94.882%
epoch:  2  batch:  600 [  6000/60000]  loss: 0.10189235  accuracy:  97.983%
epoch:  2  batch: 1200 [ 12000/60000]  loss: 0.00636857  accuracy:  97.900%
epoch:  2  batch: 1800 [ 18000/60000]  loss: 0.00662791  accuracy:  97.950%
epoch:  2  b

Summary:
In this demonstration, we compared the time needed to train the same convolutional network on MNIST using the CPU and using CUDA (GPU). The CPU execution took approximately 67 seconds, while the CUDA execution took approximately 47 seconds, so the GPU run was roughly 1.4× faster. This highlights the speedup achieved by utilizing the GPU for training deep learning models; the gain typically grows with larger models and larger batch sizes than the ones used here.

# End