# Getting started with PyTorch on Intel® Gaudi.

This notebook helps you get started quickly with the Intel® Gaudi accelerator in this container. A simple MNIST model is trained on the Gaudi accelerator. You can tune the parameters below to change the configuration of the training. For more information and reference, please refer to the official documentation of the [Intel® Gaudi accelerator](https://docs.habana.ai/en/latest/index.html).

### Setup modes for training

1. lazy_mode: Set to True to enable lazy mode, or False to disable it (the model is then run through `torch.compile`).
2. enable_amp: Set to True to enable Automatic Mixed Precision (bfloat16 autocast), or False to disable it.
3. epochs: Number of epochs for training.
4. lr: Learning rate for training.
5. batch_size: Number of samples in a batch.
6. milestones: Milestone epochs for the MultiStepLR scheduler.

In [None]:
# --- Execution switches -------------------------------------------------
# True selects lazy mode; False makes train() go through torch.compile.
lazy_mode = False
# True wraps the forward pass and loss in bfloat16 autocast.
enable_amp = False

# --- Optimisation hyper-parameters --------------------------------------
epochs = 20            # number of passes over the training set
lr = 0.01              # initial learning rate handed to SGD
batch_size = 128       # samples per batch for both data loaders
milestones = [10, 15]  # epochs at which MultiStepLR scales the learning rate

In [None]:
import os

# NOTE(review): presumably these variables must be set before
# habana_frameworks is imported further down - confirm.
# Point HABANA_LOGS at the log directory used in this container.
os.environ['HABANA_LOGS'] = '/opt/app-root/logs'
# Mirror the lazy_mode switch into the environment as a '1'/'0' flag.
os.environ['PT_HPU_LAZY_MODE'] = '1' if lazy_mode else '0'

### Import packages

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import os

# Import Habana Torch Library
import habana_frameworks.torch.core as htcore

### Define Model

In [None]:
class SimpleModel(nn.Module):
    """Three-layer fully connected classifier for 28x28 inputs.

    Every sample is flattened to 784 features and pushed through
    784 -> 256 -> 64 -> 10 linear layers, with a ReLU after the first two.
    """

    def __init__(self):
        super().__init__()

        # Attribute names double as state_dict keys; keep them stable so
        # previously saved checkpoints still load.
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the raw (un-normalised) class scores.

        Args:
            x: Tensor whose elements can be regrouped into rows of 784
                values, e.g. a batch shaped ``(batch, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(rows, 10)`` with one score per class.
        """
        # reshape (unlike view) also accepts non-contiguous inputs; for
        # contiguous inputs it is still a zero-copy view.
        out = x.reshape(-1, 28 * 28)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        return self.fc3(out)

### Define training routine

In [None]:
# Most recent (module, compiled wrapper) pair built by train(). It lets the
# once-per-epoch calls reuse a single torch.compile wrapper instead of
# wrapping the same model again on every call. Only one pair is kept, so at
# most one model is held alive by this cache.
_LAST_COMPILED = {"source": None, "wrapped": None}


def _compiled_once(net):
    """Return a torch.compile wrapper for net, reusing the last one if net is unchanged."""
    if _LAST_COMPILED["source"] is not net:
        # Compile first so a failure does not leave a half-updated cache.
        wrapped = torch.compile(net, backend="hpu_backend")
        _LAST_COMPILED["source"] = net
        _LAST_COMPILED["wrapped"] = wrapped
    return _LAST_COMPILED["wrapped"]


def train(net,criterion,optimizer,trainloader,device):
    """Run one training epoch and report the mean loss and accuracy.

    Reads the notebook-level ``lazy_mode`` and ``enable_amp`` switches.

    Args:
        net: Model to train; switched to training mode here.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        optimizer: Optimizer that updates ``net``'s parameters.
        trainloader: Iterable yielding ``(data, targets)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(train_loss, train_acc)``: the loss averaged over batches and
        the accuracy in percent. Both are NaN when ``trainloader`` yields no
        batches.
    """
    net.train()
    # Outside lazy mode the forward pass goes through torch.compile; the
    # wrapper is built once per model and reused on later epochs.
    model = net if lazy_mode else _compiled_once(net)

    train_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for data, targets in trainloader:

        data, targets = data.to(device), targets.to(device)

        optimizer.zero_grad()
        if enable_amp:
            with torch.autocast(device_type="hpu", dtype=torch.bfloat16):
                outputs = model(data)
                loss = criterion(outputs, targets)
        else:
            outputs = model(data)
            loss = criterion(outputs, targets)

        loss.backward()

        # API call to trigger execution
        if lazy_mode:
            htcore.mark_step()

        optimizer.step()

        # API call to trigger execution
        if lazy_mode:
            htcore.mark_step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        num_batches += 1

    # An empty loader has no meaningful average; report NaN instead of
    # failing on an unbound loop variable or a division by zero.
    train_loss = train_loss / num_batches if num_batches else float("nan")
    train_acc = 100.0 * (correct / total) if total else float("nan")
    print("Training loss is {} and training accuracy is {}".format(train_loss,train_acc))
    return train_loss, train_acc

### Define testing routine

In [None]:
def test(net,criterion,testloader,device):
    """Evaluate the model on a data loader and report mean loss and accuracy.

    Reads the notebook-level ``lazy_mode`` and ``enable_amp`` switches.
    Gradients are disabled for the whole pass.

    Args:
        net: Model to evaluate; switched to evaluation mode here.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        testloader: Iterable yielding ``(data, targets)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(test_loss, test_acc)``: the loss averaged over batches and
        the accuracy in percent. Both are NaN when ``testloader`` yields no
        batches.
    """
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():

        for data, targets in testloader:

            data, targets = data.to(device), targets.to(device)

            if enable_amp:
                with torch.autocast(device_type="hpu", dtype=torch.bfloat16):
                    outputs = net(data)
                    loss = criterion(outputs, targets)
            else:
                outputs = net(data)
                loss = criterion(outputs, targets)

            # API call to trigger execution
            if lazy_mode:
                htcore.mark_step()

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            num_batches += 1

    # An empty loader has no meaningful average; report NaN instead of
    # failing on an unbound loop variable or a division by zero.
    test_loss = test_loss / num_batches if num_batches else float("nan")
    test_acc = 100.0 * (correct / total) if total else float("nan")
    print("Testing loss is {} and testing accuracy is {}".format(test_loss,test_acc))
    return test_loss, test_acc

### Run the main routine to train and test the model

In [None]:
# Where MNIST is downloaded to and where per-epoch checkpoints are written.
load_path = './data'
save_path = './checkpoints'

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)

# Target the Gaudi HPU device
device = torch.device("hpu")

# Data: images are only converted to tensors, no further preprocessing.
transform = transforms.Compose([
    transforms.ToTensor(),
])

trainset = torchvision.datasets.MNIST(root=load_path, train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                        shuffle=True, num_workers=2)
testset = torchvision.datasets.MNIST(root=load_path, train=False,
                                    download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                        shuffle=False, num_workers=2)

# Model, loss, optimizer and learning-rate schedule.
net = SimpleModel()
net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr,
                    momentum=0.9, weight_decay=5e-4)
# The learning rate is multiplied by gamma at each milestone epoch.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

for epoch in range(1, epochs+1):
    print("=====================================================================")
    print("Epoch : {}".format(epoch))
    train(net,criterion,optimizer,trainloader,device)
    test(net,criterion,testloader,device)

    # One checkpoint (weights only) per epoch.
    torch.save(net.state_dict(), os.path.join(save_path,'epoch_{}.pth'.format(epoch)))

    scheduler.step()