# Script Mode in SageMaker
In this page, we will see how to use script mode in SageMaker to submit a training job.
To do this we will need two files: our training script (`pytorch_mnist.py`) and the submission script (this Jupyter notebook).

First we need to import the things we will need:
- `PyTorch`: This is the estimator class we will use (imported from `sagemaker.pytorch`).
- `get_execution_role`: We will need this to get the role when initialising our estimator.

## `pytorch_mnist.py`
<details>
  <summary> Click here to see the full script code </summary>
   
``` python
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

class Net(nn.Module):
    """Small convolutional classifier used for the MNIST example.

    The network applies two 3x3 convolutions, a 2x2 max-pool, dropout, and
    two fully connected layers. ``forward`` returns log-probabilities over
    10 classes (``log_softmax`` along dimension 1).
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor: 1 -> 32 -> 64 channels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        # Dropout after pooling and after the hidden dense layer.
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 9216 = 64 channels * 12 * 12, the flattened size for a 1x28x28 input.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        features = F.relu(self.conv1(x))
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        # Flatten everything except the batch dimension.
        flat = torch.flatten(self.dropout1(features), 1)
        hidden = self.dropout2(F.relu(self.fc1(flat)))
        return F.log_softmax(self.fc2(hidden), dim=1)


def train(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: The module to optimise; it must return log-probabilities,
            since the loss is ``F.nll_loss``.
        train_loader: Iterable of ``(data, target)`` batches that also
            supports ``len()`` and exposes a ``dataset`` attribute (both are
            used for the progress message).
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number, used only in the progress message.

    A progress line is printed every 100 batches.
    """
    # Put the model in training mode explicitly so layers such as dropout
    # behave correctly even if the caller left the model in eval mode.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )


def test(model, test_loader):
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def main():
    """Parse command-line hyperparameters, train on MNIST, and save the model.

    The script accepts ``--batch-size``, ``--test-batch-size``, ``--epochs``,
    ``--lr`` and ``--model-dir``. The trained ``state_dict`` is written to
    ``<model-dir>/mnist_cnn.pt``. ``--model-dir`` defaults to the
    ``SM_MODEL_DIR`` environment variable when it is set (so that a SageMaker
    training job includes the weights in its ``model.tar.gz`` artifact) and to
    the current directory otherwise.
    """
    # Local import: only needed to resolve the model output directory.
    import os

    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=14,
        metavar="N",
        help="number of epochs to train (default: 14)",
    )
    parser.add_argument(
        "--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)"
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "."),
        metavar="DIR",
        help="directory to save the trained model in "
        "(default: $SM_MODEL_DIR if set, else the current directory)",
    )
    args = parser.parse_args()

    train_kwargs = {"batch_size": args.batch_size}
    test_kwargs = {"batch_size": args.test_batch_size}

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net()

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Epochs are numbered from 1 so the progress messages read naturally.
    for epoch in range(1, args.epochs + 1):
        train(model, train_loader, optimizer, epoch)
        test(model, test_loader)

    # Save into the model directory; on SageMaker this is the directory that
    # gets packaged and uploaded as the job's model artifact.
    os.makedirs(args.model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(args.model_dir, "mnist_cnn.pt"))


if __name__ == "__main__":
    main()
```
</details>

In [None]:
!pip install sagemaker -U

In [2]:
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


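Next we will create a dictionary containing our hyperparameters. SageMaker passes each hyperparameter to the training script as a command-line argument (for example `--epochs 2`), so the script must accept them, as `pytorch_mnist.py` does with `argparse`.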

In [3]:
# Keys must match the command-line option names parsed by pytorch_mnist.py
# (without the leading "--").
hyperparameters = {
    "epochs": "2",
    "batch-size": "32",
    "test-batch-size": "100",
    "lr": "0.001",
}

Next we will initialise our estimator. Here we need to specify the:
- `entry_point`: The path of the training script
- `base_job_name`: The prefix for the training job name; SageMaker appends a timestamp to it to make each job name unique
- `instance_type`: The type of training instance you want to use
- `instance_count`: The number of training instances to use
- `framework_version`: The version of PyTorch you want in your training instance
- `py_version`: The version of Python you want in your training instance

In [4]:
# Configure the training job; nothing is launched until fit() is called.
estimator = PyTorch(
    # Training script run inside the container.
    entry_point="pytorch_mnist.py",
    # Prefix of the generated training job name.
    base_job_name="sagemaker-script-mode",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    # Forwarded to the entry point as command-line arguments.
    hyperparameters=hyperparameters,
    # PyTorch and Python versions of the training container.
    framework_version="1.8",
    py_version="py36",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Calling the `fit()` method will start training. By setting `wait=True`, this line will be blocking, meaning that the rest of the code will not run until the training job finishes.

In [5]:
# wait=True blocks this cell until the training job has finished.
estimator.fit(wait=True)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: sagemaker-script-mode-2024-01-29-22-16-55-535


2024-01-29 22:16:56 Starting - Starting the training job...
2024-01-29 22:17:12 Starting - Preparing the instances for training.........
2024-01-29 22:18:30 Downloading - Downloading input data...
2024-01-29 22:19:10 Downloading - Downloading the training image......
2024-01-29 22:20:10 Training - Training image download completed. Training in progress.
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-01-29 22:20:14,499 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-01-29 22:20:14,502 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-01-29 22:20:14,511 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-01-29 22:20:14,516 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-01-29 22:20:14,708 sagemaker-trai... [output truncated]

Next you can see some of the details of the training like the job name, the hyperparameters used for training and the location where the trained model is saved.

In [6]:
# Prefix from which the timestamped training job name was generated.
estimator.base_job_name

'sagemaker-script-mode'

In [7]:
# Hyperparameters as sent to the training job, including the sagemaker_* entries added by the SDK.
estimator.hyperparameters()

{'epochs': '"2"',
 'batch-size': '"32"',
 'test-batch-size': '"100"',
 'lr': '"0.001"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-689833681399/sagemaker-script-mode-2024-01-29-22-16-55-535/source/sourcedir.tar.gz"',
 'sagemaker_program': '"pytorch_mnist.py"',
 'sagemaker_container_log_level': '20',
 'sagemaker_job_name': '"sagemaker-script-mode-2024-01-29-22-16-55-535"',
 'sagemaker_region': '"us-east-1"'}

In [8]:
# S3 URI of the model artifact (model.tar.gz) produced by the training job.
estimator.model_data

's3://sagemaker-us-east-1-689833681399/sagemaker-script-mode-2024-01-29-22-16-55-535/output/model.tar.gz'