In [1]:
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from PIL import Image

from baseline import BaselineModel

In [2]:
# Point MLflow at the tracking server and select the experiment for this notebook.
# `mlflow_uri` was never defined anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel. Read it from the MLFLOW_TRACKING_URI environment
# variable instead, which also keeps the server address out of the notebook.
mlflow_uri = os.environ.get("MLFLOW_TRACKING_URI")
if not mlflow_uri:
    raise RuntimeError(
        "MLFLOW_TRACKING_URI is not set; export it before running this notebook."
    )
mlflow.set_tracking_uri(mlflow_uri)
# Kept as the last expression so the selected Experiment is displayed as cell output.
mlflow.set_experiment('cifar10-classification')

<Experiment: artifact_location='s3://mlflow-aidkr-test-bucket/1', experiment_id='1', lifecycle_stage='active', name='cifar10-classification', tags={}>

In [3]:
# CIFAR-10 input pipeline: one shared transform, a train and a validation dataset,
# and a DataLoader for each.
batch_size = 4

# Convert images to tensors, then normalise each of the three channels
# with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Both splits are stored under ./data and downloaded on first use.
train_set = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)
validation_set = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, num_workers=2
)
validation_loader = torch.utils.data.DataLoader(
    validation_set, batch_size=batch_size, shuffle=False, num_workers=2
)

# Human-readable class names, indexed by label id.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the baseline network and move its parameters onto the chosen device.
baseline_model = BaselineModel()
# Left as the trailing expression so the module summary is rendered as cell output.
baseline_model.to(device)

BaselineModel(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [5]:
# Training hyper-parameters: number of passes over the training set and Adam step size.
n_epochs, lr = 3, 0.001

# Cross-entropy loss on the model outputs, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(baseline_model.parameters(), lr=lr)

In [6]:
# Record the run configuration in MLflow.
# "training_set" / "validation_set" previously logged len(<loader>), which is the
# number of *batches*; log the number of samples in each dataset instead and record
# the batch size alongside so the batch count can still be derived.
mlflow.log_params({
    'n_epochs': n_epochs,
    "learning_rate": lr,
    "batch_size": batch_size,
    "training_set": len(train_set),
    "validation_set": len(validation_set),
})

In [7]:
# Train `baseline_model` for `n_epochs` epochs. After every epoch, the mean training
# loss / accuracy and the mean validation loss / accuracy are logged to MLflow.
#
# `loss_p` and `accuracy_p` collect the per-epoch training loss and accuracy (%).
loss_p = np.array([])
accuracy_p = np.array([])
for epoch in range(n_epochs):

    # training
    baseline_model.train()
    total_image = 0
    correct_image = 0
    running_loss = 0
    for i, data in enumerate(train_loader):
        image, label = data
        # The model was moved to `device`; its inputs and targets must live there too.
        image, label = image.to(device), label.to(device)
        optimizer.zero_grad()

        output = baseline_model(image)
        # Predicted class = index of the largest output per sample.
        # (argmax avoids assigning to `__`, which IPython uses for output history.)
        predicts = output.argmax(dim=1)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_image += label.size(0)
        correct_image += (predicts == label).sum().item()
        running_loss += loss.item()
        if i % 100 == 0:
            # '\r' rewrites the same console line instead of flooding the output.
            print('batch:{}/{}, accuracy:{}'.format(i, len(train_loader), correct_image/total_image*100), end='\r')

    # Mean loss per batch and accuracy in percent for this epoch.
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct_image / total_image * 100
    print('Epoch:{}, loss:{}, accuracy:{}'.format(epoch+1, train_loss, train_accuracy))
    loss_p = np.append(loss_p, train_loss)
    accuracy_p = np.append(accuracy_p, train_accuracy)
    # `step` keeps one point per epoch instead of logging every epoch at the same step.
    mlflow.log_metric("train_loss", train_loss, step=epoch + 1)
    mlflow.log_metric("train_accuracy", train_accuracy, step=epoch + 1)

    # validation
    baseline_model.eval()
    validation_loss = 0
    validation_total_image = 0
    validation_correct_image = 0
    with torch.no_grad():  # no gradients are needed while evaluating
        for validation_image, validation_label in validation_loader:
            validation_image = validation_image.to(device)
            validation_label = validation_label.to(device)

            # Evaluate on the validation batch (previously the last *training*
            # batch `image` was fed to the model here).
            validation_output = baseline_model(validation_image)
            predict = validation_output.argmax(dim=1)

            validation_total_image += validation_label.size(0)
            validation_correct_image += (predict == validation_label).sum().item()
            # .item() accumulates a Python float rather than a tensor.
            validation_loss += criterion(validation_output, validation_label).item()

    mlflow.log_metric("validation_loss", validation_loss/len(validation_loader), step=epoch + 1)
    mlflow.log_metric("validation_accuracy", validation_correct_image/validation_total_image*100, step=epoch + 1)

Epoch:1, loss:1.5476470832657814, accuracy:43.406
Epoch:2, loss:1.2801852732798458, accuracy:54.454
Epoch:3, loss:1.1755890388795733, accuracy:58.592


In [14]:
# Store the trained network as an artifact of the active run and register it in the
# model registry. The artifact path and the registered model name use the same string.
registry_name = "cifar10-classifier"

# Trailing expression: the returned value is rendered as the cell output.
mlflow.pytorch.log_model(
    baseline_model,
    artifact_path=registry_name,
    registered_model_name=registry_name,
)

Successfully registered model 'cifar10-classifier'.
2022/07/11 17:06:11 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: cifar10-classifier, version 1
Created version '1' of model 'cifar10-classifier'.


ModelInfo(artifact_path='cifar10-classifier', flavors={'pytorch': {'model_data': 'data', 'pytorch_version': '1.10.0', 'code': None}, 'python_function': {'pickle_module_name': 'mlflow.pytorch.pickle_module', 'loader_module': 'mlflow.pytorch', 'python_version': '3.8.13', 'data': 'data', 'env': 'conda.yaml'}}, model_uri='runs:/62d41a08277a4e55bdd1db44c13123a3/cifar10-classifier', model_uuid='8a49a4279c664b4fba4ecbead4fabb12', run_id='62d41a08277a4e55bdd1db44c13123a3', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-07-11 08:06:04.107959', mlflow_version='1.27.0')

In [21]:
# Close the active MLflow run now that parameters, metrics and the model are logged.
# "FINISHED" is the default status, spelled out here for readability.
mlflow.end_run(status="FINISHED")

17.4