In [4]:
# Import packages
import torch
import os
from torch import nn 
from torch import optim 
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from dataset import train_data, test_data
from residual_block import Residual_block
from resnet18 import ResNet18

In [5]:
# Wrap both datasets in DataLoaders that serve mini-batches of 16 samples.
# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=16)
test_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=16)

In [None]:
# To start the MLflow server, adjust the folder paths below, then copy the following command and run it in cmd:


# mlflow server --backend-store-uri "file:///C:/Users/Admin/ML_flow_Tracking/data_local" --default-artifact-root "file:///C:/Users/Admin/ML_flow_Tracking/artefacts" --host localhost --port 5000

In [6]:
# Set the USER environment variable for this process; per the notebook's
# note, this is the user name used for registration at the MLflow server.
os.environ.update({'USER': 'USER'})

In [7]:
# Reproducibility: make cuDNN deterministic and seed PyTorch's CPU and CUDA
# random number generators with one fixed value.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

seed = 42
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

In [None]:
# Initialization of MLflow.
# The notebook never imported `mlflow`, so a fresh kernel failed here with a
# NameError. Import it (and the PyTorch flavor that the training cell uses
# through `mlflow.pytorch.log_model`) before the first call.
import mlflow
import mlflow.pytorch

# Send all tracking calls to the MLflow server listening on localhost:5000.
mlflow.set_tracking_uri('http://127.0.0.1:5000/')

# Indicate project name
mlflow.set_experiment('PROJECT_NAME')

In [9]:
# Silence MLflow's own log output: records below ERROR (e.g. warnings) from
# the "mlflow" logger are dropped.
import logging

mlflow_logger = logging.getLogger(name="mlflow")
mlflow_logger.setLevel(level=logging.ERROR)

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Hyperparameters and run label consumed by the training cell.
run_name = 'Exp_1'    # name given to the MLflow run
opt = 'Adam'          # optimizer name recorded with the run
EPOCH = 10            # number of training epochs
LR = 1e-3             # learning rate
weight_decay = 0
momentum = 0

In [None]:
# Train ResNet18 inside one MLflow run: log the hyperparameters, the
# per-epoch train/test loss and accuracy, and store the model every time the
# test accuracy improves.


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Make one full pass over `loader` and return `(mean_loss, accuracy_percent)`.

    With an `optimizer` the model is switched to train mode and its weights
    are updated after every batch. Without one the model is switched to eval
    mode and the pass runs with gradients disabled.

    Both returned values are plain Python floats, so they can be logged and
    compared without carrying tensors around.

    Raises:
        ValueError: if `loader` yields no samples (the averages are undefined).
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = outputs.size(0)
            # The criterion returns the batch mean; multiply by the batch size
            # to accumulate a total that is averaged over all samples below.
            loss_sum += loss.item() * batch_size
            # Index of the largest score per row is the predicted class.
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("loader produced no samples; cannot compute loss/accuracy")

    return loss_sum / n_samples, 100 * n_correct / n_samples


# Fail before a run is opened if the configured optimizer is not supported,
# so the tracking server does not collect empty failed runs.
if opt not in ('Adam', 'SGD'):
    raise ValueError(f"Unsupported optimizer {opt!r}; expected 'Adam' or 'SGD'")

with mlflow.start_run(run_name=run_name) as run:                              # starting experiment with name "run_name"
    net = ResNet18(Residual_block).to(device)
    loss_func = nn.CrossEntropyLoss()

    # Build the optimizer that `opt` names, so the logged "optimizer" and
    # "momentum" params describe what is actually used. `momentum` only
    # affects SGD; it is still logged for Adam to keep runs comparable.
    if opt == 'SGD':
        optimizer = optim.SGD(net.parameters(), lr=LR,
                              momentum=momentum, weight_decay=weight_decay)
    else:
        optimizer = optim.Adam(net.parameters(), lr=LR,
                               weight_decay=weight_decay)

    mlflow.log_param("momentum", momentum)
    mlflow.log_param("weight_decay", weight_decay)
    mlflow.log_param("lr", LR)
    mlflow.log_param("optimizer", opt)
    mlflow.log_param("epochs", EPOCH)

    maxacc = 0.0   # best test accuracy (percent) seen so far

    for epoch in range(1, EPOCH + 1):   # epochs are numbered from 1 in logs
        # Russian: "Training of epoch {epoch} has started"
        print(f'Началось обучение {epoch} эпохи')

        train_loss, train_acc = _run_epoch(net, train_loader, loss_func, device, optimizer)

        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("train_acc", train_acc, step=epoch)
        print(f'The Epoch {epoch}:')
        print(f'Train loss - {train_loss:.3f}, Train accuracy - {train_acc:.2f} %')

        test_loss, test_acc = _run_epoch(net, test_loader, loss_func, device)

        mlflow.log_metric("test_loss", test_loss, step=epoch)
        mlflow.log_metric("test_acc", test_acc, step=epoch)
        print(f'Test loss - {test_loss:.3f}, Test accuracy - {test_acc:.2f} %')

        if test_acc > maxacc:
            print('Saving model because its better')
            maxacc = test_acc
            mlflow.pytorch.log_model(net, "MODEL_NAME")                       # Indicate model name "MODEL_NAME"
        print('-------')

    print(f'Max accuracy - {maxacc:.2f} %')
    mlflow.log_metric("max test accuracy", maxacc)

# No explicit mlflow.end_run() is needed: leaving the `with` block ends the run.