In [2]:
! pip install bentoml
! pip install mlflow # mlflow
! pip install pyngrok # 로컬 개발환경을 원격으로 공유해주는 라이브러리



In [3]:
import torch.nn as nn
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
import torch.optim as optim
import mlflow # mlflow 사용을 위해

# Model 정의

In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784,100)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100,100)
        self.fc3 = nn.Linear(100,10)
    def forward(self, x):
        x1 = self.fc1(x)
        x2 = self.relu(x1)
        x3 = self.fc2(x2)
        x4 = self.relu(x3)
        x5 = self.fc3(x4)

        return x5


# Dataset 정의  
## MNIST Dataset을 사용하여 학습,검증을 합니다.

In [5]:
download_root = 'MNIST_data/'

train_dataset = datasets.MNIST(root=download_root,
                         train=True,
                         transform = transforms.ToTensor(),
                         download=True)
                         
test_dataset = datasets.MNIST(root=download_root,
                         train=False,
                         transform=transforms.ToTensor(),
                         download=True)    

# Batch_size, Train, Test Dataloader 정의

In [6]:
batch_size = 100
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


# 학습률, Optimizer 정의

In [7]:
model = Net()
model.zero_grad()
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)


# Epoch 정의 (10번)


In [8]:
total_batch = len(train_loader)
epochs = 10
print(len(train_loader))

600


# MLflow, Ngrok 환경설정

In [9]:
experiment_name = 'mnist_practice' # 실험명, 실험관리를 용이하게 해줍니다. 


if not mlflow.get_experiment_by_name(experiment_name): 
  mlflow.create_experiment(name=experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)

get_ipython().system_raw("mlflow ui --port 5000 &") # mlflow 포트 지정 및 백그라운드 실행을 위한 shell 코드.

from pyngrok import ngrok

ngrok.kill() # 혹시 열려있는 프로세스가 있으면 kill해줍니다.

NGROK_AUTH_TOKEN = "" # 개인 토큰입니다.https://dashboard.ngrok.com/get-started/your-authtoken 를 통해서 받아올 수 있습니다. 
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

ngrok_tunnel = ngrok.connect(addr="5000",proto="http",bind_tls=True) # tunnel을 설정해주는 코드입니다. mlflow의경우에는 5000번 포트를 사용합니다.
print("MLflow Tracking UI:", ngrok_tunnel.public_url) # 이 코드가 실행되고 나오는 url을 통해 MLflow에 접속할 수 있습니다.

MLflow Tracking UI: https://264b-220-72-63-26.ngrok.io


[2022-10-30 22:01:32 +0900] [5086] [INFO] Starting gunicorn 20.1.0
[2022-10-30 22:01:32 +0900] [5086] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2022-10-30 22:01:32 +0900] [5086] [ERROR] Retrying in 1 second.
[2022-10-30 22:01:33 +0900] [5086] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2022-10-30 22:01:33 +0900] [5086] [ERROR] Retrying in 1 second.
[2022-10-30 22:01:34 +0900] [5086] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2022-10-30 22:01:34 +0900] [5086] [ERROR] Retrying in 1 second.
[2022-10-30 22:01:35 +0900] [5086] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2022-10-30 22:01:35 +0900] [5086] [ERROR] Retrying in 1 second.
[2022-10-30 22:01:36 +0900] [5086] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2022-10-30 22:01:36 +0900] [5086] [ERROR] Retrying in 1 second.
[2022-10-30 22:01:37 +0900] [5086] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see the logs above for details.


# Seed 고정

In [10]:
import torch.backends.cudnn as cudnn
import random
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(seed)

# Train, Track
## 학습을 진행하고 MLflow로 학습을 기록하는 부분입니다.

In [12]:
train_loss_list = []
train_acc_list = []

val_loss_list = []
val_acc_list = []

best_accuracy = 0
with mlflow.start_run(experiment_id=experiment.experiment_id):
  for epoch in range(epochs):
      cost=0
      model.train()
      train_accuracy = 0
      train_loss = 0
      for images, labels in train_loader:
          images = images.reshape(100,784)
          
          optimizer.zero_grad() # 변화도 매개변수 0
          
          #forward
          #pred = model.forward(images)
          #loss = loss_function(pred, labels)
          pred = model(images)
          loss = loss_function(pred,labels)
          prediction = torch.argmax(pred,1)
          correct = (prediction == labels)
          train_accuracy += correct.sum().item() / 60000
          train_loss += loss.item() / 600
          
          #backward
          loss.backward()
          
          #Update
          optimizer.step()
          
          cost += loss
      
      with torch.no_grad(): #미분하지 않겠다는 것
          total = 0
          correct=0
          for images, labels in test_loader:
              images = images.reshape(100,784)

              outputs = model(images)
              _,predict = torch.max(outputs.data, 1)

              total += labels.size(0)
              correct += (predict==labels).sum() # 예측한 값과 일치한 값의 합

      avg_cost = cost / total_batch
      accuracy = 100*correct/total
      
      val_loss_list.append(avg_cost.detach().numpy())
      val_acc_list.append(accuracy)

      if accuracy > best_accuracy:
        torch.save(model.state_dict(),'model.pt')
        best_accuracy = accuracy
        print(f"Save Model(Epoch: {epoch+1}, Accuracy: {best_accuracy:.5})")
      
      print("epoch : {} | loss : {:.6f}" .format(epoch+1, avg_cost))
      print("Accuracy : {:.2f}".format(100*correct/total))
      mlflow.log_param('learning-rate',learning_rate) # mlflow.log_param 을 사용하여 MLflow에 파라미터들을 기록할 수 있습니다.
      mlflow.log_param('epoch',epochs)
      mlflow.log_param('batch_size',batch_size)
      mlflow.log_param('seed',seed)
      mlflow.log_metric('train_accuracy',train_accuracy) # mlflow.log_metric을 사용하여 MLflow에 성능평가를 위한 metric을 기록할 수 있습니다.
      mlflow.log_metric('train_loss',train_loss)
      mlflow.log_metric('valid_accuracy',accuracy)
      mlflow.log_metric('valid_loss',avg_cost)
      mlflow.pytorch.log_model(model,'model') # pytorch.log_model 을 통해 모델을 저장할 수 있습니다.
      print("------")

Save Model(Epoch: 1, Accuracy: 84.64)
epoch : 1 | loss : 0.931096
Accuracy : 84.64


PermissionError: [Errno 13] Permission denied: '/Users/gimgihun'

In [17]:
import os
import json
import bentoml
import numpy as np
import pandas as pd
from torch import cuda

In [24]:
MLFLOW_PATH = './model_mlflow'

if not os.path.isdir(MLFLOW_PATH):
  mlflow.pytorch.save_model(model, MLFLOW_PATH) # mlflow 모델 로컬 디렉토리에 저장

bentoml.mlflow.import_model("mnist_clf", model_uri='./model_mlflow') # mlflow로 저장한 모델을 bentoml format에 맞추어 가져옴

Model(tag="mnist_clf:6hkb6hsykwlipatm", path="/Users/hiseoung/bentoml/models/mnist_clf/6hkb6hsykwlipatm/")

In [30]:
!bentoml models list mnist_clf # 현재 등록되어 있는 모델 리스트 출력

[1m [0m[1mTag                       [0m[1m [0m[1m [0m[1mModule        [0m[1m [0m[1m [0m[1mSize      [0m[1m [0m[1m [0m[1mCreation Time      [0m[1m [0m
 mnist_clf:6hkb6hsykwlipatm  bentoml.mlflow  355.30 KiB  2022-10-30 22:22:47 


In [26]:
# mnist 테스트 데이터 가져오기
with open('./test_input_arr.json', 'r') as f:
  test_input_arr = np.array(json.load(f), dtype=np.float32)

In [1]:
# API 테스트
runner = bentoml.mlflow.get("mnist_clf:latest").to_runner()
runner.init_local()
runner.predict.run(test_input_arr)

NameError: name 'bentoml' is not defined

In [35]:
!bentoml serve service:svc --reload

2022-10-30T22:39:32+0900 [INFO] [cli] Prometheus metrics for HTTP BentoServer from "service:svc" can be accessed at http://localhost:3000/metrics.
2022-10-30T22:39:33+0900 [INFO] [cli] Starting development HTTP BentoServer from "service:svc" running on http://0.0.0.0:3000 (Press CTRL+C to quit)
2022-10-30 22:39:34 circus[7139] [INFO] Loading the plugin...
2022-10-30 22:39:34 circus[7139] [INFO] Endpoint: 'tcp://127.0.0.1:54140'
2022-10-30 22:39:34 circus[7139] [INFO] Pub/sub: 'tcp://127.0.0.1:54141'
2022-10-30T22:39:34+0900 [INFO] [observer] Watching directories: ['/Users/hiseoung/VSCodeProjects/AIP', '/Users/hiseoung/bentoml/models']
2022-10-30T22:39:35+0900 [ERROR] [cli] Exception in callback <bound method Arbiter.manage_watchers of <circus.arbiter.Arbiter object at 0x119497790>>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/gomgomi/lib/python3.8/site-packages/tornado/ioloop.py", line 921, in _run
    val = self.callback()
  File "/opt/homebrew