In [1]:
import torch.nn as nn
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
import torch.optim as optim
import mlflow # mlflow 사용을 위해

class Net(nn.Module):
    """Fully connected classifier with layer sizes 784 -> 100 -> 100 -> 10.

    One shared ReLU module is applied after each of the first two linear
    layers; the output of the last linear layer is returned without any
    activation.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is so that
        # state_dict keys and seeded weight initialisation stay the same.
        self.fc1 = nn.Linear(784, 100)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 10)

    def forward(self, x):
        """Return the fc3 output for ``x`` (last dimension must be 784)."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# --- Reproducibility ---------------------------------------------------------
# Seed every RNG *before* the model is constructed: nn.Linear draws its initial
# weights from the global torch RNG, so seeding afterwards leaves the starting
# weights different on every fresh kernel.
import torch.backends.cudnn as cudnn
import random
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
cudnn.benchmark = False
cudnn.deterministic = True

# --- Data --------------------------------------------------------------------
download_root = 'MNIST_data/'

train_dataset = datasets.MNIST(root=download_root,
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)

test_dataset = datasets.MNIST(root=download_root,
                              train=False,
                              transform=transforms.ToTensor(),
                              download=True)

batch_size = 100
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation metrics do not depend on sample order, so the test set is not shuffled.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- Model / optimisation ----------------------------------------------------
model = Net()
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.02
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# --- MLflow ------------------------------------------------------------------
# The tracking URI must be configured before any experiment lookup/creation;
# otherwise the experiment id is resolved against the default tracking store
# instead of the server the run is later sent to.
mlflow.set_tracking_uri('http://localhost:5000')

experiment_name = 'mnist'  # experiment name; groups related runs together
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(name=experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

# Per-epoch history (one float per epoch).
train_loss_list = []
train_acc_list = []

val_loss_list = []
val_acc_list = []

total_batch = len(train_loader)
epochs = 10


best_accuracy = 0
with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="autoever"):
    # Hyper-parameters are constant for the whole run, so they are logged once.
    mlflow.log_param('learning-rate', learning_rate)
    mlflow.log_param('epoch', epochs)
    mlflow.log_param('batch_size', batch_size)
    mlflow.log_param('seed', seed)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0  # sum of per-sample losses
        train_correct = 0
        train_seen = 0
        for images, labels in train_loader:
            # Flatten each image; the batch dimension is taken from the tensor
            # so any batch size (including a smaller final batch) works.
            images = images.reshape(images.size(0), -1)

            optimizer.zero_grad()  # reset accumulated gradients

            # forward
            pred = model(images)
            loss = loss_function(pred, labels)

            # backward
            loss.backward()

            # update
            optimizer.step()

            # Accumulate plain Python numbers (.item()) so no autograd graph
            # is kept alive across iterations.
            n_samples = labels.size(0)
            train_loss_sum += loss.item() * n_samples
            train_correct += (torch.argmax(pred, 1) == labels).sum().item()
            train_seen += n_samples

        # ---- validation pass ----
        model.eval()
        val_loss_sum = 0.0
        total = 0
        correct = 0
        with torch.no_grad():  # no gradients needed for evaluation
            for images, labels in test_loader:
                images = images.reshape(images.size(0), -1)

                outputs = model(images)
                val_loss_sum += loss_function(outputs, labels).item() * labels.size(0)
                predict = torch.argmax(outputs, 1)

                total += labels.size(0)
                correct += (predict == labels).sum().item()  # number of correct predictions

        # ---- epoch summary ----
        train_loss = train_loss_sum / train_seen
        train_accuracy = train_correct / train_seen  # fraction in [0, 1]
        avg_cost = train_loss                        # mean training loss (printed below)
        valid_loss = val_loss_sum / total
        accuracy = 100 * correct / total             # validation accuracy in percent

        train_loss_list.append(train_loss)
        train_acc_list.append(train_accuracy)
        val_loss_list.append(valid_loss)
        val_acc_list.append(accuracy)

        # Checkpoint the weights whenever validation accuracy improves.
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), 'model.pt')
            best_accuracy = accuracy
            print(f"Save Model(Epoch: {epoch+1}, Accuracy: {best_accuracy:.5})")

        print("epoch : {} | loss : {:.6f}" .format(epoch+1, avg_cost))
        print("Accuracy : {:.2f}".format(accuracy))

        # Metrics are logged with an explicit step so MLflow keeps one point per epoch.
        mlflow.log_metric('train_accuracy', train_accuracy, step=epoch + 1)
        mlflow.log_metric('train_loss', train_loss, step=epoch + 1)
        mlflow.log_metric('valid_accuracy', accuracy, step=epoch + 1)
        mlflow.log_metric('valid_loss', valid_loss, step=epoch + 1)
        print("------")

    # Log the trained model once, after the last epoch. Re-logging it to the
    # same artifact path every epoch only repeated the upload.
    mlflow.pytorch.log_model(model, 'model')
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.

  from .autonotebook import tqdm as notebook_tqdm


Save Model(Epoch: 1, Accuracy: 82.89)
epoch : 1 | loss : 1.526924
Accuracy : 82.89




------
Save Model(Epoch: 2, Accuracy: 89.38)
epoch : 2 | loss : 0.475359
Accuracy : 89.38
------
Save Model(Epoch: 3, Accuracy: 90.48)
epoch : 3 | loss : 0.359833
Accuracy : 90.48
------
Save Model(Epoch: 4, Accuracy: 91.76)
epoch : 4 | loss : 0.318101
Accuracy : 91.76
------
Save Model(Epoch: 5, Accuracy: 92.22)
epoch : 5 | loss : 0.291083
Accuracy : 92.22
------
Save Model(Epoch: 6, Accuracy: 92.97)
epoch : 6 | loss : 0.269476
Accuracy : 92.97
------
Save Model(Epoch: 7, Accuracy: 93.46)
epoch : 7 | loss : 0.250513
Accuracy : 93.46
------
Save Model(Epoch: 8, Accuracy: 93.78)
epoch : 8 | loss : 0.233292
Accuracy : 93.78
------
Save Model(Epoch: 9, Accuracy: 93.85)
epoch : 9 | loss : 0.216755
Accuracy : 93.85
------
Save Model(Epoch: 10, Accuracy: 94.56)
epoch : 10 | loss : 0.202507
Accuracy : 94.56
------


In [54]:
import json
import bentoml
import numpy as np
import pandas as pd
from torch import cuda

In [56]:
# Export the trained model in MLflow format, then import that export into the
# BentoML model store.
import shutil

mlflow_export_dir = './model_mlflow'
# mlflow.pytorch.save_model raises if the target directory already exists, so
# remove the previous export first to keep this cell re-runnable.
shutil.rmtree(mlflow_export_dir, ignore_errors=True)
mlflow.pytorch.save_model(model, mlflow_export_dir)

# Import the MLflow model into BentoML.
bentoml.mlflow.import_model("torch_mmnist_model", model_uri=mlflow_export_dir)

Model(tag="torch_mmnist_model:fco4nxswpsh6f2kn", path="/home/khkim/bentoml/models/torch_mmnist_model/fco4nxswpsh6f2kn/")

In [57]:
# Log the model as an artifact of a new MLflow run, then import that logged
# artifact into the BentoML model store with a batchable `predict` signature.
artifact_path = "pytorch-model"
predict_signature = {'predict': {'batchable': True}}

with mlflow.start_run():
    mlflow.pytorch.log_model(model, artifact_path=artifact_path)

    # URI of the artifact inside the currently active run.
    model_uri = mlflow.get_artifact_uri(artifact_path)
    bento_model = bentoml.mlflow.import_model(
        'mlflow_pytorch_mnist',
        model_uri,
        signatures=predict_signature,
    )

In [58]:
# Shell command: list the stored versions of the 'mlflow_pytorch_mnist' model in the BentoML model store.
!bentoml models list mlflow_pytorch_mnist

[1m [0m[1mTag                          [0m[1m [0m[1m [0m[1mModule        [0m[1m [0m[1m [0m[1mSize      [0m[1m [0m[1m [0m[1mCreation Time      [0m[1m [0m
 mlflow_pytorch_mnist:fwakrmc…  bentoml.mlflow  355.50 KiB  2022-10-28 13:51:25 
 mlflow_pytorch_mnist:5wff3nc…  bentoml.mlflow  355.50 KiB  2022-10-28 13:13:50 
 mlflow_pytorch_mnist:aninu4s…  bentoml.mlflow  355.50 KiB  2022-10-28 11:05:36 


In [59]:
# Read the sample inputs from JSON into a float32 array and also wrap them in
# a DataFrame for the runner call further down.
with open('./test_input_arr.json', 'r') as json_file:
    raw_inputs = json.load(json_file)
test_input_arr = np.array(raw_inputs, dtype=np.float32)
input_df = pd.DataFrame(data=test_input_arr)

In [60]:
# Resolve the latest stored model, load the underlying PyTorch module from its
# MLflow folder, and run the sample inputs through it.
bento_model = bentoml.mlflow.get("mlflow_pytorch_mnist:latest")
mlflow_model_path = bento_model.path_of(bentoml.mlflow.MLFLOW_MODEL_FOLDER)

# Prefer the GPU when one is available.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# .to() and .eval() both return the module itself, so the calls can be chained.
loaded_pytorch_model = mlflow.pytorch.load_model(mlflow_model_path).to(device).eval()

input_tensor = torch.from_numpy(test_input_arr).to(device)
with torch.no_grad():
    predictions = loaded_pytorch_model(input_tensor)

In [61]:
# Load the same stored model through BentoML as an MLflow pyfunc model and
# predict directly on the NumPy array.
pyfunc_model: mlflow.pyfunc.PyFuncModel = bentoml.mlflow.load_model(
    "mlflow_pytorch_mnist:latest"
)
predictions = pyfunc_model.predict(test_input_arr)

In [62]:
# Wrap the latest stored model in a BentoML runner and call it in-process on
# the DataFrame inputs; the last expression is displayed as the cell output.
stored_model = bentoml.mlflow.get("mlflow_pytorch_mnist:latest")
runner = stored_model.to_runner()
runner.init_local()  # BentoML warns that init_local is for debugging and testing only
runner.predict.run(input_df)

'Runner.init_local' is for debugging and testing only.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.380183,-6.570198,3.705701,5.251037,-6.438535,0.700532,-10.802313,10.078778,-0.238268,2.46114
1,3.148345,-0.520767,10.006227,5.027098,-10.536626,4.742851,4.861799,-10.102612,3.07754,-9.618299
2,-5.20793,6.709457,1.573217,0.601503,-1.886631,-0.769723,-0.192309,1.179303,0.823319,-2.092222


In [63]:
# Shell command: start the BentoML development HTTP server for `svc` from service.py with --reload.
# NOTE: the server runs in the foreground, so this cell keeps running until it is interrupted (CTRL+C).
!bentoml serve service:svc --reload

2022-10-28T13:51:53+0900 [INFO] [cli] Prometheus metrics for HTTP BentoServer from "service:svc" can be accessed at http://localhost:3000/metrics.
2022-10-28T13:51:53+0900 [INFO] [cli] Starting development HTTP BentoServer from "service:svc" running on http://0.0.0.0:3000 (Press CTRL+C to quit)
2022-10-28 13:51:54 circus[2687206] [INFO] Loading the plugin...
2022-10-28 13:51:54 circus[2687206] [INFO] Endpoint: 'tcp://127.0.0.1:54021'
2022-10-28 13:51:54 circus[2687206] [INFO] Pub/sub: 'tcp://127.0.0.1:58045'
2022-10-28T13:51:54+0900 [INFO] [observer] Watching directories: ['/home/khkim/aip/AIP', '/home/khkim/bentoml/models']
^C
