In [1]:
%%writefile my_model.py
import numpy as np
import matplotlib.pyplot as plt
import datetime, os

import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from torch import nn
import torch.nn.functional as F

class MyModel(object):
    def run(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using {} device".format(device))

        train_data = datasets.FashionMNIST(
        root="/home/jovyan/mlops-kubeflow/data/FashionMNIST",
        train=True,
        download=True,
        transform=transforms.ToTensor(),
        )

        test_data = datasets.FashionMNIST(
            root="/home/jovyan/mlops-kubeflow/data/FashionMNIST",
            train=False,
            download=True,
            transform=transforms.ToTensor(),
        )

        BATCH_SIZE = 32
        train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)

        for (x_train, y_train) in train_dataloader:
            print("Shape of X [N, C, H, W]: ", x_train.shape)
            print("Shape of y: ", y_train.shape, y_train.dtype)
            break

        class NeuralNetwork(nn.Module):

            def __init__(self):
                super(NeuralNetwork, self).__init__()

                self.flatten = nn.Flatten()
                self.linear_relu_stack = nn.Sequential(
                    nn.Linear(28*28, 512),
                    nn.ReLU(),
                    nn.Linear(512, 512),
                    nn.ReLU(),
                    nn.Linear(512, 10),
                    nn.ReLU()
                )

            def forward(self, x):
                x = self.flatten(x)
                logits = self.linear_relu_stack(x)
                output = F.log_softmax(logits, dim=1)
                return output

        model = NeuralNetwork().to(device)
        print(model)

        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

        def train(dataloader, model, loss_fn, optimizer):
            size = len(dataloader.dataset)
            for batch, (X, y) in enumerate(dataloader):
                X, y = X.to(device), y.to(device)

                # 예측 오류 계산
                pred = model(X)
                loss = loss_fn(pred, y)

                # 역전파
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if batch % 500 == 0:
                    loss, current = loss.item(), batch * len(X)
                    # 텐서보드에 Train Loss / per epoch 로그 기록 
                    writer.add_scalar('Train/Loss', loss, t+1)
                    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


        def test(dataloader, model, loss_fn):
            size = len(dataloader.dataset)
            num_batches = len(dataloader)
            model.eval()
            test_loss, correct = 0, 0
            with torch.no_grad():
                for X, y in dataloader:
                    X, y = X.to(device), y.to(device)
                    pred = model(X)
                    test_loss += loss_fn(pred, y).item()
                    correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            test_loss /= num_batches
            correct /= size
            test_accuracy = 100. * correct 
            # 텐서보드에 Test 로그 기록
            writer.add_scalar('Test/Loss', test_loss, t+1)
            writer.add_scalar('Test/Accuracy', test_accuracy, t+1)
            writer.flush()
            print(f"Test Result: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

        date_folder = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # 분기설정 
        if os.getenv('FAIRING_RUNTIME', None) is None:
            log_dir = "/home/jovyan/log/fit/" + date_folder
        else:
            log_dir = "/home/jovyan/job/log/fit/" + date_folder  

        print(f"tensorboard log dir : {log_dir}")

        writer = SummaryWriter(log_dir)
        epochs = 1

        for t in range(epochs):
            print(f"Epoch {t+1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer)
            test(test_dataloader, model, loss_fn)


        print("Done!")

Overwriting my_model.py


In [2]:
import os
from my_model import MyModel
from kubeflow import fairing
from kubeflow.fairing.kubernetes.utils import mounting_pvc

DOCKER_REGISTRY = 'www.dolearn.io:30003/kade-kubeflow'

In [3]:
def train_with_package():
    my_model = MyModel()
    my_model.run()

In [4]:
# output_map 에 key[현재경로의 파일이름]:value[컨테이너 안의 파일경로] 형태로 넣어줍니다.
output_map =  {
    "my_model.py": "/app/my_model.py"
}            

# preprocessor에서 ouput_map을 넣음으로써 fairing 패키지 안에 model_FashionMNIST.py가 들어가게 됩니다.
fairing.config.set_preprocessor("function", 
                                function_obj=train_with_package,
                                output_map=output_map)

In [5]:
fairing.config.set_builder(
    'append',
    image_name='fashionmnist-packagedjob', 
    base_image='www.dolearn.io:30003/base/fairing-base:0.0.2',
    registry=DOCKER_REGISTRY, 
    push=True)

In [6]:
# fairing mounting pvc 추가
notebook_volume = mounting_pvc(pvc_name="workspace-kade", 
                                pvc_mount_path="/home/jovyan") #마운트 경로 


fairing.config.set_deployer('job',
                            pod_spec_mutators=[notebook_volume],
                            cleanup=False) # 잡을 실행후 완료시 잡을 삭제할지의 여부를 결정

[W 211119 06:55:28 utils:51] The function mounting_pvc has been deprecated,                     please use `volume_mounts`


In [7]:
if __name__ == '__main__':
    fairing.config.run()

[I 211119 06:55:28 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x7f3abda86dd8>
[I 211119 06:55:28 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f3ad1b2f0f0>
[I 211119 06:55:28 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7f3ac7cb7a20>
[W 211119 06:55:28 append:52] Building image using Append builder...
[I 211119 06:55:28 base:112] Creating docker context: /tmp/fairing_context_p8j82tm1
[W 211119 06:55:28 base:99] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 211119 06:55:28 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/base/fairing-base:0.0.2'


Image name :  www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727


[W 211119 06:55:28 append:56] Image successfully built in 0.4822619769984158s.
[W 211119 06:55:28 append:98] Pushing image www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727...
[I 211119 06:55:28 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727'
[W 211119 06:55:28 append:85] Uploading www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727


Image name :  www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727


[I 211119 06:55:28 docker_session_:280] Layer sha256:3cf8fb62ba5ffb221a2edb2208741346eb4d2d99a174138e4afbb69ce1fd9966 exists, skipping
[I 211119 06:55:28 docker_session_:280] Layer sha256:29d136a889d232058c476b5637c18cbfca74c586634cbee07fe71fa540c7b211 exists, skipping
[I 211119 06:55:28 docker_session_:280] Layer sha256:0269b6883f78a00bb29875d37fe3d838dbbe61cadf0108145fff2be316364f74 exists, skipping
[I 211119 06:55:28 docker_session_:280] Layer sha256:3caed8c8884bf3a0cd5255f42fec14c219153bcdf294c81cb2e0599298c8a8df exists, skipping
[I 211119 06:55:28 docker_session_:280] Layer sha256:641afa4edc436e3fd3efd40433f1ad0c55b48af949680cd2359de51e3c439699 exists, skipping
[I 211119 06:55:29 docker_session_:280] Layer sha256:02842a89d653002ea6c32f5573a9cec312ace226dae5eea21bc68782f4e2f627 exists, skipping
[I 211119 06:55:29 docker_session_:280] Layer sha256:f5098a9bf4490bccac9085b1bf9c54baf3015333c40fb6685889a9785b7388ee exists, skipping
[I 211119 06:55:29 docker_session_:280] Layer sha256:40

Building image www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:5BE42727 done.


[W 211119 06:55:31 manager:298] Waiting for fairing-job-bn2qq-bljnb to start...
[I 211119 06:55:32 manager:304] Pod started running True


Using cpu device
Shape of X [N, C, H, W]:  torch.Size([32, 1, 28, 28])
Shape of y:  torch.Size([32]) torch.int64
NeuralNetwork(
  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)
tensorboard log dir : /home/jovyan/job/log/fit/20211119-065533
Epoch 1
-------------------------------
loss: 2.308242  [    0/60000]
loss: 2.266033  [16000/60000]
loss: 2.172069  [32000/60000]
loss: 2.249936  [48000/60000]
Test Result:
 Accuracy: 51.2%, Avg loss: 2.074145

Done!
