In [1]:
%%writefile my_model.py

import numpy as np
# import matplotlib.pyplot as plt
import datetime, os

import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from torch import nn
import torch.nn.functional as F

class MyModel(object):
    def run(self):    
        device = "cuda" if torch.cuda.is_available() else "cpu"
#         print("Using {} device".format(device))

        train_data = datasets.FashionMNIST(
            root="/home/jovyan/mlops-kubeflow/data/FashionMNIST",
            train=True,
            download=True,
            transform=transforms.ToTensor(),
        )

        test_data = datasets.FashionMNIST(
            root="/home/jovyan/mlops-kubeflow/data/FashionMNIST",
            train=False,
            download=True,
            transform=transforms.ToTensor(),
        )

        BATCH_SIZE = 32
        train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)

#         for (x_train, y_train) in train_dataloader:
#             print("Shape of X [N, C, H, W]: ", x_train.shape)
#             print("Shape of y: ", y_train.shape, y_train.dtype)
#             break

        #     plt.figure(figsize=(10, 1))
        #     for i in range(10):
        #         plt.subplot(1, 10, i + 1)
        #         plt.imshow(x_train[i, :, :, :].numpy().reshape(28, 28), cmap = "gray_r")
        #         plt.title("class: " + str(y_train[i].item()))
        #         plt.axis("off")

        class NeuralNetwork(nn.Module):

            def __init__(self):
                super(NeuralNetwork, self).__init__()

                self.flatten = nn.Flatten()
                self.linear_relu_stack = nn.Sequential(
                    nn.Linear(28*28, 512),
                    nn.ReLU(),
                    nn.Linear(512, 512),
                    nn.ReLU(),
                    nn.Linear(512, 10),
                    nn.ReLU()
                )

            def forward(self, x):
                x = self.flatten(x)
                logits = self.linear_relu_stack(x)
                output = F.log_softmax(logits, dim=1)
                return output

        model = NeuralNetwork().to(device)
#         print(model)

        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

        def train(dataloader, model, loss_fn, optimizer):
            size = len(dataloader.dataset)
            for batch, (X, y) in enumerate(dataloader):
                X, y = X.to(device), y.to(device)

                # 예측 오류 계산
                pred = model(X)
                loss = loss_fn(pred, y)

                # 역전파
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if batch % 500 == 0:
                    loss, current = loss.item(), batch * len(X)
                    # 텐서보드에 Train Loss / per epoch 로그 기록 
                    writer.add_scalar('Train/Loss', loss, t+1)
                    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


        def test(dataloader, model, loss_fn):
            size = len(dataloader.dataset)
            num_batches = len(dataloader)
            model.eval()
            test_loss, correct = 0, 0
            with torch.no_grad():
                for X, y in dataloader:
                    X, y = X.to(device), y.to(device)
                    pred = model(X)
                    test_loss += loss_fn(pred, y).item()
                    correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            test_loss /= num_batches
            correct /= size
            test_accuracy = 100. * correct 
            # 텐서보드에 Test 로그 기록
            writer.add_scalar('Test/Loss', test_loss, t+1)
            writer.add_scalar('Test/Accuracy', test_accuracy, t+1)
            writer.flush()
            print(f"Test Result: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

        date_folder = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # 분기설정 
        if os.getenv('FAIRING_RUNTIME', None) is None:
            log_dir = "/home/jovyan/log/fit/" + date_folder
        else:
            log_dir = "/home/jovyan/job/log/fit/" + date_folder  

        print(f"tensorboard log dir : {log_dir}")

        writer = SummaryWriter(log_dir)
        epochs = 1

        for t in range(epochs):
            print(f"Epoch {t+1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer)
            test(test_dataloader, model, loss_fn)


        print("Done!")

Writing my_model.py


In [2]:
import os
from my_model import MyModel
from kubeflow import fairing
from kubeflow.fairing.kubernetes.utils import mounting_pvc

DOCKER_REGISTRY = 'www.dolearn.io:30003/kade-kubeflow'

In [3]:
def train_with_package():
    my_model = MyModel()
    my_model.run()   

In [4]:
# output_map 에 key[현재경로의 파일이름]:value[컨테이너 안의 파일경로] 형태로 넣어줍니다.
output_map =  {
    "my_model.py": "/app/my_model.py"
}            

# preprocessor에서 ouput_map을 넣음으로써 fairing 패키지 안에 model_FashionMNIST.py가 들어가게 됩니다.
fairing.config.set_preprocessor("function", 
                                function_obj=train_with_package,
                                output_map=output_map)        


In [5]:
fairing.config.set_builder(
    'append',
    image_name='fashionmnist-packagedjob', 
    base_image='www.dolearn.io:30003/base/fairing-base:0.0.2',
    registry=DOCKER_REGISTRY, 
    push=True)

In [6]:
# fairing mounting pvc 추가
notebook_volume = mounting_pvc(pvc_name="workspace-kade", 
                                pvc_mount_path="/home/jovyan") #마운트 경로 /notebook 


fairing.config.set_deployer('job',
                            pod_spec_mutators=[notebook_volume],
                            cleanup=False) # 잡을 실행후 완료시 잡을 삭제할지의 여부를 결정


[W 211119 04:31:00 utils:51] The function mounting_pvc has been deprecated,                     please use `volume_mounts`


In [7]:
if __name__ == '__main__':
    fairing.config.run()

[I 211119 04:31:00 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x7fa3ec375208>
[I 211119 04:31:00 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7fa3fdac45c0>
[I 211119 04:31:00 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7fa3f8bd55c0>
[W 211119 04:31:00 append:52] Building image using Append builder...
[I 211119 04:31:00 base:112] Creating docker context: /tmp/fairing_context_1fi7hi2z
[W 211119 04:31:00 base:99] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 211119 04:31:00 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/base/fairing-base:0.0.2'


Image name :  www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19


[W 211119 04:31:01 append:56] Image successfully built in 0.5640066569994815s.
[W 211119 04:31:01 append:98] Pushing image www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19...
[I 211119 04:31:01 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19'
[W 211119 04:31:01 append:85] Uploading www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19
[I 211119 04:31:01 docker_session_:280] Layer sha256:7ada0795a7988a0d48120cfe85bc57dba3bdd225474db83b4e5565b4af8dd0a9 exists, skipping


Image name :  www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19


[I 211119 04:31:01 docker_session_:280] Layer sha256:e1b8f4d5dcdfb4ac873d37d3a643cba6a55f2b325cfe0115aaba32946e896e0a exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:f571d568b0961b0954a50f361ad842acab3b6e4b21a27430e172a1f0d5aca5db exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:063a4ff324e290814ea5bf23d5f8de5cca1a734782c4a187132ab3364b44a985 exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:7a12503ba844465b2c5aea7ebf60dd5057c7fcece51ea15e5f7f02ed1ae08d12 exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:3cf8fb62ba5ffb221a2edb2208741346eb4d2d99a174138e4afbb69ce1fd9966 exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:3caed8c8884bf3a0cd5255f42fec14c219153bcdf294c81cb2e0599298c8a8df exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:29d136a889d232058c476b5637c18cbfca74c586634cbee07fe71fa540c7b211 exists, skipping
[I 211119 04:31:01 docker_session_:280] Layer sha256:20

Building image www.dolearn.io:30003/kade-kubeflow/fashionmnist-packagedjob:7019CB19 done.


[W 211119 04:31:02 manager:298] Waiting for fairing-job-7qwsz-wgntv to start...
[W 211119 04:31:03 manager:298] Waiting for fairing-job-7qwsz-wgntv to start...
[I 211119 04:31:05 manager:304] Pod started running True


tensorboard log dir : /notebook/log/fit/20211119-043106
Epoch 1
-------------------------------
loss: 2.305484  [    0/60000]
loss: 2.292389  [16000/60000]
loss: 2.291755  [32000/60000]
loss: 2.251377  [48000/60000]
Test Result:
 Accuracy: 23.6%, Avg loss: 2.248289

Done!
  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
