In [1]:
from kubeflow import fairing

# Registry prefix (host:port/project) handed to the Fairing builder as
# `registry=`; the job image built for the remote run is named under it.
DOCKER_REGISTRY = 'www.dolearn.io:30003/kade-kubeflow'

In [2]:
def train_with_fairing(epochs=3, batch_size=32, learning_rate=1e-3):
    """Train and evaluate a small MLP classifier on FashionMNIST.

    The function downloads FashionMNIST (if not already present), trains a
    three-layer fully connected network with SGD, evaluates it on the test
    split after every epoch and writes per-epoch scalars ('Train/Loss',
    'Test/Loss', 'Test/Accuracy') to a TensorBoard log directory.

    The log directory depends on the ``FAIRING_RUNTIME`` environment variable:
    when it is unset the logs go under ``/home/jovyan/log/fit/``, otherwise
    under ``/home/jovyan/job/log/fit/``; a timestamped sub-folder is created
    in either case.

    Args:
        epochs: Number of passes over the training set (default 3).
        batch_size: Mini-batch size for both data loaders (default 32).
        learning_rate: SGD learning rate (default 1e-3).

    Returns:
        None. Progress is printed and metrics are written to TensorBoard.
    """
    # All imports are kept inside the function, as in the original cell.
    # NOTE(review): presumably this keeps the function self-contained when
    # Fairing ships it to the remote job -- confirm before hoisting them.
    import datetime
    import os

    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    from torch.utils.tensorboard import SummaryWriter
    from torchvision import datasets, transforms

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    # Single definition of the dataset location shared by both splits.
    data_root = "/home/jovyan/mlops-kubeflow/data/FashionMNIST"

    train_data = datasets.FashionMNIST(
        root=data_root,
        train=True,
        download=True,
        transform=transforms.ToTensor(),
    )
    test_data = datasets.FashionMNIST(
        root=data_root,
        train=False,
        download=True,
        transform=transforms.ToTensor(),
    )

    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    # Peek at one batch to report the tensor shapes.
    x_train, y_train = next(iter(train_dataloader))
    print("Shape of X [N, C, H, W]: ", x_train.shape)
    print("Shape of y: ", y_train.shape, y_train.dtype)

    class NeuralNetwork(nn.Module):
        """Fully connected 784 -> 512 -> 512 -> 10 classifier."""

        def __init__(self):
            super(NeuralNetwork, self).__init__()

            self.flatten = nn.Flatten()
            # No activation after the last Linear layer: a ReLU there would
            # clamp the class scores to be non-negative and block gradients
            # for every class whose score is negative.
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(28 * 28, 512),
                nn.ReLU(),
                nn.Linear(512, 512),
                nn.ReLU(),
                nn.Linear(512, 10),
            )

        def forward(self, x):
            """Return raw (unnormalised) class logits for a batch of images."""
            x = self.flatten(x)
            # CrossEntropyLoss applies log-softmax itself, so the raw logits
            # are returned here instead of normalising them a second time.
            return self.linear_relu_stack(x)

    model = NeuralNetwork().to(device)
    print(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    def train(dataloader, model, loss_fn, optimizer, epoch):
        """Run one training epoch and log its mean loss at step ``epoch``."""
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        # test() switches the model to eval mode; switch back every epoch.
        model.train()
        running_loss = 0.0
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Forward pass and loss computation.
            pred = model(X)
            loss = loss_fn(pred, y)

            # Back-propagation and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss

            if batch % 500 == 0:
                current = batch * len(X)
                print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

        # Write exactly one Train/Loss point per epoch (the epoch mean) so the
        # TensorBoard curve does not get several values at the same step.
        writer.add_scalar('Train/Loss', running_loss / max(num_batches, 1), epoch)

    def test(dataloader, model, loss_fn, epoch):
        """Evaluate on ``dataloader`` and log loss/accuracy at step ``epoch``."""
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        model.eval()
        test_loss, correct = 0, 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        test_loss /= num_batches
        correct /= size
        test_accuracy = 100. * correct
        # Record the evaluation metrics in TensorBoard.
        writer.add_scalar('Test/Loss', test_loss, epoch)
        writer.add_scalar('Test/Accuracy', test_accuracy, epoch)
        writer.flush()
        print(f"Test Result: \n Accuracy: {test_accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n")

    date_folder = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # Pick the log location depending on whether FAIRING_RUNTIME is set.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        log_dir = "/home/jovyan/log/fit/" + date_folder
    else:
        log_dir = "/home/jovyan/job/log/fit/" + date_folder

    print(f"tensorboard log dir : {log_dir}")

    # train()/test() reach `writer` through the enclosing scope; it is created
    # here, before either of them is called.
    writer = SummaryWriter(log_dir)
    try:
        for t in range(epochs):
            print(f"Epoch {t+1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer, t + 1)
            test(test_dataloader, model, loss_fn, t + 1)
    finally:
        # Flush pending events and release the event file even on failure.
        writer.close()

    print("Done!")

In [3]:
# Build the job image with Fairing's 'append' builder on top of the given
# base image, and name it <DOCKER_REGISTRY>/fairing-job.
fairing.config.set_builder(
    'append',
    registry=DOCKER_REGISTRY,
    image_name='fairing-job',
    base_image='www.dolearn.io:30003/base/fairing-base:0.0.2',
)

# Deploy the image as a 'job'.
# NOTE(review): cleanup=False presumably leaves the finished job in the
# cluster for inspection -- confirm against the Fairing deployer docs.
fairing.config.set_deployer('job', cleanup=False)

In [4]:
# Entry point of the notebook: wrap the training function with the Fairing
# configuration set up earlier and launch it.
if __name__ == '__main__':
    print('remote train_with_fairing')
    # fairing.config.fn() returns a callable wrapper around train_with_fairing;
    # the build/push/pod logs captured in this cell's output are emitted when
    # that wrapper is invoked on the next line.
    remote_train = fairing.config.fn(train_with_fairing)
    remote_train()

[I 220123 12:38:06 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x7f951c9a3ac8>
[I 220123 12:38:06 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f95607899b0>
[I 220123 12:38:06 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7f9539f496d8>
[W 220123 12:38:06 append:52] Building image using Append builder...
[I 220123 12:38:06 base:112] Creating docker context: /tmp/fairing_context_u2gxe6sg
[W 220123 12:38:06 base:99] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 220123 12:38:06 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/base/fairing-base:0.0.2'


remote train_with_fairing
Image name :  www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D


[W 220123 12:38:07 append:56] Image successfully built in 0.7979426719248295s.
[W 220123 12:38:07 append:98] Pushing image www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D...
[I 220123 12:38:07 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D'
[W 220123 12:38:07 append:85] Uploading www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D


Image name :  www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D


[I 220123 12:38:07 docker_session_:280] Layer sha256:641afa4edc436e3fd3efd40433f1ad0c55b48af949680cd2359de51e3c439699 exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:a4dd3c805ec24b016ac8a3869add24541829736c312b65bd49d3b2af7501f897 exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:4bf23ae646f0b9d8e07bf427c69c82f208bb57a8b297507d9b8b6fa23b725711 exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:3caed8c8884bf3a0cd5255f42fec14c219153bcdf294c81cb2e0599298c8a8df exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:29d136a889d232058c476b5637c18cbfca74c586634cbee07fe71fa540c7b211 exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:063a4ff324e290814ea5bf23d5f8de5cca1a734782c4a187132ab3364b44a985 exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:f5098a9bf4490bccac9085b1bf9c54baf3015333c40fb6685889a9785b7388ee exists, skipping
[I 220123 12:38:08 docker_session_:280] Layer sha256:e8

Building image www.dolearn.io:30003/kade-kubeflow/fairing-job:D1EDD38D done.


[W 220123 12:38:10 manager:298] Waiting for fairing-job-ffdq8-g2w58 to start...
[I 220123 12:38:12 manager:304] Pod started running True


0.8%
1.6%
2.4%
3.2%
4.0%
4.8%
5.6%
6.3%
7.1%
7.9%
8.7%
9.5%
10.3%
10.9%
11.6%
12.2%
12.9%
13.6%
14.2%
14.9%
15.5%
16.2%
16.9%
17.5%
18.2%
18.9%
19.5%
20.2%
20.8%
21.5%
22.2%
22.8%
23.5%
24.1%
24.8%
25.5%
26.1%
26.8%
27.5%
28.1%
28.8%
29.4%
30.1%
30.8%
31.4%
32.1%
32.7%
33.4%
34.1%
34.7%
35.4%
36.1%
36.7%
37.4%
38.0%
38.7%
39.4%
40.0%
40.7%
41.3%
42.0%
42.7%
43.3%
44.0%
44.7%
45.3%
46.0%
46.6%
47.3%
48.0%
48.6%
49.3%
49.9%
50.6%
51.3%
51.9%
52.6%
53.3%
53.9%
54.6%
55.2%
55.9%
56.6%
57.2%
57.9%
58.5%
59.2%
59.9%
60.5%
61.2%
61.8%
62.5%
63.2%
63.8%
64.5%
65.2%
65.8%
66.5%
67.1%
67.8%
68.5%
69.1%
69.8%
70.4%
71.1%
71.8%
72.4%
73.1%
73.8%
74.4%
75.1%
75.7%
76.4%
77.1%
77.7%
78.4%
79.0%
79.7%
80.4%
81.0%
81.7%
82.4%
83.0%
83.7%
84.3%
85.0%
85.7%
86.3%
87.0%
87.6%
88.3%
89.0%
89.6%
90.3%
91.0%
91.6%
92.3%
92.9%
93.6%
94.3%
94.9%
95.6%
96.2%
96.9%
97.6%
98.2%
98.9%
99.5%
100.0%
100.6%
4.7%
9.5%
13.5%
17.5%
21.4%
25.4%
29.3%
33.3%
37.2%
41.2%
45.1%
49.1%
53.0%
57.0%
60.9%
64.9%
68.8%
72.8%
76.7