In [1]:
from kubeflow import fairing

# Registry/project prefix passed as `registry=` to the Fairing builder below;
# the built job image is pushed under this prefix (see the "Image name" log output).
DOCKER_REGISTRY = 'www.dolearn.io:30003/kade-kubeflow'

In [2]:
def train_with_fairing():
    """Train and evaluate a small MLP classifier on FashionMNIST with TensorBoard logging.

    The function takes no arguments and returns nothing. It:

    1. Downloads FashionMNIST (train and test splits) into a fixed data directory.
    2. Builds a three-layer fully connected network that outputs raw class logits.
    3. Trains with SGD and ``nn.CrossEntropyLoss`` for ``EPOCHS`` epochs.
    4. Writes ``Train/Loss`` (mean over the epoch), ``Test/Loss`` and
       ``Test/Accuracy`` to TensorBoard once per epoch, using the 1-based epoch
       number as the step.

    The TensorBoard log directory depends on whether the ``FAIRING_RUNTIME``
    environment variable is set (it selects between two fixed base paths).

    All imports live inside the function body. The function is handed to
    ``fairing.config.fn`` elsewhere in this notebook and executed in a remote
    job, so it is presumably required to be self-contained -- keep it that way.
    """
    import datetime
    import os

    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    from torch.utils.tensorboard import SummaryWriter
    from torchvision import datasets, transforms

    # Tunables gathered in one place instead of being scattered as literals.
    DATA_ROOT = "/home/jovyan/mlops-kubeflow/data/FashionMNIST"
    BATCH_SIZE = 32
    LEARNING_RATE = 1e-3
    EPOCHS = 1
    PRINT_EVERY_N_BATCHES = 500

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    train_data = datasets.FashionMNIST(
        root=DATA_ROOT,
        train=True,
        download=True,
        transform=transforms.ToTensor(),
    )
    test_data = datasets.FashionMNIST(
        root=DATA_ROOT,
        train=False,
        download=True,
        transform=transforms.ToTensor(),
    )

    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)

    # Peek at a single batch to report tensor shapes, then stop iterating.
    for (x_train, y_train) in train_dataloader:
        print("Shape of X [N, C, H, W]: ", x_train.shape)
        print("Shape of y: ", y_train.shape, y_train.dtype)
        break

    class NeuralNetwork(nn.Module):
        """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 logits."""

        def __init__(self):
            super().__init__()

            self.flatten = nn.Flatten()
            # No activation after the last Linear layer: a trailing ReLU would
            # clamp every class score to be non-negative.
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(28 * 28, 512),
                nn.ReLU(),
                nn.Linear(512, 512),
                nn.ReLU(),
                nn.Linear(512, 10),
            )

        def forward(self, x):
            """Return raw (unnormalized) class logits for a batch of images."""
            # nn.CrossEntropyLoss applies log-softmax itself, so the model
            # returns logits directly instead of log-probabilities.
            return self.linear_relu_stack(self.flatten(x))

    model = NeuralNetwork().to(device)
    print(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    def train(dataloader, model, loss_fn, optimizer, writer, epoch):
        """Run one training epoch and log the epoch's mean loss to TensorBoard.

        Args:
            dataloader: Iterable of ``(X, y)`` training batches.
            model: Network to optimize.
            loss_fn: Callable computing the loss from ``(pred, y)``.
            optimizer: Optimizer that updates ``model``'s parameters.
            writer: ``SummaryWriter`` receiving the ``Train/Loss`` scalar.
            epoch: 1-based epoch number, used as the TensorBoard step.
        """
        size = len(dataloader.dataset)
        # test() puts the model in eval mode; switch back before every epoch.
        model.train()

        loss_sum, batches_seen = 0.0, 0
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute the prediction error.
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            loss_sum += batch_loss
            batches_seen += 1

            if batch % PRINT_EVERY_N_BATCHES == 0:
                current = batch * len(X)
                print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

        # Log one Train/Loss point per epoch (the epoch mean) so that several
        # values are not written to the same TensorBoard step.
        if batches_seen:
            writer.add_scalar('Train/Loss', loss_sum / batches_seen, epoch)

    def test(dataloader, model, loss_fn, writer, epoch):
        """Evaluate the model and log average loss and accuracy to TensorBoard.

        Args:
            dataloader: Iterable of ``(X, y)`` evaluation batches.
            model: Network to evaluate.
            loss_fn: Callable computing the loss from ``(pred, y)``.
            writer: ``SummaryWriter`` receiving ``Test/Loss`` and ``Test/Accuracy``.
            epoch: 1-based epoch number, used as the TensorBoard step.
        """
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        model.eval()
        test_loss, correct = 0, 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        test_loss /= num_batches
        correct /= size
        test_accuracy = 100. * correct
        # Record the test metrics in TensorBoard.
        writer.add_scalar('Test/Loss', test_loss, epoch)
        writer.add_scalar('Test/Accuracy', test_accuracy, epoch)
        writer.flush()
        print(f"Test Result: \n Accuracy: {test_accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n")

    date_folder = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # Choose the log location based on whether FAIRING_RUNTIME is set.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        log_dir = "/home/jovyan/log/fit/" + date_folder
    else:
        log_dir = "/home/jovyan/job/log/fit/" + date_folder

    print(f"tensorboard log dir : {log_dir}")

    writer = SummaryWriter(log_dir)
    try:
        for t in range(EPOCHS):
            print(f"Epoch {t+1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer, writer, t + 1)
            test(test_dataloader, model, loss_fn, writer, t + 1)
    finally:
        # Flush pending events and release the event file even if training fails.
        writer.close()

    print("Done!")

In [3]:
# Builder: use Fairing's 'append' builder with the given base image; the
# resulting image is named 'fairing-job' and pushed under DOCKER_REGISTRY.
fairing.config.set_builder(
    'append',
    base_image='www.dolearn.io:30003/base/fairing-base:0.0.2',
    registry=DOCKER_REGISTRY,
    image_name='fairing-job',
)

# Deployer: run the built image as a 'job'.
# NOTE(review): cleanup=False presumably leaves the job's resources in place
# after it finishes -- confirm against the Fairing deployer documentation.
fairing.config.set_deployer('job', cleanup=False)



In [None]:
# Launch cell: wrap the training function with Fairing and invoke it.
# The captured output below shows this building and pushing an image and then
# waiting for a pod to start.
if __name__ == '__main__':
    print('remote train_with_fairing')
    # fairing.config.fn returns a callable built from train_with_fairing using
    # the builder/deployer configured in the previous cell.
    remote_train = fairing.config.fn(train_with_fairing)
    # Calling it triggers the image build and job submission seen in the logs.
    remote_train()

[I 211119 06:09:06 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.function.FunctionPreProcessor object at 0x7f9ea9f30320>
[I 211119 06:09:06 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f9ec080b438>
[I 211119 06:09:06 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7f9ecf62d668>
[W 211119 06:09:06 append:52] Building image using Append builder...
[I 211119 06:09:06 base:112] Creating docker context: /tmp/fairing_context_wj0tq0ag
[W 211119 06:09:06 base:99] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 211119 06:09:06 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/base/fairing-base:0.0.2'


remote train_with_fairing
Image name :  www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB


[W 211119 06:09:06 append:56] Image successfully built in 0.4941397490001691s.
[W 211119 06:09:06 append:98] Pushing image www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB...
[I 211119 06:09:06 docker_creds_:234] Loading Docker credentials for repository 'www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB'
[W 211119 06:09:06 append:85] Uploading www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB
[I 211119 06:09:06 docker_session_:280] Layer sha256:29d136a889d232058c476b5637c18cbfca74c586634cbee07fe71fa540c7b211 exists, skipping


Image name :  www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB


[I 211119 06:09:07 docker_session_:280] Layer sha256:208b1b1d503e89fb2452c622d99f8a69b643819c098688dd89a4bce51d843f7d exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:8bcf82863cb9582a24dc32cd3ddf560ff2f84df88694be072758159b94b70bd3 exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:0269b6883f78a00bb29875d37fe3d838dbbe61cadf0108145fff2be316364f74 exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:f571d568b0961b0954a50f361ad842acab3b6e4b21a27430e172a1f0d5aca5db exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:7ada0795a7988a0d48120cfe85bc57dba3bdd225474db83b4e5565b4af8dd0a9 exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:7a12503ba844465b2c5aea7ebf60dd5057c7fcece51ea15e5f7f02ed1ae08d12 exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:3caed8c8884bf3a0cd5255f42fec14c219153bcdf294c81cb2e0599298c8a8df exists, skipping
[I 211119 06:09:07 docker_session_:280] Layer sha256:02

Building image www.dolearn.io:30003/kade-kubeflow/fairing-job:DC63DEBB done.


[W 211119 06:09:09 manager:298] Waiting for fairing-job-ld28p-xvn9p to start...
[I 211119 06:09:11 manager:304] Pod started running True


0.8%
1.6%
2.4%
3.2%
4.0%
4.8%
5.6%
6.3%
7.1%
7.9%
8.7%
9.5%
10.3%
10.9%
11.6%
12.2%
12.9%
13.6%
14.2%
14.9%
15.5%
16.2%
16.9%
17.5%
18.2%
18.9%
19.5%
20.2%
20.8%
21.5%
22.2%
22.8%
23.5%
24.1%
24.8%
25.5%
26.1%
26.8%
27.5%
28.1%
28.8%
29.4%
30.1%
30.8%
31.4%
32.1%
32.7%
33.4%
34.1%
34.7%
35.4%
36.1%
36.7%
37.4%
38.0%
38.7%
39.4%
40.0%
40.7%
41.3%
42.0%
42.7%
43.3%
44.0%
