In [None]:
# 구글 드라이브 연동

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 모듈들 import

import os
from glob import glob
import copy

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchvision import transforms, utils
from torchsummary import summary

In [None]:
# GPU

device = None

if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

print('Using PyTorch version:', torch.__version__, 'Device:', device)

Using PyTorch version: 1.8.0+cu101 Device: cuda


In [None]:
# 데이터 불러오기

data_path = './drive/MyDrive/northpole/data/'
train_data_path = os.path.join(data_path, "train")

print(train_data_path)

./drive/MyDrive/northpole/data/train


In [None]:
# train data 파일 리스트 정렬

file_list = os.listdir(train_data_path)
file_list.sort()

In [None]:
## Custom Dataset which return x_frames, y_frames

torch.set_printoptions(threshold=10000)
# show all tensor without abbreviation
# 설명은 여기에 https://pytorch.org/docs/stable/generated/torch.set_printoptions.html


# 482장의 데이터파일을 8개씩 묶어 6장을 통해 뒤의 2장을 예측할 수 있도록 분리하여 저장

class SeaIceDataset(Dataset):
    def __init__(self, data_dir, transform, data_type="train", frame_num=6, predict_num=2, stride=1):
        super(SeaIceDataset, self).__init__()

        """
        data_dir                => data folder path
        transform               => data to tensor

        data_type="train"       => choose train / valid / test
        그치. data_dir랑 data_type를 합칠거니까.

        482장의 데이터파일을 8개씩 묶어 6장을 통해 뒤의 2장을 예측할 수 있도록 분리하여 저장

        frame_num               => frame nums to use on train 
        # 이거 6장이랑
        
        predict_num             => frame nums to predict
        # 이거 2장

        stride_num              => stride for frames (if stride=2 => 197811.npy, 198001.npy, 198003.npy ... )
                                만약 8월끼리 비교하고 싶다면 stride = 12 를 넣어준다.  
        """

        # 데이터 경로 지정해주고.
        data_to_path = os.path.join(data_dir, data_type)
        
        # 파일 이름 불러오고
        filenames = os.listdir(data_to_path)
        
        # 파일 각각을 불러오려고
        # sorted(filenames)를 통해 파일이 정렬된 상태에서 파일 각각을 불러옴.
        self.filepaths = [os.path.join(data_to_path, filename) for filename in sorted(filenames)]
        
        # Numpy 배열을 Tensor 배열로 바꿔주는 함수
        self.transform = transform

        self.frame_num = frame_num 
        self.predict_num = predict_num
        self.stride = stride

    def __len__(self):
        # len = dataset으로 시작가능한 인덱스 번호 
        return len(self.filepaths) - (self.frame_num + self.predict_num - 1) * self.stride
    
    def __getitem__(self, idx):
        """
        it will return (x_with_frame_num, y_true_with_predict_num)
        if frame_num = 6, predict_num = 2
        ((6, 1, 448, 304), (2, 1, 448, 304))
        """
        dataset = []

        # idx부터 idx + 6 + 2로, stride만큼 진행해준다. stride를 1로 지정함.
        for id in range(idx, idx + self.frame_num + self.predict_num, self.stride):

            cur_npy = np.load(self.filepaths[id])[:,:,0]/250    # 250을 나눠주어 저장하지 않으면 toTensor했을때 오차값이 크게 생겼습니다
            cur_tensor = self.transform(cur_npy)                # tensor로 저장
            dataset.append(cur_tensor)

        # self.frame_num = 6이므로, :6 6개
        x = torch.stack(dataset[:self.frame_num])
        x = x.transpose(0,1).to(dtype=torch.float)              # [1, 6, 448, 304] => [channel, frames, height, width]
        # transpose는 2개의 차원을 변경하는데 사용
        # https://pytorch.org/docs/stable/generated/torch.transpose.html

        # 원래 (6, 1, 448, 304)이었는데, 이걸 보기 쉽게 [1, 6, 448, 304]로 변경.

        
        # 6부터 8까지, 2개 6:
        y = torch.stack(dataset[self.frame_num:])               
        y = y.transpose(0,1)                                    # [1, 2, 448, 304] => [channel, frames, height, width]
        
        # 원래 (2, 1, 448, 304)이었는데, 이걸 보기 쉽게 [1, 2, 448, 304]로 변경.

        return x, y

# Tensor 형식으로 transform
def getTransform():
    return transforms.Compose([transforms.ToTensor()])



transform = getTransform()

ice_dataset = SeaIceDataset(data_path, transform, "train", 6, 2, 1)

a,b = ice_dataset[1]        # sample to see 
print(len(ice_dataset))     # 데이터셋에 있는 총 데이터의 개수는 8개씩 묶여있는 475개의 데이터가 있습니다
print(a.shape, b.shape)

475
torch.Size([1, 6, 448, 304]) torch.Size([1, 2, 448, 304])


In [None]:
a,b = ice_dataset[5]
print(len(ice_dataset))
print(a.shape, b.shape)

475
torch.Size([1, 6, 448, 304]) torch.Size([1, 2, 448, 304])


In [None]:
# Data를 Train과 Valid로 나눈다.
# Train / Valid / Test로 나누는 이유는 overfitting을 방지하기 위함도 있음.

len_ice_dataset = len(ice_dataset)

len_ice_train = int(0.8 * len_ice_dataset)
len_ice_valid = len_ice_dataset - len_ice_train

train_dataset, valid_dataset = random_split(ice_dataset, [len_ice_train, len_ice_valid])
# random_split은 말그대로 랜덤하게 데이터를 나누어주는것.
# https://pytorch.org/docs/stable/data.html

print(f"train dataset length : {len(train_dataset)}")
print(f"valid dataset length : {len(valid_dataset)}")

train dataset length : 380
valid dataset length : 95


In [None]:
BATCH_SIZE = 12

# Dataloader 클래스는 데이터셋에서 배치 개수만큼 뽑아서 제공해줌.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# train_dataloader length => 32
# 32 * 12 = 384이므로. train data는 380개잖아.

for x, y in train_dataloader:
  print(x.shape, y.shape)
  break

torch.Size([12, 1, 6, 448, 304]) torch.Size([12, 1, 2, 448, 304])


In [None]:
MODEL_PARAMS = {
    "shape": (6, 1, 448, 304),
    "init_filters":8,
    "dropout_rate":0.5
}

In [None]:
# Creating Model

class CustomNet(nn.Module):
  def __init__(self, params):
    super(CustomNet, self).__init__()
    input_frames, input_channel, input_height, input_width = params["shape"]
    init_filters = params["init_filters"]
    self.dropout_rate = params["dropout_rate"]
    self.conv1 = nn.Conv3d(input_channel, init_filters, kernel_size=3, padding=1)
    self.conv2 = nn.Conv3d(init_filters, init_filters*2, kernel_size=3, padding=1)
    self.conv3 = nn.ConvTranspose3d(init_filters*2, 1, kernel_size=3, padding=1)

  def forward(self, x):
    input = x
    x = F.relu(self.conv1(x))
    x = F.max_pool3d(x, 2, 2)
    x = F.relu(self.conv2(x))
    x = F.relu(self.conv3(x))
    x = F.upsample(x, size=(2, 448, 304))
    print("input: ", input.shape)
    print("output: ", x.shape)
    return x

In [None]:
my_model = CustomNet(MODEL_PARAMS).to(device)
print(my_model)

CustomNet(
  (conv1): Conv3d(1, 8, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (conv2): Conv3d(8, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (conv3): ConvTranspose3d(16, 1, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
)


In [None]:
# summary 함수를 통해 임의의 사이즈를 넣어 구조와 파라미터를 확인함

summary(my_model, input_size=(1, 6, 448, 304), device=device.type)

input:  torch.Size([2, 1, 6, 448, 304])
output:  torch.Size([2, 1, 2, 448, 304])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 8, 6, 448, 304]             224
            Conv3d-2      [-1, 16, 3, 224, 152]           3,472
   ConvTranspose3d-3       [-1, 1, 3, 224, 152]             433
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 3.12
Forward/backward pass size (MB): 63.12
Params size (MB): 0.02
Estimated Total Size (MB): 66.26
----------------------------------------------------------------




### 이 부분은 산식코드로 제공이 된 코드임.

In [None]:
# 음 이 부분은 산식코드로 제공이 된 코드네.

# Loss Function && etric Function

# metrics
def mae_score(true, pred):
    true, pred = numpy_to_tensor(true, pred)
    score = np.mean(np.abs(true-pred))
    
    return score

# F1 Score 설명
# https://eunsukimme.github.io/ml/2019/10/21/Accuracy-Recall-Precision-F1-score/

def f1_score(true, pred):
    true, pred = numpy_to_tensor(true, pred)

    target = np.where((1*0.05 < true)&(true < 1*0.5))
    # target = np.where((true>1*0.05)<1*0.5))
    
    true = true[target]
    pred = pred[target]

    # true가 1*0.15보다 작을 경우 0을, 아니면 1을 반환
    true = np.where(true < 1*0.15, 0, 1)

    # pred가 1*0.15보다 작을 경우 0을, 아니면 1을 반환
    pred = np.where(pred < 1*0.15, 0, 1)
    
    # true * pred == 1인 모든 값을 더해줌.
    right = np.sum(true * pred == 1)
    
    # Precision은 모델이 True로 예측한 데이터 중 실제로 True인 데이터의 수
    # precision = TruePositives / (TruePositives + FalsePositives)
    precision = right / np.sum(true+1e-8)

    # recall은 실제로 True인 데이터를 True라고 인식한 데이터 수
    # recall = TruePositives / (TruePositives + FalseNegatives)
    recall = right / np.sum(pred+1e-8)

    # F1 score는 precision 과 recall의 조화평균
    # 2 * (precision * recall / precision + recall)
    score = 2 * precision*recall/(precision+recall+1e-8)
    
    return score

# loss function
def mae_over_f1(true, pred):
    mae = mae_score(true, pred)
    f1 = f1_score(true, pred)
    score = mae/(f1+1e-8)
    
    return score

def numpy_to_tensor(true, pred):
    return true.cpu().detach().numpy(), pred.cpu().detach().numpy()

In [None]:
# Optimizer - Adam

# Adam
opt_adam = optim.Adam(my_model.parameters(), lr=3e-4)

def get_lr(opt):
  for param_group in opt.param_groups:
    return param_group["lr"]
  
current_lr = get_lr(opt_adam)
print(f"current_lr = {current_lr}")

current_lr = 0.0003


In [None]:
# learning rate scheduler
lr_scheduler = ReduceLROnPlateau(opt_adam, mode="min", factor=0.5, patience=20, verbose=1)

# https://deep-deep-deep.tistory.com/56
# 모델의 개선이 없을 경우, Learning Rate를 조절해 모델의 개선을 유도하는 콜백함수

# monitor : ReduceLROnPlateau의 기준이 되는 값

# mode : min / max / auto - monitor 되는 값이 최대가 되어야 하는지, 최소가 되어야 하는지 결정.

# factor : Learning rate를 얼마나 감소시킬지 정하는 인자값
# 새로운 learning rate는 기존 learning rate * factor임.

# patience : Training이 진행됨에도 더이상 monitor되는 값의 개선이 없을 경우, 
# 최적의 monitor 값을 기준으로 몇 번의 epoch을 진행하고, learning rate를 조절할 지의 값
# 예를 들어 patience는 3이고, 30에폭에 정확도가 99%였을 때,
# 만약 31번째에 정확도 98%, 32번째에 98.5%, 33번째에 98%라면 
# 모델의 개선이 (patience=3)동안 개선이 없었기에,  
# ReduceLROnPlateau 콜백함수를 실행함.

# verbose : 0 또는 1
# 1일 경우, EarlyStopping이 적용될 때 화면에 적용되었다고 나타냄.
# 0일 경우, 화면에 나타냄 없이 종료. 


In [None]:
for i in range(100):
  lr_scheduler.step(1)

Epoch    22: reducing learning rate of group 0 to 1.5000e-04.
Epoch    43: reducing learning rate of group 0 to 7.5000e-05.
Epoch    64: reducing learning rate of group 0 to 3.7500e-05.
Epoch    85: reducing learning rate of group 0 to 1.8750e-05.


In [None]:
# Training 

def metrics_batch(pred, true, metrics):
    # if needed add param "metrics" to custom
    """
    output will be pred
    target will be corrects
    """
    if metrics:
        return list(map(lambda x: x(true, pred), metrics))
    mae_score = mae_score(true, pred)
    f1_score = f1_score(true, pred)
    return (mae_score, f1_score)

def loss_batch(loss_func, pred, true, opt=None):
    """
    loss_func => mae_over_f1
    """
    loss = loss_func(true, pred)
    with torch.no_grad():
        metrics = metrics_batch(pred, true, [mae_score, f1_score])
    if opt is not None:
        opt.zero_grad()
        # loss.backward()
        opt.step()  # 학습이 이뤄지는 곳
    return loss, metrics

def loss_epoch(model, loss_func, dataset_dataloader, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = [0.0, 0.0]
    len_data = len(dataset_dataloader.dataset)

    for x, y in dataset_dataloader:
        x = x.to(device)
        y = y.to(device)
        # 모델 결과
        pred = model(x)
        # 손실함수 구하기
        loss, metrics = loss_batch(loss_func, pred, y, opt)
        # 손실함수 
        running_loss += loss
        if metrics is not None:
            for idx, metric_value in enumerate(metrics):
                running_metric[idx] += metric_value
        
        # 문제 있으면 break, 여기서는 True 일때 바로 break
        if sanity_check is True:
            break
    
    loss = running_loss / float(len_data)
    metrics = list(map(lambda x: x/float(len_data), metrics))
    print(loss, metrics)
    return loss, metrics

In [None]:
loss_func = mae_over_f1
opt_adam = optim.Adam(my_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(opt_adam, mode="min", factor=0.5, patience=20, verbose=1)

TRAIN_PARAMS = {
    "num_epochs" : 10,
    "loss_func" : loss_func,
    "optimizer" : opt_adam,
    "train_dataloader" : train_dataloader,
    "valid_dataloader" : valid_dataloader,
    "sanity_check" : True,
    "lr_scheduler" : lr_scheduler,
    "save_path" : "./weights.pt"
}

In [None]:
def train(model, params):
    num_epochs = params['num_epochs']
    loss_func = params['loss_func']
    opt = params["optimizer"]
    train_dataloader = params['train_dataloader']
    valid_dataloader = params['valid_dataloader']
    sanity_check = params['sanity_check']
    lr_scheduler = params['lr_scheduler']
    save_path = params['save_path']

    # keep history of the loss and metric
    loss_hist = {
        "train" : [],
        "valid" : []
    }

    metrics_hist = {
        "train" : [],
        "valid" : []
    }

    # copy best weights
    best_model_weights = copy.deepcopy(model.state_dict())
    # init best loss
    best_loss = float("inf")

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print(f'Epoch:{epoch}/{num_epochs-1}, current lr:{current_lr}')
        model.train()
        train_loss, train_metrics = loss_epoch(model, loss_func, train_dataloader, sanity_check, opt)

        # save history
        loss_hist["train"].append(train_loss)
        metrics_hist["train"].append(train_metrics)

        # model.eval()
        # with torch.no_grad():
    

    return model, loss_hist, metrics_hist

In [None]:
my_model, loss_hist, metrics_hist = train(my_model, TRAIN_PARAMS)

Epoch:0/9, current lr:0.0003




input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.03127452489467292 [0.00039106634236059763, 3.290605161973221e-05]
Epoch:1/9, current lr:0.0003
input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.031274524891185866 [0.00039106634231699444, 3.290605161973221e-05]
Epoch:2/9, current lr:0.0003
input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.031274524889158106 [0.0003910663422916387, 3.290605161973221e-05]
Epoch:3/9, current lr:0.0003
input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.03127452488999857 [0.0003910663423021481, 3.290605161973221e-05]
Epoch:4/9, current lr:0.0003
input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.03127452489040393 [0.00039106634230721683, 3.290605161973221e-05]
Epoch:5/9, current lr:0.0003
input:  torch.Size([12, 1, 6, 448, 304])
output:  torch.Size([12, 1, 2, 448, 304])
0.031274524891711

In [None]:
print(loss_hist)

{'train': [0.03127452489467292, 0.031274524891185866, 0.031274524889158106, 0.03127452488999857, 0.03127452489040393, 0.031274524891711154, 0.031274524891863706, 0.03127452488837953, 0.031274524887878706, 0.03127452488766907], 'valid': []}
