## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import glob

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device :',device)

device : cuda


## Hyperparameter Setting

In [5]:
CFG = {
    'EPOCHS':1024,
    'PATIENCE':30,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':16,
    'SEED':42,
}

## Fixed RandomSeed

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [7]:
all_input_list  = sorted(glob.glob('./data/train_input/*.csv'))
all_target_list = sorted(glob.glob('./data/train_target/*.csv'))
all_test_list   = sorted(glob.glob('./data/test_input/*.csv'))

## CustomDataset

In [8]:
# class CustomDataset(Dataset):
#     def __init__(self, input_paths, target_paths, infer_mode):
#         self.input_paths = input_paths
#         self.target_paths = target_paths
#         self.infer_mode = infer_mode
        
#         self.data_list = []
#         self.label_list = []
#         print('Data Pre-processing..')
#         for input_path, target_path in tqdm(zip(self.input_paths, self.target_paths)):
#             input_df = pd.read_csv(input_path)
#             target_df = pd.read_csv(target_path)
            
#             input_df = input_df.drop(columns=['obs_time'])
#             input_df = input_df.fillna(0)
            
#             input_length = int(len(input_df)/24)
#             target_length = int(len(target_df))
            
#             for idx in range(target_length):
#                 time_series = input_df[24*idx:24*(idx+1)].values
#                 self.data_list.append(torch.Tensor(time_series))
#             for label in target_df["predicted_weight_g"]:
#                 self.label_list.append(label)
#         print('Done.')
              
#     def __getitem__(self, index):
#         data = self.data_list[index]
#         label = self.label_list[index]
#         if self.infer_mode == False:
#             return data, label
#         else:
#             return data
        
#     def __len__(self):
#         return len(self.data_list)

In [9]:
from sklearn.preprocessing import MinMaxScaler

class CustomDataset:
    def __init__(self, input_paths, target_paths, test_paths):
        self.input_paths = input_paths
        self.target_paths = target_paths
        self.test_paths = test_paths
        
        self.input = None
        self.label = None
        self.test  = None

        input_fn = []
        label_fn = []
        for input_path, target_path in zip(self.input_paths, self.target_paths):
            input_df  = pd.read_csv(input_path)
            target_df = pd.read_csv(target_path)

            input_df = input_df.drop(columns=['obs_time'])
            input_df = input_df.fillna(0)

            input_fn.append(input_df)
            label_fn.append(target_df)
        
        test_fn = []
        for test_path in self.test_paths:
            test_df = pd.read_csv(test_path)
            test_df = test_df.drop(columns=['obs_time'])
            test_fn.append(test_df)
            
        self.input = pd.concat(input_fn,axis=0)
        self.label = pd.concat(label_fn,axis=0)
        self.test  = pd.concat(test_fn ,axis=0)

    def _train_test_split(self, val_rate):
        val_size = int(self.input.DAT.nunique() * val_rate)

        tr_idx = self.input.DAT <  max(self.input.DAT)-val_size
        va_idx = self.input.DAT >= max(self.input.DAT)-val_size
        
        X_train = self.input[tr_idx]
        X_valid = self.input[va_idx]
        y_train = self.label[self.label.DAT.isin(X_train.DAT.unique()+1)]
        y_valid = self.label[self.label.DAT.isin(X_valid.DAT.unique()+1)]

        print(f'val_rate={val_rate}, val_size={val_size}')
        print(f'train DAT : [{X_train.DAT.min()}~{X_train.DAT.max()}], validation DAT : [{X_valid.DAT.min()}~{X_valid.DAT.max()}]')
        
        return X_train, X_valid, y_train, y_valid
    
    def _scale_dataset(self,X_train,X_valid):
        
        scalers = {}
        # for train dataset
        for col in X_train.columns:
            _scaler = MinMaxScaler()
            X_train[col] = _scaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
            scalers[col] = _scaler
            
        # for validation dataset
        for col in X_valid.columns:
            _scaler = scalers[col]
            X_valid[col] = _scaler.transform(np.array(X_valid[col]).reshape(-1,1))
            
        # for test dataset
        X_test = self.test.copy()
        for col in X_test.columns:
            _scaler = scalers[col]
            X_test[col] = _scaler.transform(np.array(X_test[col]).reshape(-1,1))
            
        return X_train, X_valid, X_test

    def _transform_dataset(self,y_train,y_valid):
        y_train['predicted_weight_g'] = np.log(y_train['predicted_weight_g'])
        y_valid['predicted_weight_g'] = np.log(y_valid['predicted_weight_g'])
        
        return y_train, y_valid

In [27]:
val_rate = 0.1

dataset = CustomDataset(
    input_paths = all_input_list,
    target_paths = all_target_list,
    test_paths = all_test_list,
)

X_train, X_valid, y_train, y_valid = dataset._train_test_split(val_rate=val_rate)

X_train, X_valid, X_test = dataset._scale_dataset(X_train,X_valid)
# y_train, y_valid = dataset._transform_dataset(y_train,y_valid)

val_rate=0.1, val_size=2
train DAT : [0~24], validation DAT : [25~27]


In [28]:
class TorchDataset(Dataset):
    def __init__(self,X,y,infer_mode):
        self.X = X
        self.y = y.sort_values('DAT')
        self.infer_mode = infer_mode
        
        self.X_ret = []
        self.y_ret = []
        
        for x_idx, y_idx in zip(sorted(X.DAT.unique()),sorted(y.DAT.unique())):
            X_dat = X[X.DAT==x_idx]
            y_dat = y[y.DAT==y_idx][['predicted_weight_g']].values[0][0]
            
            self.X_ret.append(torch.Tensor(X_dat.values))
            self.y_ret.append(y_dat)
            
        # self.X_ret = np.array(self.X_ret)
        # self.y_ret = np.array(self.y_ret).reshape(-1)
        
            
    def __getitem__(self, index):
        data  = self.X_ret[index]
        label = self.y_ret[index]
        if self.infer_mode == False:
            return data, label
        else:
            return data

    def __len__(self):
        return len(self.X_ret)

In [29]:
np.array(X_train).shape

(16800, 15)

In [30]:
train_dataset = TorchDataset(X_train, y_train, False)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=6)

val_dataset = TorchDataset(X_valid, y_valid, False)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=6)

test_dataset = TorchDataset(X_test, y_valid, True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Model Define

In [31]:
# class BaseModel(nn.Module):
#     def __init__(self):
#         super(BaseModel, self).__init__()
#         self.lstm = nn.LSTM(input_size=15, hidden_size=256, batch_first=True, bidirectional=False)
#         self.classifier = nn.Sequential(
#             nn.Linear(256, 1),
#         )

#     def forward(self, x):
#         hidden, _ = self.lstm(x)
#         output = self.classifier(hidden[:,-1,:])
#         return output

In [32]:
# https://coding-yoon.tistory.com/131

In [33]:
class BaseModel(nn.Module):
    def __init__(self, input_size, hidden_sizes, dropout_rates, num_classes, num_layers):
        super(BaseModel, self).__init__()

        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.dropout_rates = dropout_rates
        self.num_classes = num_classes
        self.num_layers = num_layers

        self.lstm1 = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_sizes[0],
            batch_first=True,
            bidirectional=True,
            dropout=self.dropout_rates[0],
            num_layers=self.num_layers,
        )
        self.lstm2 = nn.LSTM(
            input_size=self.hidden_sizes[0]*2, # bidirectional
            hidden_size=self.hidden_sizes[1],
            batch_first=True,
            bidirectional=True,
            dropout=self.dropout_rates[1],
            num_layers=self.num_layers,
        )

        self.fc = nn.Linear(self.hidden_sizes[1]*2, self.num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        hid, _ = self.lstm1(x)
        hid    = self.relu(hid)
        hid, _ = self.lstm2(hid)
        out    = self.relu(hid)
        out    = self.fc(out[:,-1,:])
        return out

## Train

In [34]:
from lib.EarlyStopping import EarlyStopping

inverse_transform_function = np.exp

In [35]:
import time

def train(model, optimizer, train_loader, valid_loader, scheduler, device, early_stopping, epochs, metric_period=1):
    
    es = EarlyStopping(patience = CFG['PATIENCE'], verbose = False, path='./model/checkpoint.pt')
    
    model.to(device)
    # criterion = nn.L1Loss().to(device)
    criterion = nn.MSELoss().to(device)

    best_loss = 9999
    best_model = None
    start_time = time.time()
    epoch_s = time.time()
    for epoch in range(1, epochs+1):
        
        model.train()
        train_loss = []
        for X, Y in iter(train_loader):
            X = X.float().to(device)
            Y = Y.float().to(device)

            optimizer.zero_grad()

            output = model(X).float()
            
            # # log -> exp
            # output = torch.exp(output)
            # Y      = torch.exp(Y)
            
            loss = criterion(output, Y)
            loss = torch.sqrt(loss) # MSE -> RMSE

            # Getting gradients
            loss.backward()

            # Updating parameters
            optimizer.step()

            train_loss.append(loss.item())

        valid_loss = validation(model, valid_loader, criterion, device)

        if epoch % metric_period == 0:
            epoch_e = time.time()
            epoch_str = '0'*(len(str(epochs))-len(str(epoch))) + str(epoch)
            #print(f'[{epoch}/{epochs}] Train Loss : [{np.mean(train_loss):.5f}], Valid Loss : [{valid_loss:.5f}], elapsed : [{epoch_e-epoch_s:.2f}s]')
            progress = '[{}/{}] tr_loss : {:.5f}, val_loss : {:.5f}, elapsed : {:.2f}s, total : {:.2f}s, remaining : {:.2f}s'\
                .format(
                    epoch_str,
                    epochs,np.mean(train_loss),
                    valid_loss,
                    epoch_e-epoch_s,
                    epoch_e-start_time,
                    (epoch_e-epoch_s)*(epochs-epoch)
                )
            print(progress)
            epoch_s = time.time()
            
        if scheduler is not None:
            scheduler.step(valid_loss)

        if best_loss > valid_loss:
            best_loss = valid_loss
            best_model = model

        # early stopping 여부를 체크. 현재 과적합 상황 추적
        if early_stopping:
            es(valid_loss, model)

            if es.early_stop:
                break

    return best_model

In [36]:
def validation(model, valid_loader, criterion, device):
    model.eval()
    valid_loss = []
    with torch.no_grad():
        for X, Y in iter(valid_loader):
            X = X.float().to(device)
            Y = Y.float().to(device)

            output = model(X).float()
            
            # # log -> exp
            # model_pred = torch.exp(model_pred)
            # Y          = torch.exp(Y)
            
            # print(output[:5],Y[:5])
            loss = criterion(output, Y)
            loss = torch.sqrt(loss) # MSE -> RMSE

            valid_loss.append(loss.item())

    return np.mean(valid_loss)

## Run!!

In [39]:
input_size = X_train.shape[1]
model = BaseModel(input_size = input_size, hidden_sizes=[32,16],dropout_rates=[0.2,0.2], num_classes=1, num_layers=1)
print(model)

BaseModel(
  (lstm1): LSTM(15, 32, batch_first=True, dropout=0.2, bidirectional=True)
  (lstm2): LSTM(64, 16, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)


In [40]:
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, threshold_mode='abs',min_lr=1e-8, verbose=True)

best_model = train(
    model,
    optimizer,
    train_loader,
    val_loader,
    scheduler,
    device,
    early_stopping=False,
    metric_period=10,
    epochs=1024,
)

[0010/1024] tr_loss : 30.27542, val_loss : 82.26220, elapsed : 4.79s, total : 4.79s, remaining : 4853.61s
[0020/1024] tr_loss : 29.98543, val_loss : 81.87723, elapsed : 4.93s, total : 9.71s, remaining : 4945.00s
[0030/1024] tr_loss : 29.56663, val_loss : 81.33752, elapsed : 4.92s, total : 14.64s, remaining : 4894.01s
[0040/1024] tr_loss : 29.14476, val_loss : 80.78839, elapsed : 4.89s, total : 19.53s, remaining : 4812.29s
[0050/1024] tr_loss : 28.80274, val_loss : 80.32913, elapsed : 4.88s, total : 24.41s, remaining : 4754.44s
[0060/1024] tr_loss : 28.52729, val_loss : 79.94487, elapsed : 4.94s, total : 29.35s, remaining : 4761.56s
[0070/1024] tr_loss : 28.29657, val_loss : 79.61050, elapsed : 4.86s, total : 34.21s, remaining : 4634.26s
[0080/1024] tr_loss : 28.09639, val_loss : 79.30959, elapsed : 4.89s, total : 39.10s, remaining : 4614.90s
[0090/1024] tr_loss : 27.91768, val_loss : 79.03111, elapsed : 4.93s, total : 44.03s, remaining : 4607.74s
[0100/1024] tr_loss : 27.75550, val_los

## Inference

In [None]:
all_input_list

In [None]:
model.to(device)
model.eval()
pred_list = []
with torch.no_grad():
    for X in iter(test_loader):
        X = X.float().to(device)

        model_pred = model(X)
        model_pred = model_pred.cpu().numpy().reshape(-1).tolist()
        pred_list += model_pred

In [None]:
pred_list

In [None]:
test_input_list = sorted(glob.glob('./test_input/*.csv'))
test_target_list = sorted(glob.glob('./out/test_target/*.csv'))

In [None]:
def inference_per_case(model, test_loader, test_path, device):
    model.to(device)
    model.eval()
    pred_list = []
    with torch.no_grad():
        for X in iter(test_loader):
            X = X.float().to(device)
            
            model_pred = model(X)
            
            model_pred = model_pred.cpu().numpy().reshape(-1).tolist()
            
            pred_list += model_pred
    
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = pred_list
    submit_df.to_csv(test_path, index=False)

In [None]:
for test_input_path, test_target_path in zip(test_input_list, test_target_list):
    test_dataset = CustomDataset([test_input_path], [test_target_path], True)
    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
    inference_per_case(best_model, test_loader, test_target_path, device)

## Submission

In [None]:
import zipfile
os.chdir("./data/test_target/")
submission = zipfile.ZipFile("../submission.zip", 'w')
for path in test_target_list:
    path = path.split('/')[-1]
    submission.write(path)
submission.close()