## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import glob

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [3]:
CFG = {
    'EPOCHS':50,
    'LEARNING_RATE':7.5e-2,
    'BATCH_SIZE':32,
    'SEED':1
}

## Fixed RandomSeed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [5]:
all_input_list = sorted(glob.glob('./open/train_input/*.csv'))
all_target_list = sorted(glob.glob('./open/train_target/*.csv'))

In [6]:
train_input_list = all_input_list[:25]
train_target_list = all_target_list[:25]

val_input_list = all_input_list[25:]
val_target_list = all_target_list[25:]

In [7]:
pd.read_csv('./open/train_input/CASE_02.csv')

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,시간당적색광량,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량
0,0,00:00:00.,25.796666,60.550000,647.433333,2.924978,61.7,61.7,0,0,0.0000,0.0000,0.000,0.0000,0.0000,0.0000
1,0,01:00:00.,25.285000,62.153333,648.450000,2.922873,61.7,123.4,0,0,0.0000,0.0000,0.000,0.0000,0.0000,0.0000
2,0,02:00:00.,23.513333,63.510000,633.950000,2.890171,61.7,185.1,0,0,0.0000,0.0000,0.000,0.0000,0.0000,0.0000
3,0,03:00:00.,22.618333,64.745000,637.100000,2.953415,61.7,246.8,0,0,0.0000,0.0000,0.000,0.0000,0.0000,0.0000
4,0,04:00:00.,22.951667,62.293333,631.050000,2.968661,61.7,308.5,0,0,0.0000,0.0000,0.000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,27,18:59:59.,25.880000,74.249168,611.450000,0.000000,61.7,1974.4,0,0,1244.4096,58379.6892,1178.008,55264.5535,2422.4176,113644.2427
668,27,19:59:59.,25.928333,76.351667,651.483333,0.000000,61.7,2036.1,0,0,0.0000,58379.6892,0.000,55264.5535,0.0000,113644.2427
669,27,20:59:59.,25.981667,76.766667,682.100000,0.000000,61.7,2097.8,0,0,0.0000,58379.6892,0.000,55264.5535,0.0000,113644.2427
670,27,21:59:59.,25.758333,77.206665,675.300000,0.000000,61.7,2159.5,0,0,0.0000,58379.6892,0.000,55264.5535,0.0000,113644.2427


In [9]:
pd.read_csv('./open/test_input/TEST_02.csv')

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,시간당적색광량,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량
0,0,00:00,30.965000,38.948334,526.200000,1.001451,0.00,0.00,0.0000,0.000,0,0,0,0,0.0000,0.000
1,0,01:00,30.331666,40.225000,525.066667,1.009793,769.00,769.00,0.0000,0.000,0,0,0,0,0.0000,0.000
2,0,02:00,30.576667,39.998333,525.700000,1.014194,0.00,769.00,0.0000,0.000,0,0,0,0,0.0000,0.000
3,0,03:00,29.598333,41.511666,525.600000,1.025163,762.57,1531.57,0.0000,0.000,0,0,0,0,0.0000,0.000
4,0,04:00,29.981666,40.995000,529.050000,1.030688,0.00,1531.57,0.0000,0.000,0,0,0,0,0.0000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,27,19:00,26.960000,56.620000,542.616667,1.067239,769.00,11117.05,12.3764,209656.216,0,0,0,0,12.3764,209656.216
668,27,20:00,26.211667,57.241667,540.583333,1.074064,0.00,11117.05,0.0000,209656.216,0,0,0,0,0.0000,209656.216
669,27,21:00,25.368333,56.310000,530.800000,1.078533,769.00,11886.05,0.0000,209656.216,0,0,0,0,0.0000,209656.216
670,27,22:00,25.256667,57.343333,540.466667,1.078533,0.00,11886.05,0.0000,209656.216,0,0,0,0,0.0000,209656.216


## CustomDataset

In [9]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

class CustomDataset(Dataset):
    def __init__(self, input_paths, target_paths, infer_mode):
        self.input_paths = input_paths
        self.target_paths = target_paths
        self.infer_mode = infer_mode
        
        self.data_list = []
        self.label_list = []
        print('Data Pre-processing..')
        for input_path, target_path in tqdm(zip(self.input_paths, self.target_paths)):
            input_df = pd.read_csv(input_path)
            target_df = pd.read_csv(target_path)
            
            input_df['obs_time'] = [int(i[:2]) for i in input_df['obs_time']]
            input_df['내부온도관측치'] = input_df['내부온도관측치']
            # input_df = input_df.drop(columns=['obs_time','일간누적총광량','일간누적청색광량','일간누적적색광량'])
            input_df = input_df.fillna(method='ffill')
            
            input_length = int(len(input_df)/24)
            target_length = int(len(target_df))
            
            for idx in range(target_length):
                time_series = input_df[24*idx:24*(idx+1)].values
                self.data_list.append(torch.Tensor(time_series))
            for label in target_df["predicted_weight_g"]:
                self.label_list.append(label)
        print('Done.')
              
    def __getitem__(self, index):
        data = self.data_list[index]
        label = self.label_list[index]
        if self.infer_mode == False:
            return data, label
        else:
            return data
        
    def __len__(self):
        return len(self.data_list)

In [10]:
train_dataset = CustomDataset(train_input_list, train_target_list, False)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input_list, val_target_list, False)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.


## Model Define

In [11]:
class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.lstm = nn.LSTM(input_size=16, hidden_size=256, batch_first=True, bidirectional=False)
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(256, 1),

        )
        
    def forward(self, x):
        hidden, _ = self.lstm(x)
        output = self.classifier(hidden[:,-1,:])
        return output

## Train

In [12]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.L1Loss().to(device)
    
    best_loss = 9999
    best_model = None
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in iter(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        val_loss = validation(model, val_loader, criterion, device)
        
        print(f'Train Loss : [{np.mean(train_loss):.5f}] Valid Loss : [{val_loss:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_loss)
            
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
    return best_model

In [13]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for X, Y in iter(val_loader):
            X = X.float().to(device)
            Y = Y.float().to(device)
            
            model_pred = model(X)
            loss = criterion(model_pred, Y)
            
            val_loss.append(loss.item())
            
    return np.mean(val_loss)

## Run!!

In [14]:
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, threshold_mode='abs',min_lr=1e-8, verbose=True)

best_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

Train Loss : [31.50489] Valid Loss : [19.66876]
Train Loss : [30.33398] Valid Loss : [19.55695]
Train Loss : [30.37569] Valid Loss : [19.56189]
Train Loss : [30.16750] Valid Loss : [21.33017]
Train Loss : [30.75647] Valid Loss : [21.18556]
Epoch 00005: reducing learning rate of group 0 to 3.7500e-02.
Train Loss : [30.56462] Valid Loss : [19.85438]
Train Loss : [30.31766] Valid Loss : [19.60807]
Train Loss : [30.38572] Valid Loss : [20.01679]
Epoch 00008: reducing learning rate of group 0 to 1.8750e-02.
Train Loss : [30.24760] Valid Loss : [19.69749]
Train Loss : [30.21447] Valid Loss : [19.82251]
Train Loss : [30.29012] Valid Loss : [19.74988]
Epoch 00011: reducing learning rate of group 0 to 9.3750e-03.
Train Loss : [30.31692] Valid Loss : [19.81378]
Train Loss : [30.26349] Valid Loss : [19.81974]
Train Loss : [30.29631] Valid Loss : [19.72217]
Epoch 00014: reducing learning rate of group 0 to 4.6875e-03.
Train Loss : [30.26474] Valid Loss : [19.85684]
Train Loss : [30.23770] Valid Lo

## Inference

In [15]:
test_input_list = sorted(glob.glob('./open/test_input/*.csv'))
test_target_list = sorted(glob.glob('./open/test_target/*.csv'))

In [16]:
def inference_per_case(model, test_loader, test_path, device):
    model.to(device)
    model.eval()
    pred_list = []
    with torch.no_grad():
        for X in iter(test_loader):
            X = X.float().to(device)
            
            model_pred = model(X)
            
            model_pred = model_pred.cpu().numpy().reshape(-1).tolist()
            pred_list += model_pred
    
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = pred_list
    submit_df.to_csv(test_path, index=False)

In [17]:
for test_input_path, test_target_path in zip(test_input_list, test_target_list):
    test_dataset = CustomDataset([test_input_path], [test_target_path], True)
    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
    inference_per_case(best_model, test_loader, test_target_path, device)

Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.


## Submission

In [18]:
import zipfile
os.chdir("./open/test_target/")
submission = zipfile.ZipFile("../submission.zip", 'w')
for path in test_target_list:
    path = path.split('/')[-1]
    submission.write(path)
submission.close()