# Imports

In [1]:
import random
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
import numpy as np
import os
import glob

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fp = fm.FontProperties(fname='/home/studio-lab-user/Dacon/tools/NanumFont/NanumGothic.ttf', size=10)
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device :', device)

device : cpu


<br>

# Setting

<br>

## Hyperparameter Setting

In [3]:
# Notebook-wide hyperparameters, looked up by key (e.g. CFG['SEED']).
CFG = dict(
    EPOCHS=10,  # 1024,
    PATIENCE=30,
    LEARNING_RATE=0.05,
    BATCH_SIZE=16,
    SEED=42,
)

<br>

## Fix Random Seed

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # NOTE: hash randomisation is fixed at interpreter start-up, so this only
    # influences processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not just the current one
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and thus vary) its algorithm choice
    # between runs, which defeats the purpose of fixing the seed.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed

<br>

# Data Pre-processing

In [5]:
# Collect the CSV paths of each data split in sorted order.
all_input_list, all_target_list, all_test_list = [
    sorted(glob.glob(pattern))
    for pattern in (
        './data/train_input/*.csv',
        './data/train_target/*.csv',
        './data/test_input/*.csv',
    )
]

In [6]:
import os
from sklearn.preprocessing import MinMaxScaler

class Preprocess:
    """Load the per-case CSV files and prepare them for modelling.

    On construction every train-input / train-target / test-input CSV is read,
    tagged with a ``case_num`` column derived from its file name, and
    concatenated into ``self.input``, ``self.label`` and ``self.test``.
    The remaining methods add a time index, scale features, build interaction
    terms, split by ``DAT``, log-transform the target and write the result
    under ``./out``.

    Args:
        input_paths: Paths of the train-input CSVs (``CASE_<id>.csv``).
        label_paths: Paths of the train-target CSVs, in the same order as
            ``input_paths`` (the two lists are zipped together).
        test_paths: Paths of the test-input CSVs (``TEST_<id>.csv``).
    """

    def __init__(self, input_paths, label_paths, test_paths):
        self.input_paths = input_paths
        self.label_paths = label_paths
        self.test_paths  = test_paths

        self.input, self.label, self.test = None, None, None

        self.X_train, self.X_valid = None, None
        self.y_train, self.y_valid = None, None
        self.X, self.y = None, None

        input_fn = []
        label_fn = []
        for input_path, label_path in zip(self.input_paths, self.label_paths):
            case_num = self._case_id(input_path, 'CASE_')

            # Missing sensor readings are treated as 0.
            input_df = pd.read_csv(input_path).fillna(0)
            label_df = pd.read_csv(label_path)

            input_df['case_num'] = case_num
            label_df['case_num'] = case_num

            input_fn.append(input_df)
            label_fn.append(label_df)

        test_fn = []
        for test_path in self.test_paths:
            # Same NaN handling as the train inputs so both splits go through
            # identical preprocessing.
            test_df = pd.read_csv(test_path).fillna(0)
            test_df['case_num'] = self._case_id(test_path, 'TEST_')

            test_fn.append(test_df)

        self.input = pd.concat(input_fn,axis=0).sort_values(['case_num','DAT','obs_time'])
        self.label = pd.concat(label_fn,axis=0)
        self.test  = pd.concat(test_fn ,axis=0)

    @staticmethod
    def _case_id(path, prefix):
        """Return the case id encoded in a file name such as ``CASE_01.csv``.

        Works from the base name only, so the directory part of ``path``
        (relative, absolute, any separator ``os.path`` understands) is
        irrelevant.
        """
        stem = os.path.splitext(os.path.basename(path))[0]
        if stem.startswith(prefix):
            stem = stem[len(prefix):]
        return stem

    def _data_return(self):
        """Return the ``(input, label, test)`` frames in their current state."""
        return self.input,self.label,self.test

    def _preprocess(self):
        """Add a 1-based ``time`` column counting rows within each case.

        Rows are numbered in their current order per ``case_num``; for cases
        holding 28 days x 24 hourly rows this yields 1..672.
        """
        # .to_numpy() assigns positionally; the frames carry a repeated index
        # after concatenation, so index alignment is deliberately avoided.
        self.input['time'] = self.input.groupby('case_num').cumcount().to_numpy() + 1
        self.test ['time'] = self.test .groupby('case_num').cumcount().to_numpy() + 1

    # https://dacon.io/competitions/official/236033/talkboard/407304?page=1&dtype=recent
    def _scale_dataset(self):
        """Min-max scale the known features with fixed, hand-specified bounds.

        The same bounds are applied to the train and the test frame.
        """
        minmax_info = {
            'time':[0,28*24],
            '내부온도관측치':[4,40],
            '내부습도관측치':[0,100],
            'co2관측치':[0,1200],
            'ec관측치':[0,8],
            '시간당분무량':[0,3000],
            '일간누적분무량':[0,72000],
            '시간당백색광량':[0,120000],
            '일간누적백색광량':[0,2880000],
            '시간당적색광량':[0,120000],
            '일간누적적색광량':[0,2880000],
            '시간당청색광량':[0,120000],
            '일간누적청색광량':[0,2880000],
            '시간당총광량':[0,120000],
            '일간누적총광량':[0,2880000],
        }

        for frame in (self.input, self.test):
            for col, (min_info, max_info) in minmax_info.items():
                frame[col] = (frame[col]-min_info) / (max_info-min_info)

    def _interaction_term(self):
        """Append pairwise product features ``'<a>*<b>'`` to train and test.

        Every unordered pair of numeric columns (except ``DAT`` and
        ``obs_time``) gets one product column. Features are taken in column
        order so the generated names and their order are identical on every
        run.
        """
        excluded = {'DAT','obs_time'}
        num_features = [
            col for col in self.input.select_dtypes(include='number').columns
            if col not in excluded
        ]
        for i in range(len(num_features)):
            for j in range(i):
                left, right = num_features[i], num_features[j]
                self.input[f'{left}*{right}'] = self.input[left]*self.input[right]
                self.test [f'{left}*{right}'] = self.test [left]*self.test [right]

    def _train_test_split(self, val_rate):
        """Split by ``DAT``: the last ``val_rate`` share of days is validation.

        Labels are selected where ``label.DAT`` equals a feature ``DAT`` plus
        one.
        NOTE(review): the ``+1`` presumably pairs each day's inputs with the
        next day's label - confirm against the label files.

        Args:
            val_rate: Fraction of the distinct ``DAT`` values held out.
        """
        val_size = int(self.input.DAT.nunique() * val_rate)
        cutoff = max(self.input.DAT)-val_size

        tr_idx = self.input.DAT <  cutoff
        va_idx = self.input.DAT >= cutoff

        # Independent copies, so later in-place edits (e.g. the log transform)
        # neither hit a view of the source frames nor modify them.
        self.X_train = self.input[tr_idx].copy()
        self.X_valid = self.input[va_idx].copy()
        self.y_train = self.label[self.label.DAT.isin(self.X_train.DAT.unique()+1)].copy()
        self.y_valid = self.label[self.label.DAT.isin(self.X_valid.DAT.unique()+1)].copy()

        print(f'val_rate={val_rate}')
        print(f'train DAT : [{self.X_train.DAT.min()}~{self.X_train.DAT.max()}], validation DAT : [{self.X_valid.DAT.min()}~{self.X_valid.DAT.max()}]')

    def _return_train_range(self):
        """Return the largest ``DAT`` present in the training features."""
        return self.X_train.DAT.max()

    def _transform_dataset(self):
        """Replace ``predicted_weight_g`` by its natural log in both label splits.

        Requires ``_train_test_split`` to have been called first.
        """
        self.y_train['predicted_weight_g'] = np.log(self.y_train['predicted_weight_g'])
        self.y_valid['predicted_weight_g'] = np.log(self.y_valid['predicted_weight_g'])

    def _save(self):
        """Write one CSV per case under ``./out`` for train inputs, train targets and test inputs."""
        self.X = pd.concat([self.X_train,self.X_valid],axis=0)
        self.y = pd.concat([self.y_train,self.y_valid],axis=0)

        mkdir_paths = ['./out/train_input','./out/train_target','./out/test_input']
        for path in mkdir_paths:
            # makedirs also creates the missing parent ('./out') and tolerates
            # directories that already exist.
            os.makedirs(path, exist_ok=True)

        print('Save Start ...')
        for case_num in self.X.case_num.unique():
            _X = self.X[self.X.case_num==case_num].drop(['case_num','obs_time'],axis=1)
            _y = self.y[self.y.case_num==case_num].drop(['case_num'],axis=1)

            _X.to_csv(f'./out/train_input/CASE_{case_num}.csv',index=False)
            _y.to_csv(f'./out/train_target/CASE_{case_num}.csv',index=False)

        for case_num in self.test.case_num.unique():
            _X = self.test[self.test.case_num==case_num].drop('case_num',axis=1)

            _X.to_csv(f'./out/test_input/TEST_{case_num}.csv',index=False)
        print('Save Done ...')

In [7]:
# val_rate = 0.05

# dataset = Preprocess(
#     input_paths = all_input_list,
#     label_paths = all_target_list,
#     test_paths = all_test_list,
# )

# # preprocessing + scaling + interaction term
# dataset._preprocess()
# dataset._scale_dataset()
# dataset._interaction_term()

# # train/validation split
# dataset._train_test_split(val_rate=val_rate)
# train_max_dat = dataset._return_train_range()

# # target transform
# dataset._transform_dataset()

# # save dataset
# dataset._save()

In [8]:
# from scipy.stats import pearsonr

# val_rate = 0.05

# dataset = Preprocess(
#     input_paths = all_input_list,
#     label_paths = all_target_list,
#     test_paths = all_test_list,
# )

# dataset._preprocess()
# dataset._scale_dataset()
# input_df, label_df = dataset._data_return()

# for case_num in tqdm(sorted(input_df.case_num.unique())):

#     input = input_df[input_df.case_num==case_num].drop('case_num',axis=1)
#     label = label_df[label_df.case_num==case_num].drop('case_num',axis=1)

#     fig = plt.figure(figsize=(20,15))
#     nrow = 3
#     ncol = 5

#     iter = 0
#     total = len(input.columns)-3
#     for col in input.columns:
#         if col not in ['time','DAT','obs_time']:
#             iter+=1

#             y1 = input[col]
#             #y1 = (y1-y1.min())/(y1.max()-y1.min())

#             y2 = label['predicted_weight_g']
#             y2 = (y2-y2.min())/(y2.max()-y2.min())

#             y3 = input.groupby('DAT')[col].mean().values

#             corr, pvalue = pearsonr(y2,y3)

#             fig.add_subplot(ncol,nrow,iter)
#             sns.scatterplot(x=input.time  ,y=y1)
#             sns.scatterplot(x=label.DAT*24,y=y2,color='red')
#             sns.lineplot   (x=label.DAT*24,y=y3,color='blue',linestyle='--',alpha=0.7)
#             plt.ylabel('')

#             plt.title(f'{col}(corr={corr:.3f}(pvalue={pvalue:.3f}))',fontproperties=fp)


#     plt.tight_layout()
#     plt.savefig(f'./fig/{case_num}.png',dpi=100)
#     plt.close()

<br>

# Modeling

<br>

## 모델1 (Model 1: CatBoost on daily-mean features)

In [9]:
from scipy.stats import pearsonr

val_rate = 0.05

dataset = Preprocess(
    input_paths = all_input_list,
    label_paths = all_target_list,
    test_paths = all_test_list,
)

dataset._preprocess()
dataset._scale_dataset()
# dataset._interaction_term()

input_df, label_df, test_df = dataset._data_return()

# Collapse the rows of each (case_num, DAT) group into their mean.
input_df = input_df.drop(columns=['obs_time']).groupby(['case_num','DAT']).mean().reset_index()
test_df  = test_df .drop(columns=['obs_time']).groupby(['case_num','DAT']).mean().reset_index()

# groupby() returns rows sorted by (case_num, DAT). Features and labels are
# matched purely by position downstream, so put the labels in the same order
# and make sure there is exactly one label per feature row.
label_df = label_df.sort_values(['case_num','DAT'])
assert len(input_df) == len(label_df), 'features and labels must have one row per (case_num, DAT)'

input_df = input_df.drop(columns=['case_num'])
test_df  = test_df .drop(columns=['case_num'])
label_df = label_df['predicted_weight_g']

In [11]:
from sklearn.model_selection import train_test_split

# Random hold-out split; the ratio and seed come from the notebook-level
# settings (val_rate, CFG['SEED']) instead of repeating their literal values.
X_train, X_valid, y_train, y_valid = train_test_split(
    input_df,
    label_df,
    test_size=val_rate,
    random_state=CFG['SEED'],
)
X_train.shape, X_valid.shape

((548, 16), (236, 16))

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold

# One entry per fold: predictions on the full X_train, on the hold-out
# X_valid and on test_df.
tr_preds, va_preds, te_preds = [], [], []

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in tqdm(kf.split(X_train), total=5):
    # kf.split yields positional indices, hence .iloc.
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    model = CatBoostRegressor(iterations=5000, metric_period=1000, verbose=1, random_state=42)
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])

    # Every fold model scores the same three frames, so the per-fold arrays
    # line up row by row and can be averaged later.
    tr_pred, va_pred, te_pred = (
        model.predict(frame) for frame in (X_train, X_valid, test_df)
    )

    tr_preds.append(tr_pred)
    va_preds.append(va_pred)
    te_preds.append(te_pred)

In [None]:
from sklearn.metrics import mean_squared_error

# Average the fold models' predictions element-wise into one ensemble
# prediction per frame.
tr_pred, va_pred, te_pred = (
    np.mean(fold_preds, axis=0) for fold_preds in (tr_preds, va_preds, te_preds)
)

# RMSE of the ensemble on the training rows and on the hold-out rows.
tr_score = np.sqrt(mean_squared_error(y_train, tr_pred))
va_score = np.sqrt(mean_squared_error(y_valid, va_pred))
print(tr_score, va_score)

In [None]:
# Fill each existing ./data/test_target file with its consecutive block of
# 28 predictions and write it back in place.
for i, case_num in enumerate(['01','02','03','04','05']):
    target_path = f'./data/test_target/TEST_{case_num}.csv'
    test_case_df = pd.read_csv(target_path)
    test_case_df['predicted_weight_g'] = te_pred[i*28:(i+1)*28]
    test_case_df.to_csv(target_path, index=False)

In [None]:
import zipfile

# Resolve the files relative to the notebook's working directory, like every
# other data path in this notebook, instead of a machine-specific home path.
test_target_list = sorted(glob.glob('./data/test_target/*.csv'))

# Write ./data/submission.zip with every CSV stored under its bare file name.
# No os.chdir(): changing the working directory would break all later
# relative paths and make this cell fail when run a second time.
# The context manager closes the archive even if a write fails.
with zipfile.ZipFile('./data/submission.zip', 'w') as submission:
    for path in test_target_list:
        submission.write(path, arcname=os.path.basename(path))

<br>

## 모델2 (Model 2: CatBoost on daily-mean features with interaction terms)

In [16]:
from scipy.stats import pearsonr

val_rate = 0.05

dataset = Preprocess(
    input_paths = all_input_list,
    label_paths = all_target_list,
    test_paths = all_test_list,
)

dataset._preprocess()
dataset._scale_dataset()
dataset._interaction_term()

input_df, label_df, test_df = dataset._data_return()

# Collapse the rows of each (case_num, DAT) group into their mean.
input_df = input_df.drop(columns=['obs_time']).groupby(['case_num','DAT']).mean().reset_index()
test_df  = test_df .drop(columns=['obs_time']).groupby(['case_num','DAT']).mean().reset_index()

# groupby() returns rows sorted by (case_num, DAT). Features and labels are
# matched purely by position downstream, so put the labels in the same order
# and make sure there is exactly one label per feature row.
label_df = label_df.sort_values(['case_num','DAT'])
assert len(input_df) == len(label_df), 'features and labels must have one row per (case_num, DAT)'

input_df = input_df.drop(columns=['case_num'])
test_df  = test_df .drop(columns=['case_num'])
label_df = label_df['predicted_weight_g']

In [18]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold

# One entry per fold. Note that tr_preds / va_preds cover a different set of
# rows in every fold (that fold's own train / validation part); only te_preds
# always refers to the same rows (test_df).
tr_preds, va_preds, te_preds = [], [], []

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in tqdm(kf.split(input_df), total=5):
    # kf.split yields positional indices, hence .iloc.
    X_train, X_valid = input_df.iloc[tr_idx], input_df.iloc[va_idx]
    y_train, y_valid = label_df.iloc[tr_idx], label_df.iloc[va_idx]

    model = CatBoostRegressor(iterations=100, metric_period=1, verbose=1, random_state=42)
    # The fold's validation part serves as eval_set for best-model selection.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], use_best_model=True)

    tr_pred, va_pred, te_pred = (
        model.predict(frame) for frame in (X_train, X_valid, test_df)
    )

    tr_preds.append(tr_pred)
    va_preds.append(va_pred)
    te_preds.append(te_pred)

  0%|          | 0/5 [00:00<?, ?it/s]

Learning rate set to 0.192782
0:	learn: 36.3305174	test: 38.0455554	best: 38.0455554 (0)	total: 64ms	remaining: 6.34s
1:	learn: 31.8288525	test: 33.5061117	best: 33.5061117 (1)	total: 89.2ms	remaining: 4.37s
2:	learn: 28.1843258	test: 29.8735731	best: 29.8735731 (2)	total: 109ms	remaining: 3.54s
3:	learn: 25.0522341	test: 27.1465967	best: 27.1465967 (3)	total: 137ms	remaining: 3.29s
4:	learn: 22.2488264	test: 24.6233582	best: 24.6233582 (4)	total: 156ms	remaining: 2.97s
5:	learn: 20.0029828	test: 22.6876843	best: 22.6876843 (5)	total: 179ms	remaining: 2.8s
6:	learn: 18.1497606	test: 20.6771933	best: 20.6771933 (6)	total: 207ms	remaining: 2.75s
7:	learn: 16.3038726	test: 19.0371491	best: 19.0371491 (7)	total: 231ms	remaining: 2.66s
8:	learn: 14.8604544	test: 17.8324236	best: 17.8324236 (8)	total: 250ms	remaining: 2.52s
9:	learn: 13.5972639	test: 16.6960906	best: 16.6960906 (9)	total: 272ms	remaining: 2.44s
10:	learn: 12.5417277	test: 15.8976317	best: 15.8976317 (10)	total: 299ms	remaini

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error

# Every fold model predicted the same test_df rows, so averaging them
# element-wise is well defined.
te_pred = np.mean(te_preds,axis=0)

# tr_preds / va_preds, in contrast, hold predictions for different rows in
# each fold (and folds may differ in size), so they must not be averaged
# element-wise. Score each fold against its own labels instead. `kf` has a
# fixed integer random_state, so split() reproduces the exact indices that
# were used during training.
fold_tr_scores, fold_va_scores = [], []
for (fold_tr_idx, fold_va_idx), fold_tr_pred, fold_va_pred in zip(kf.split(input_df), tr_preds, va_preds):
    fold_tr_scores.append(
        np.sqrt(mean_squared_error(y_true=label_df.iloc[fold_tr_idx], y_pred=fold_tr_pred))
    )
    fold_va_scores.append(
        np.sqrt(mean_squared_error(y_true=label_df.iloc[fold_va_idx], y_pred=fold_va_pred))
    )

# Mean per-fold RMSE on the training and validation parts.
tr_score = np.mean(fold_tr_scores)
va_score = np.mean(fold_va_scores)
print(tr_score,va_score)

In [None]:
# Fill each existing ./data/test_target file with its consecutive block of
# 28 predictions and write it back in place.
for i, case_num in enumerate(['01','02','03','04','05']):
    target_path = f'./data/test_target/TEST_{case_num}.csv'
    test_case_df = pd.read_csv(target_path)
    test_case_df['predicted_weight_g'] = te_pred[i*28:(i+1)*28]
    test_case_df.to_csv(target_path, index=False)

In [None]:
import zipfile

# Resolve the files relative to the notebook's working directory, like every
# other data path in this notebook, instead of a machine-specific home path.
test_target_list = sorted(glob.glob('./data/test_target/*.csv'))

# Write ./data/submission.zip with every CSV stored under its bare file name.
# No os.chdir(): changing the working directory would break all later
# relative paths and make this cell fail when run a second time.
# The context manager closes the archive even if a write fails.
with zipfile.ZipFile('./data/submission.zip', 'w') as submission:
    for path in test_target_list:
        submission.write(path, arcname=os.path.basename(path))