# 0. Libraries

In [1]:
%%capture
# Install colorama (used below for coloured console output) and upgrade LightGBM.
! pip install colorama
# NOTE(review): `--install-option` was removed in pip 23.1, so this line may fail on a
# current pip; confirm how the LightGBM version in use expects a GPU build to be requested.
! pip install --upgrade lightgbm --install-option=--gpu

In [2]:
%%capture

import os
import random
import warnings
import datetime
import gc; gc.collect()
import pickle
from tqdm.notebook import tqdm

import pandas as pd
pd.set_option('mode.chained_assignment',  None)
import numpy as np 
from glob import glob

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from colorama import Fore, Back, Style

import plotly.express as px

# Mount Google Drive at ROOT (requires the google.colab package, i.e. a Colab runtime).
ROOT = "/content/drive/"
from google.colab import drive; drive.mount(ROOT)

# Project directory on the mounted drive. ROOT already ends with "/", so the f-string
# below produces a doubled slash in PATH.
PATH = f"{ROOT}/MyDrive/Dacon/GrowRegression_32th/"
# Change the working directory so the relative paths used later resolve under PATH.
%cd $PATH

# 1. Define Functions

## Set Seed

In [3]:
# Fix the random seeds
def setSeed(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`.

    Args:
        seed: Value handed to `random.seed` and `np.random.seed`; its string
            form is written to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

## Metric

In [4]:
# ====================================================
# Competition Metric: RMSE expressed on a x100 scale
# ====================================================
def customRMSE(y_pred, y_true):
    """Return 100 times the root-mean-squared difference of the two inputs.

    Args:
        y_pred: Predicted values; must support elementwise subtraction with
            `y_true` and expose a `.mean()` method on the squared result.
        y_true: Reference values, same requirements as `y_pred`.

    Returns:
        100 * sqrt(mean((y_pred - y_true) ** 2)).
    """
    residual = y_pred - y_true
    mean_squared_error = (residual ** 2).mean()
    return 100 * np.sqrt(mean_squared_error)

# ====================================================
# Competition Metric wrapped as a LightGBM `feval`
# ====================================================
def lgb_customRMSE(y_pred, y_true):
    """Evaluate `customRMSE` in the form expected by `lgb.train(feval=...)`.

    Args:
        y_pred: Predictions passed in by LightGBM.
        y_true: Object exposing `get_label()`; the labels it returns are
            compared against `y_pred`.

    Returns:
        Tuple `('grow_metric', score, False)`: metric name, metric value and
        the "higher is better" flag (False, i.e. lower is better).
    """
    labels = y_true.get_label()
    score = customRMSE(labels, y_pred)
    higher_is_better = False
    return 'grow_metric', score, higher_is_better

# 2. Train Model

## File Path

In [5]:
# Each fold is stored as an input parquet plus a label parquet. Both globs are sorted so
# that position i of the input list and position i of the label list refer to the same fold.
trainInputList = sorted(glob("data/data_kfold/input/train*"))
trainLabelList = sorted(glob("data/data_kfold/label/train*"))
validInputList = sorted(glob("data/data_kfold/input/valid*"))
validLabelList = sorted(glob("data/data_kfold/label/valid*"))

# zip() silently truncates to the shorter list, which would pair folds with the wrong
# (or no) labels without any error, so fail loudly on a count mismatch instead.
if len(trainInputList) != len(trainLabelList):
    raise ValueError(
        f"train input/label file count mismatch: {len(trainInputList)} vs {len(trainLabelList)}"
    )
if len(validInputList) != len(validLabelList):
    raise ValueError(
        f"valid input/label file count mismatch: {len(validInputList)} vs {len(validLabelList)}"
    )

trainList = list(zip(trainInputList, trainLabelList))
validList = list(zip(validInputList, validLabelList))

# The training loop indexes validList with the fold index taken from trainList.
if len(trainList) != len(validList):
    raise ValueError(
        f"train/valid fold count mismatch: {len(trainList)} vs {len(validList)}"
    )

del trainInputList, trainLabelList, validInputList, validLabelList; gc.collect()


# Test inputs and the sample-submission files, both in sorted order
testPathList = sorted(glob("data/data_agg/test*"))
submissions = sorted(glob("submission/sample_submission/*"))

## Parameters

In [8]:
# Debug switch: when True the number of boosting iterations is capped at 100
DEBUG = False

# Global seed, applied immediately and reused inside the LightGBM parameters
SEED = 42
setSeed(SEED)

# Scale factor applied to the target column before training
LabelMultiplier = 300

# LightGBM training parameters
params = dict(
    seed=SEED,
    device_type='gpu',
    boosting='gbdt',
    objective='regression',
    num_iterations=1000000 if not DEBUG else 100,
    # A better model would probably need a learning rate of around 5e-6.
    learning_rate=5e-3,
    feature_fraction=0.1,
    bagging_seed=SEED,
    bagging_fraction=0.75,
    bagging_freq=5,
    num_leaves=80,
    min_data_in_leaf=30,
    min_sum_hessian_in_leaf=1e-3,
    max_depth=-1,
    max_bin=128,
    gpu_use_dp=True,
    verbosity=-1,
    n_jobs=-1,
)

## Run 

In [9]:
# Test predictions of the selected models, keyed by 1-based fold number.
# Each value is a list of (testCase, trainID, validID, validScore, dailyPrediction) tuples.
Output = dict()

# Train with k-fold: trainList[k] / validList[k] hold the (input, label) parquet paths of fold k
for k in tqdm(range(len(trainList))):

    x_train, train_label = trainList[k]
    x_valid, valid_label = validList[k]

    print("*"*50)
    print(f"{Fore.GREEN}{Style.BRIGHT} FOLD {k+1} STRAT {Style.RESET_ALL}")
    print("*"*50)
    # Load the data. The targets are scaled by LabelMultiplier for training; explicit copies
    # are taken so the scaling never writes through to the feature frames.
    x_train = pd.read_parquet(x_train)
    y_train = x_train[['ID', 'rate']].copy()
    y_train['rate'] = y_train['rate'] * LabelMultiplier

    x_valid = pd.read_parquet(x_valid)
    y_valid = x_valid[['ID', 'rate']].copy()
    y_valid['rate'] = y_valid['rate'] * LabelMultiplier

    train_label = pd.read_parquet(train_label)  # NOTE(review): loaded but not used in this cell
    valid_label = pd.read_parquet(valid_label)

    # Every column except the identifier and the target is used as a feature
    feature = [x for x in x_train.columns if x not in ['ID', 'rate']]

    # One model is trained for every (train ID, valid ID) pair of the fold
    trainID = sorted(x_train['ID'].unique())
    validID = sorted(y_valid['ID'].unique())

    # The per-valid-ID slices do not depend on the train ID, so build them once per fold
    # instead of re-filtering the frames inside the inner loop.
    validFeature, validTarget, validTrue = {}, {}, {}
    for _validID in validID:
        validFeature[_validID] = x_valid.loc[x_valid['ID'] == _validID, feature]
        validTarget[_validID] = y_valid.loc[y_valid['ID'] == _validID, 'rate']
        validTrue[_validID] = valid_label.loc[valid_label['ID'] == _validID, 'rate'].values

    # Test predictions of the best model of every train ID
    testOutput = []

    # Train
    for e_1, _trainID in enumerate(trainID):

        print(f"{Fore.GREEN}{Style.BRIGHT} {'*'*5}Train Data {e_1+1} / {len(trainID)} STRAT{'*'*5} {Style.RESET_ALL}")

        # LightGBM train data
        INPUT_TRAIN = lgb.Dataset(x_train.loc[x_train['ID'] == _trainID, feature],
                                  label=y_train.loc[y_train['ID'] == _trainID, 'rate'],
                                  )
        # Reset the selection for this train ID. Without resetting kth_model, a train ID
        # whose scores never beat the sentinel (e.g. all NaN) would silently reuse the
        # model chosen for the previous train ID.
        _validScore = 1e10
        kth_model = None
        for e_2, _validID in enumerate(validID):
            print(f"{Fore.GREEN}{Style.BRIGHT} Valid Data {e_2+1} / {len(validID)} STRAT {Style.RESET_ALL}")
            # LightGBM valid data (binned against the current training set)
            INPUT_VALID = lgb.Dataset(validFeature[_validID],
                                      label=validTarget[_validID],
                                      reference=INPUT_TRAIN)

            # Build and train the model
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=UserWarning)

                model = lgb.train(params,
                    train_set = INPUT_TRAIN,
                    valid_sets = [INPUT_TRAIN, INPUT_VALID],
                    valid_names = ['train','valid'],
                    feval = lgb_customRMSE,
                    callbacks = [early_stopping(500, verbose=False)]
                    )

            # Validation loss: undo the label scaling, sum the predictions per day and
            # compare them with the label file of this valid ID.
            y_val_pred = model.predict(validFeature[_validID])
            pred = pd.Series(y_val_pred / LabelMultiplier,
                             index=validFeature[_validID].index,
                             name='pred').resample('D').sum()

            customMetric = customRMSE(pred.values, validTrue[_validID])

            if _validScore > customMetric:
                _validScore = customMetric
                kth_model = (_trainID, _validID, model)

                print(f"{Fore.RED}{Style.BRIGHT} {'*'*5} Best Model Updated {'*'*5} {Style.RESET_ALL}")
                print(f"{Fore.RED}{Style.BRIGHT} Validation Loss :: {_validScore}{Style.RESET_ALL}")

        if kth_model is None:
            # No validation score was usable, so there is no model to predict with.
            print(f"{Fore.RED}{Style.BRIGHT} No model selected for train ID {_trainID}; skipping test prediction {Style.RESET_ALL}")
            continue

        # Test prediction with the best model of this train ID
        _trainID, _validID, model = kth_model
        for i, testPath in enumerate(testPathList):
            testInput = pd.read_parquet(testPath)
            # Keep the original index and a float dtype so that resample() operates on the
            # frame's own index rather than on an object-typed reconstruction of it.
            testPred = pd.Series(model.predict(testInput[feature]),
                                 index=testInput.index,
                                 name='pred').to_frame()
            testPred = testPred.resample('D').sum()
            testPred = testPred / LabelMultiplier

            testOutput.append((i+1, _trainID, _validID, _validScore, testPred.values))

    Output[k+1] = testOutput

  0%|          | 0/5 [00:00<?, ?it/s]

**************************************************
[32m[1m FOLD 1 STRAT [0m
**************************************************
[32m[1m *****Train Data 1 / 47 STRAT***** [0m
[32m[1m Valid Data 1 / 11 STRAT [0m
[31m[1m ***** Best Model Updated ***** [0m
[31m[1m Validation Loss :: 30.87530376570826[0m
[32m[1m Valid Data 2 / 11 STRAT [0m
[32m[1m Valid Data 3 / 11 STRAT [0m
[31m[1m ***** Best Model Updated ***** [0m
[31m[1m Validation Loss :: 22.299874242005426[0m
[32m[1m Valid Data 4 / 11 STRAT [0m


KeyboardInterrupt: ignored

## Save Output

In [None]:
# Persist the collected predictions. The target directory is created first so that the
# write does not fail when "output/" does not exist yet.
os.makedirs("output", exist_ok=True)
with open("output/LightGBM.pkl", "wb") as f:
    pickle.dump(Output, f)

# 3. Ensemble

In [None]:
# Sample-submission files, in sorted order
submissionList = glob("submission/sample_submission/*")
submissionList.sort()

In [None]:
# Reload the pickled predictions into Output.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only load a
# pickle that was produced by a trusted run.
with open("output/LightGBM.pkl", "rb") as f:
    Output = pickle.load(f)

In [None]:
# Flatten Output ({fold: [(testCase, trainID, validID, validScore, pred), ...]}) into one
# row per stored prediction. The rows are collected first and the frame is built once,
# instead of growing it with pd.concat inside the loops.
records = [
    (fold, *pred_case[:4])
    for fold, pred in Output.items()
    for pred_case in pred
]
preds = [pred_case[4] for pred in Output.values() for pred_case in pred]

result = pd.DataFrame(records, columns=['fold', 'testCase', 'trainID', 'validID', 'validScore'])
# The prediction arrays are stored as opaque objects, one array per row.
result['pred'] = pd.Series(preds, index=result.index, dtype=object)
# Length of each prediction array
result['predLength'] = result['pred'].apply(len)

del records, preds; gc.collect()
result = result.reset_index(drop=True)

In [None]:
# Build one submission file per test case from a weighted average of its best models.
for case, submissionPath in zip(sorted(result['testCase'].unique()), submissionList):
    submission = pd.read_csv(submissionPath)
    # Work on copies so the added columns never write into `result`
    testCase = result[(result['testCase'] == case)].copy()

    # Keep the rows whose validation-score rank is below 4 (lower score ranks first)
    testCase['validRank'] = testCase['validScore'].rank(method='average')
    testCase = testCase[testCase['validRank'] < 4].copy()

    # Weight of each kept model: 1 - validScore / 100
    # NOTE(review): a validScore of 100 or more gives a zero or negative weight — confirm
    # that this cannot happen for the kept rows.
    testCase['weight'] = 1 - (testCase['validScore'] / 100)

    length = testCase['predLength'].iloc[0]
    testPred = np.zeros(shape=length)
    sumWeight = np.sum(testCase['weight'])
    for weight, pred in zip(testCase['weight'], testCase['pred']):
        testPred += weight * pred.flatten()

    # When the prediction is longer than the submission, keep only its trailing rows
    if len(submission) < len(testPred):
        testPred = testPred[len(testPred)-len(submission):]

    submission['rate'] = testPred / sumWeight

    outPath = submissionPath.replace('sample_submission', 'lightgbm')
    # Create the target directory on first use
    os.makedirs(os.path.dirname(outPath) or '.', exist_ok=True)
    # index=False keeps the written columns identical to those read from the sample file;
    # the default would prepend the row index as an extra unnamed column.
    submission.to_csv(outPath, index=False)