In [1]:
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import os
import glob
import random

import warnings
warnings.filterwarnings("ignore")

## Baseline

In [3]:
# Load the raw training table (the rows displayed below are spaced 30 minutes apart: Minute is 0 or 30).
train = pd.read_csv('./EXAMPLE_DACON/train.csv')

In [4]:
train.tail()

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
52555,1094,21,30,0,0,2.4,70.7,-4,0.0
52556,1094,22,0,0,0,2.4,66.79,-4,0.0
52557,1094,22,30,0,0,2.2,66.78,-4,0.0
52558,1094,23,0,0,0,2.1,67.72,-4,0.0
52559,1094,23,30,0,0,2.1,67.7,-4,0.0


In [5]:
def preprocess_data(data, is_train=True):
    """Select the model features and, for training data, attach the two forecast targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'Hour', 'TARGET', 'DHI', 'DNI',
        'WS', 'RH' and 'T'.
    is_train : bool, default True
        Truthy -> add 'Target1' and 'Target2' (TARGET 48 and 96 rows ahead),
                  drop rows containing NaN, then drop the last 96 rows.
        Falsy  -> return only the last 48 rows of the feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    feature_cols = ['Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']
    # Copy *after* selecting so the target columns are added to an independent frame,
    # not to a slice of a copy.
    temp = data[feature_cols].copy()

    if not is_train:
        return temp.iloc[-48:, :]

    # Series.ffill() is the supported spelling of the deprecated fillna(method='ffill').
    temp['Target1'] = temp['TARGET'].shift(-48).ffill()
    temp['Target2'] = temp['TARGET'].shift(-48 * 2).ffill()
    temp = temp.dropna()

    # The shifted columns end in forward-filled (not observed) values, so the
    # trailing 96 rows are discarded.
    return temp.iloc[:-96]


# Build the training frame (features + Target1/Target2) and display its first 48 rows.
df_train = preprocess_data(train)
df_train.iloc[:48]

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T,Target1,Target2
0,0,0.0,0,0,1.5,69.08,-12,0.0,0.0
1,0,0.0,0,0,1.5,69.06,-12,0.0,0.0
2,1,0.0,0,0,1.6,71.78,-12,0.0,0.0
3,1,0.0,0,0,1.6,71.75,-12,0.0,0.0
4,2,0.0,0,0,1.6,75.2,-12,0.0,0.0
5,2,0.0,0,0,1.5,69.29,-11,0.0,0.0
6,3,0.0,0,0,1.5,72.56,-11,0.0,0.0
7,3,0.0,0,0,1.4,72.55,-11,0.0,0.0
8,4,0.0,0,0,1.3,74.62,-11,0.0,0.0
9,4,0.0,0,0,1.3,74.61,-11,0.0,0.0


In [6]:
# Raw rows 48-95: their TARGET values are what Target1 holds for rows 0-47 (shift of 48 rows).
train.iloc[48:96]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
48,1,0,0,0,0,1.6,90.66,-10,0.0
49,1,0,30,0,0,1.6,90.68,-10,0.0
50,1,1,0,0,0,1.6,88.11,-11,0.0
51,1,1,30,0,0,1.6,88.11,-11,0.0
52,1,2,0,0,0,1.6,90.85,-11,0.0
53,1,2,30,0,0,1.6,90.84,-11,0.0
54,1,3,0,0,0,1.7,93.78,-12,0.0
55,1,3,30,0,0,1.7,93.77,-12,0.0
56,1,4,0,0,0,1.7,90.46,-12,0.0
57,1,4,30,0,0,1.6,90.46,-12,0.0


In [7]:
# Raw rows 96-143: their TARGET values are what Target2 holds for rows 0-47 (shift of 96 rows).
train.iloc[48+48:96+48]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
96,2,0,0,0,0,2.2,73.8,-8,0.0
97,2,0,30,0,0,2.1,68.2,-8,0.0
98,2,1,0,0,0,2.1,69.06,-8,0.0
99,2,1,30,0,0,2.1,69.04,-8,0.0
100,2,2,0,0,0,2.2,69.38,-8,0.0
101,2,2,30,0,0,2.2,69.36,-7,0.0
102,2,3,0,0,0,2.2,70.03,-7,0.0
103,2,3,30,0,0,2.2,70.02,-7,0.0
104,2,4,0,0,0,2.3,72.18,-7,0.0
105,2,4,30,0,0,2.4,66.78,-7,0.0


In [8]:
df_train.tail()

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T,Target1,Target2
52459,21,0.0,0,0,3.5,55.97,-1,0.0,0.0
52460,22,0.0,0,0,3.9,54.23,-2,0.0,0.0
52461,22,0.0,0,0,4.1,54.21,-2,0.0,0.0
52462,23,0.0,0,0,4.3,56.46,-2,0.0,0.0
52463,23,0.0,0,0,4.1,56.44,-2,0.0,0.0


In [9]:
# Build the test feature matrix: the last 48 rows of each of the 41 test files, stacked in file order.
df_test = []

for file_no in range(41):
    test_frame = pd.read_csv(f'./EXAMPLE_DACON/test/{file_no}.csv')
    df_test.append(preprocess_data(test_frame, is_train=False))

X_test = pd.concat(df_test)
X_test.shape

(1968, 7)

In [10]:
X_test.head(48)

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T
288,0,0.0,0,0,0.8,80.92,-2.8
289,0,0.0,0,0,0.9,81.53,-2.9
290,1,0.0,0,0,1.0,79.91,-3.0
291,1,0.0,0,0,0.9,79.91,-3.0
292,2,0.0,0,0,0.9,77.2,-3.0
293,2,0.0,0,0,1.1,78.36,-3.2
294,3,0.0,0,0,1.2,76.78,-3.4
295,3,0.0,0,0,1.8,77.93,-3.6
296,4,0.0,0,0,2.4,76.89,-3.8
297,4,0.0,0,0,2.9,78.64,-4.1


In [11]:
df_train.head()

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T,Target1,Target2
0,0,0.0,0,0,1.5,69.08,-12,0.0,0.0
1,0,0.0,0,0,1.5,69.06,-12,0.0,0.0
2,1,0.0,0,0,1.6,71.78,-12,0.0,0.0
3,1,0.0,0,0,1.6,71.75,-12,0.0,0.0
4,2,0.0,0,0,1.6,75.2,-12,0.0,0.0


In [12]:
df_train.iloc[-48:]

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T,Target1,Target2
52416,0,0.0,0,0,3.4,50.47,-1,0.0,0.0
52417,0,0.0,0,0,3.4,50.45,0,0.0,0.0
52418,1,0.0,0,0,3.5,50.3,0,0.0,0.0
52419,1,0.0,0,0,3.5,50.3,0,0.0,0.0
52420,2,0.0,0,0,3.5,51.14,0,0.0,0.0
52421,2,0.0,0,0,3.5,51.12,0,0.0,0.0
52422,3,0.0,0,0,3.6,52.09,0,0.0,0.0
52423,3,0.0,0,0,3.7,48.39,0,0.0,0.0
52424,4,0.0,0,0,3.8,49.0,0,0.0,0.0
52425,4,0.0,0,0,3.9,49.0,0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split
# Features = every column except the last two; labels = second-to-last column (Target1) and last column (Target2).
# Both calls receive the same feature frame, test_size and random_state.
# NOTE(review): the split is shuffled (see the non-sequential index in X_train_1.head()), so validation rows
# are interleaved in time with training rows — confirm that this is intended for a forecasting task.
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -2], test_size=0.3, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -1], test_size=0.3, random_state=0)

In [14]:
X_train_1.head()

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T
32908,14,1.783051,19,0,2.3,63.64,3
10140,6,12.103848,45,473,1.8,66.74,19
32182,11,56.300682,77,929,2.2,39.56,13
20953,12,65.401188,310,525,1.5,23.71,15
36079,15,19.614206,70,536,0.5,46.63,0


In [15]:
X_test.head()

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T
288,0,0.0,0,0,0.8,80.92,-2.8
289,0,0.0,0,0,0.9,81.53,-2.9
290,1,0.0,0,0,1.0,79.91,-3.0
291,1,0.0,0,0,0.9,79.91,-3.0
292,2,0.0,0,0,0.9,77.2,-3.0


In [16]:
X_test.head()

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T
288,0,0.0,0,0,0.8,80.92,-2.8
289,0,0.0,0,0,0.9,81.53,-2.9
290,1,0.0,0,0,1.0,79.91,-3.0
291,1,0.0,0,0,0.9,79.91,-3.0
292,2,0.0,0,0,0.9,77.2,-3.0


In [17]:
# Quantile levels to predict; train_data() fits one model per level.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [18]:
from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test, early_stopping_rounds=None):
    """Train a LightGBM quantile regressor for quantile ``q`` and predict ``X_test``.

    Parameters
    ----------
    q : float
        Quantile level, passed to LightGBM as ``alpha``.
    X_train, Y_train
        Training features and target.
    X_valid, Y_valid
        Validation features and target, passed to ``fit`` as ``eval_set``.
    X_test
        Features to predict.
    early_stopping_rounds : int or None, default None
        When given, an early-stopping callback with this patience is attached
        to ``fit``. ``None`` (the default) keeps the previous behaviour of
        always building all ``n_estimators`` trees.

    Returns
    -------
    tuple
        ``(pred, model)``: test predictions as a pandas Series rounded to
        2 decimals, and the fitted LGBMRegressor.
    """
    # (a) Modeling
    # `bagging_fraction=0.7` was removed: it is an alias of `subsample`, which is
    # already set to the same value.
    # NOTE(review): per the LightGBM parameter docs, row subsampling only takes effect when
    # `subsample_freq` (bagging_freq) is > 0 — confirm whether bagging was intended here.
    model = LGBMRegressor(objective='quantile', alpha=q,
                          n_estimators=10000, learning_rate=0.027,
                          subsample=0.7, force_row_wise=True)

    callbacks = None
    if early_stopping_rounds is not None:
        # Imported locally so the cell's import line does not need to change.
        from lightgbm import early_stopping
        callbacks = [early_stopping(early_stopping_rounds)]

    model.fit(X_train, Y_train, eval_metric=['quantile'],
              eval_set=[(X_valid, Y_valid)], callbacks=callbacks)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [19]:
# Predict one target at every quantile level

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one LGBM model per entry of the module-level ``quantiles`` list.

    Parameters
    ----------
    X_train, Y_train, X_valid, Y_valid, X_test
        Forwarded unchanged to ``LGBM`` for every quantile level.

    Returns
    -------
    tuple
        ``(LGBM_models, LGBM_actual_pred)``: the fitted models in quantile
        order, and a DataFrame of test predictions with one column per
        quantile level.
    """
    LGBM_models = []
    quantile_preds = []

    for q in quantiles:
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        quantile_preds.append(pred)

    # Concatenate once instead of growing a DataFrame inside the loop;
    # pd.concat rejects an empty list, so keep the empty-frame result for that case.
    if quantile_preds:
        LGBM_actual_pred = pd.concat(quantile_preds, axis=1)
    else:
        LGBM_actual_pred = pd.DataFrame()
    LGBM_actual_pred.columns = quantiles

    return LGBM_models, LGBM_actual_pred

In [20]:
# Target1 (TARGET 48 rows ahead): fit one model per quantile, then preview the first 48 test predictions
models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_test)
results_1.sort_index()[:48]

0.1
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.2
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.3
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.4
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.5
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.6
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
[LightGBM] [Info] Start training from score 9.194565
0.7
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
[LightGBM] [Info] Start training from

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Target2 (TARGET 96 rows ahead): fit one model per quantile, then preview the first 48 test predictions
models_2, results_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_test)
results_2.sort_index()[:48]

0.1
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.2
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.3
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.4
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.5
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
0.6
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
[LightGBM] [Info] Start training from score 9.008802
0.7
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 36724, number of used features: 7
[LightGBM] [Info] Start training from

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
results_1.sort_index().iloc[:48]

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
## Model validation code (duplicated / kept for reference as an example)

In [24]:
results_1

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...
1963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.33
1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.33
1965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.08
1966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.08


In [26]:
# Shape sanity check: run every fitted model over the pooled validation rows and
# print the quantile level with the number of predictions and labels.
models = [*models_1, *models_2]
model_quantiles = quantiles * 2  # the same levels once for Target1 and once for Target2

# Use dedicated names: the original rebound X_test here, replacing the real test
# features, so re-running the training cells afterwards predicted on validation rows.
X_valid_all = pd.concat([X_valid_1, X_valid_2])
Y_valid_all = pd.concat([Y_valid_1, Y_valid_2])

results = []

for model, quantile in zip(models, model_quantiles):
    pred = model.predict(X_valid_all)
    print(quantile, len(pred), len(Y_valid_all))
    # The original line here was a bare `results` expression (a no-op);
    # presumably the predictions were meant to be collected — confirm.
    results.append(pred)

0.1 31480 31480
0.2 31480 31480
0.3 31480 31480
0.4 31480 31480
0.5 31480 31480
0.6 31480 31480
0.7 31480 31480
0.8 31480 31480
0.9 31480 31480
0.1 31480 31480
0.2 31480 31480
0.3 31480 31480
0.4 31480 31480
0.5 31480 31480
0.6 31480 31480
0.7 31480 31480
0.8 31480 31480
0.9 31480 31480


In [27]:
from sklearn.metrics import mean_squared_error, mean_pinball_loss
import numpy as np
import pandas as pd

def _predict_best(model, X):
    num_iter = getattr(model, "best_iteration_", None)
    return model.predict(X, num_iteration=num_iter)

def evaluate_models(models, X_valid, Y_valid, target_name):
    """Score each fitted model on the validation set and print a short prediction preview.

    For every model the quantile level is read from its ``alpha`` parameter,
    predictions are obtained through ``_predict_best``, and RMSE plus pinball
    (quantile) loss are computed against ``Y_valid``. The pinball loss is NaN
    when the model exposes no ``alpha``.

    Returns a DataFrame with columns target, quantile, best_iter, valid_rmse and
    valid_quantile_loss, sorted by target and quantile with a fresh index.
    """
    metric_rows = []
    # (target, quantile, first three predictions) per model, printed at the end
    preview = []

    for fitted in models:
        level = fitted.get_params().get('alpha', None)  # recover the quantile level
        preds = _predict_best(fitted, X_valid)          # not rounded, to keep full precision
        root_mse = np.sqrt(mean_squared_error(Y_valid, preds))
        if level is None:
            pinball = np.nan
        else:
            pinball = mean_pinball_loss(Y_valid, preds, alpha=level)

        metric_rows.append({
            "target": target_name,
            "quantile": level,
            "best_iter": getattr(fitted, "best_iteration_", None),
            "valid_rmse": root_mse,
            "valid_quantile_loss": pinball
        })
        preview.append((target_name, level, preds[:3].tolist()))

    summary = pd.DataFrame(metric_rows)
    summary = summary.sort_values(["target", "quantile"])
    summary = summary.reset_index(drop=True)

    # Print the sample predictions after all models have been evaluated
    print(f"=== [{target_name}] prediction samples (first 3) ===")
    for tgt, level, head in preview:
        print(f"target={tgt}, q={level} -> {head}")

    return summary

# ───────────── Run the evaluation (each target evaluated separately) ─────────────
metrics_1 = evaluate_models(models_1, X_valid_1, Y_valid_1, target_name="target1")
metrics_2 = evaluate_models(models_2, X_valid_2, Y_valid_2, target_name="target2")

# View both targets together
metrics_all = pd.concat([metrics_1, metrics_2], ignore_index=True)
print("\n=== Validation Metrics (by target & quantile) ===")
print(metrics_all)

# (Optional) inspect the first rows as a sorted table
display(metrics_all.sort_values(["target","quantile"]).head(20))

=== [target1] prediction samples (first 3) ===
target=target1, q=0.1 -> [0.0, 0.0, 28.629370328113115]
target=target1, q=0.2 -> [0.0, 0.0, 36.8808199978765]
target=target1, q=0.3 -> [0.0, 0.0, 46.241024982178594]
target=target1, q=0.4 -> [0.0, 0.0, 52.7615861481495]
target=target1, q=0.5 -> [0.0, 0.0, 64.34229635846232]
target=target1, q=0.6 -> [3.648303126912242e-34, 3.648303126912242e-34, 78.26591329037521]
target=target1, q=0.7 -> [3.685410684227883e-34, 3.685410684227883e-34, 76.62629939950838]
target=target1, q=0.8 -> [3.702640462849768e-34, 3.702640462849768e-34, 80.95060344347748]
target=target1, q=0.9 -> [3.609244118173879e-34, 3.609244118173879e-34, 88.76281595441039]
=== [target2] prediction samples (first 3) ===
target=target2, q=0.1 -> [0.0, 0.0, 23.94481646468797]
target=target2, q=0.2 -> [0.0, 0.0, 47.66725930255752]
target=target2, q=0.3 -> [0.0, 0.0, 49.256923804351814]
target=target2, q=0.4 -> [0.0, 0.0, 59.40771504964204]
target=target2, q=0.5 -> [0.0, 0.0, 58.3181275

Unnamed: 0,target,quantile,best_iter,valid_rmse,valid_quantile_loss
0,target1,0.1,0,17.237637,1.45617
1,target1,0.2,0,13.516711,2.163399
2,target1,0.3,0,11.854902,2.520742
3,target1,0.4,0,11.2189,2.623273
4,target1,0.5,0,11.221407,2.528018
5,target1,0.6,0,11.729128,2.288955
6,target1,0.7,0,12.560108,1.906309
7,target1,0.8,0,13.701551,1.405593
8,target1,0.9,0,15.187288,0.782083
9,target2,0.1,0,21.093042,1.377484


In [28]:
import numpy as np
import pandas as pd

def add_composite_score(metrics_all: pd.DataFrame, 
                        w_rmse: float = 0.5, 
                        w_qloss: float = 0.5,
                        groupby_col: str = "target"):
    """
    Add a composite quality score to a metrics table.

    metrics_all: columns = ['target','quantile','valid_rmse','valid_quantile_loss', ...]
    Returns: a sorted copy with RMSE_norm, QLoss_norm, score and score_0_100 columns added.
    """
    norm_cols = ['RMSE_norm', 'QLoss_norm']
    scored = metrics_all.copy()

    # 1) Per-group scale: the group median of each metric (a zero median becomes NaN)
    median_rmse = scored.groupby(groupby_col)['valid_rmse'].transform('median').replace({0: np.nan})
    median_qloss = scored.groupby(groupby_col)['valid_quantile_loss'].transform('median').replace({0: np.nan})

    # 2) Normalise (larger is worse; above 1 means worse than the group median)
    scored['RMSE_norm'] = scored['valid_rmse'] / median_rmse
    scored['QLoss_norm'] = scored['valid_quantile_loss'] / median_qloss

    # Guard against infinities and gaps: inf -> NaN, then NaN -> column median
    scored[norm_cols] = scored[norm_cols].replace([np.inf, -np.inf], np.nan)
    scored[norm_cols] = scored[norm_cols].fillna(scored[norm_cols].median())

    # 3) Weighted sum, inverted so that a higher score is better
    weighted = w_rmse * scored['RMSE_norm'] + w_qloss * scored['QLoss_norm']
    # A zero denominator (rare) is swapped for the smallest non-zero one
    weighted = weighted.replace(0, weighted[weighted != 0].min())
    scored['score'] = 1.0 / weighted

    # 4) Readable 0-100 scale (min-max within each group)
    def _to_percent(values):
        lo, hi = values.min(), values.max()
        if hi > lo:
            return 100 * (values - lo) / (hi - lo)
        return 100.0

    scored['score_0_100'] = scored.groupby(groupby_col)['score'].transform(_to_percent).round(2)

    # 5) Best score first within each group
    return (scored
            .sort_values([groupby_col, 'score'], ascending=[True, False])
            .reset_index(drop=True))

# Usage example: add the composite columns, then average the per-model scores over both targets
metrics_all = add_composite_score(metrics_all, w_rmse=0.5, w_qloss=0.5)
score = np.mean(metrics_all['score'])


In [29]:
print(f"SCORE: {score}")

SCORE: 1.0243893896745009
