In [None]:
!pip install feature_engine
!pip install optuna

Collecting feature_engine
  Downloading feature_engine-1.6.1-py2.py3-none-any.whl (326 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/326.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m194.6/326.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.6.1
Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit, KFold
from feature_engine.encoding import OneHotEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.pipeline import Pipeline, make_pipeline
import optuna

In [None]:
# Mount Google Drive (Colab only); the CSV paths read later in this notebook
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# DATA

In [None]:
# NEW DATA
# NOTE(review): main() in the preprocessing section reads this same CSV again
# and its result is later assigned to `df`, so this raw load looks redundant —
# confirm nothing else relies on the unprocessed frame before removing it.
df = pd.read_csv("/content/drive/MyDrive/Datasets/데이콘/power_usage/new_df.csv")

# PREPROCESSING

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder

# Outlier handling
def remove_outliers(df):
    """Return a copy of ``df`` without the rows flagged as weather outliers.

    A row is removed when precipitation (``강수량(mm)``) is above 20 or wind
    speed (``풍속(m/s)``) is above 10. A missing value compares as False, so it
    never triggers removal on its own.

    The rows are selected with a boolean mask instead of
    ``df.drop(<index labels>)``: dropping by label also removes every other row
    that happens to share an index label with an outlier.

    NOTE(review): main() calls this before the train/test split, so rows that
    still need a prediction can be removed as well — confirm the submission row
    count still matches.
    """
    is_outlier = (df['강수량(mm)'] > 20) | (df['풍속(m/s)'] > 10)
    # .copy() keeps later column assignments from touching a view of the input.
    return df.loc[~is_outlier].copy()

# Date-related features
def process_date(df):
    """Add the 0/1 weekend flag ``주말`` to ``df`` (in place) and return ``df``.

    The flag is 1 where ``일주일`` equals 5 or 6 and 0 everywhere else.
    """
    is_weekend = df['일주일'].isin([5, 6])
    df['주말'] = is_weekend.astype('int64')
    return df

# Time-of-day binning
def bin_time(df):
    """Add the categorical column ``시간_binned`` to ``df`` (in place) and return it.

    Hours are mapped to three labels:
      * ``'아침'`` for 0 <= hour < 8
      * ``'점심'`` for 8 <= hour < 18
      * ``'저녁'`` for everything else (including missing hours)

    The first bin now includes hour 0. The previous ``0 < x < 8`` test sent
    midnight to ``'저녁'`` while 1-7 went to ``'아침'``, unlike the other bin,
    whose lower bound is inclusive.
    """
    def _label(hour):
        if 0 <= hour < 8:
            return '아침'
        if 8 <= hour < 18:
            return '점심'
        return '저녁'

    df['시간_binned'] = df['시간'].apply(_label)
    return df

# Temperature-related features
def process_temperature(df, window=24):
    """Add Fahrenheit and rolling-mean temperature columns to ``df`` in place.

    Columns written:
      * ``화씨기온(F)``  - ``기온(C)`` converted to Fahrenheit.
      * ``이동평균기온`` - mean of the last ``window`` rows of ``기온(C)``; rows
        with fewer than ``window`` preceding values are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``기온(C)``. When ``건물번호`` is present the rolling mean
        is computed separately per building, so the first rows of one building
        are not averaged with the last rows of the previous one.
    window : int, default 24
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    NOTE(review): assumes rows are in chronological order within each building
    — confirm against the CSV.
    """
    temperature = df['기온(C)']
    df['화씨기온(F)'] = (temperature * 9/5) + 32

    def _rolling_mean(series):
        return series.rolling(window=window).mean()

    if '건물번호' in df.columns:
        # transform() returns values aligned with the original row order.
        df['이동평균기온'] = temperature.groupby(df['건물번호']).transform(_rolling_mean)
    else:
        df['이동평균기온'] = _rolling_mean(temperature)
    return df

# Additional derived features
def calculate_extra_features(df):
    """Add interaction, squared and lagged weather features to ``df`` in place.

    Columns written:
      * ``기온_습도_interaction`` - temperature * humidity
      * ``기온_풍속_interaction`` - temperature * wind speed
      * ``기온_squared`` / ``습도_squared`` - squared temperature / humidity
      * ``기온_lag_1h`` - temperature of the previous row

    When ``건물번호`` is present the lag is taken within each building, so the
    first row of a building gets NaN rather than the last temperature of the
    preceding building.

    Returns the same ``df`` object.

    NOTE(review): "1h" assumes consecutive rows of a building are one hour
    apart; main() drops outlier rows before this runs, which can leave gaps —
    confirm.
    """
    temperature = df['기온(C)']
    humidity = df['습도(%)']

    df['기온_습도_interaction'] = temperature * humidity
    df['기온_풍속_interaction'] = temperature * df['풍속(m/s)']
    df['기온_squared'] = temperature ** 2
    df['습도_squared'] = humidity ** 2

    if '건물번호' in df.columns:
        df['기온_lag_1h'] = temperature.groupby(df['건물번호']).shift(1)
    else:
        df['기온_lag_1h'] = temperature.shift(1)
    return df

# Missing-value handling
def handle_missing_values(df):
    """Fill missing values in the weather and equipment columns of ``df``.

    * ``강수량(mm)``, ``태양광용량(kW)``, ``ESS저장용량(kWh)``, ``PCS용량(kW)``
      are filled with 0.
    * ``풍속(m/s)`` and ``습도(%)`` are filled with their own column median.

    The filled column is assigned back (``df[col] = df[col].fillna(...)``)
    instead of calling ``fillna(..., inplace=True)`` on ``df[col]``: the
    in-place call acts on a temporary Series and leaves ``df`` unchanged when
    pandas Copy-on-Write is active.

    Returns the same ``df`` object, modified in place.
    """
    zero_filled = ['강수량(mm)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    median_filled = ['풍속(m/s)', '습도(%)']

    for col in zero_filled:
        df[col] = df[col].fillna(0)
    for col in median_filled:
        df[col] = df[col].fillna(df[col].median())
    return df

# One-hot encoding
def one_hot_encoding(df):
    """One-hot encode the building type, building number and time-bin columns.

    ``건물번호`` is cast to ``object`` on ``df`` itself before the encoder is
    fitted, then the frame produced by ``OneHotEncoder.fit_transform`` is
    returned.
    """
    categorical_columns = ['건물유형', '건물번호', '시간_binned']
    df['건물번호'] = df['건물번호'].astype('object')
    one_hot = OneHotEncoder(variables=categorical_columns)
    encoded = one_hot.fit_transform(df)
    return encoded

# Main preprocessing entry point
def main(path="/content/drive/MyDrive/Datasets/데이콘/power_usage/new_df.csv"):
    """Load the raw CSV and run the full preprocessing pipeline.

    Steps, in order: outlier removal, weekend flag, time-of-day binning,
    temperature features, extra derived features, missing-value filling,
    removal of the ``일시`` column, one-hot encoding.

    Parameters
    ----------
    path : str, optional
        Location of the CSV to read. Defaults to the Drive path this notebook
        has always used, so ``main()`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame returned by ``one_hot_encoding``.

    NOTE(review): the interaction/squared features are computed before
    ``handle_missing_values`` runs, so they stay NaN wherever their inputs
    were missing — confirm this ordering is intended.
    """
    df = pd.read_csv(path)

    df = remove_outliers(df)
    df = process_date(df)
    df = bin_time(df)
    df = process_temperature(df)
    df = calculate_extra_features(df)
    df = handle_missing_values(df)
    # `columns=` instead of the positional axis argument (`drop([...], 1)`),
    # which pandas 2.0 no longer accepts.
    df = df.drop(columns=['일시'])
    df = one_hot_encoding(df)

    return df

# NOTE(review): inside a notebook kernel __name__ is "__main__", so this guard
# runs the whole pipeline once and discards the result; the next cell calls
# main() again to actually keep the frame.
if __name__ == "__main__":
    main()


In [None]:
# Build the preprocessed frame; the next cell splits it into training rows and
# rows to predict based on whether the target column is NaN.
df = main()

# Model

In [None]:
# Rows with a known target form the training set; rows whose target is NaN are
# the ones to predict. `columns=` replaces the positional axis argument
# (`drop([...], 1)`), which pandas 2.0 no longer accepts.
train = df[df['전력소비량(kWh)'].notna()]
test = df[df['전력소비량(kWh)'].isna()].drop(columns=['전력소비량(kWh)'])

X = train.drop(columns=['전력소비량(kWh)'])
y = train['전력소비량(kWh)']

In [None]:
%%time
# SMAPE metric
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Computes ``mean(2 * |y_pred - y_true| / (|y_pred| + |y_true|)) * 100``.
    Inputs are compared position by position after conversion to float arrays.

    A term where both the truth and the prediction are 0 contributes 0 (a
    perfect prediction) instead of the NaN produced by 0/0, which would
    otherwise turn the whole score into NaN.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(predicted) + np.abs(actual)
    # Only divide where the denominator is non-zero; the rest stay 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 1 / len(y_true) * np.sum(2 * ratio * 100)

target = '전력소비량(kWh)'
# Feature columns: every column of the prediction frame `test`.
original_features = test.columns

# Collects one (label, average SMAPE, out-of-fold predictions) tuple per scored
# model. Re-running this cell empties it.
result_list = []
def score_model(model, features_used, label=None):
    """Cross-validate ``model`` on ``train`` with a 10-split ``TimeSeriesSplit``.

    For every fold the model is refitted on the training part and SMAPE is
    printed for both the training part (``tscore``) and the validation part
    (``score``); the average validation SMAPE is printed at the end.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refitted on every fold.
    features_used : sequence of str
        Columns of ``train`` used as features. (Previously this argument was
        ignored and the global ``X`` was always used.)
    label : str, optional
        When given, ``(label, average score, oof)`` is appended to the global
        ``result_list``.

    Returns
    -------
    float
        Average validation SMAPE over the folds.

    Notes
    -----
    ``oof`` keeps 0 for rows that never fall in a validation part.

    NOTE(review): the recorded run shows training SMAPE below 1 against
    validation SMAPE in the 40-60 range; presumably rows are ordered by
    building so each fold validates on buildings not seen in training —
    confirm the row ordering before trusting these scores.
    """
    features = train[list(features_used)]
    labels = train[target]

    fold_scores = []
    # Float buffer regardless of the target dtype so predictions are never truncated.
    oof = np.zeros(len(train), dtype=float)
    tscv = TimeSeriesSplit(n_splits=10)

    for fold, (train_index, test_index) in enumerate(tscv.split(features, labels)):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        model.fit(X_train, y_train)
        tscore = smape(y_train, model.predict(X_train))
        y_test_pred = model.predict(X_test)
        score = smape(y_test, y_test_pred)
        print(f" Fold {fold} : tscore = {tscore:.3f} score = {score:.3f}")
        oof[test_index] = y_test_pred
        fold_scores.append(score)

    score = sum(fold_scores) / len(fold_scores)
    print(f" Avg. smape score : {score:.3f}")
    if label is not None:
        # Appending mutates the existing list, so no `global` statement is needed.
        result_list.append((label, score, oof))
    return score

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 12.6 µs


# XGB

## OPTUNA

In [None]:
import optuna
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Computes ``mean(2 * |y_pred - y_true| / (|y_pred| + |y_true|)) * 100``.
    Inputs are compared position by position after conversion to float arrays.

    A term where both the truth and the prediction are 0 contributes 0 (a
    perfect prediction) instead of the NaN produced by 0/0, which would
    otherwise turn the whole score into NaN.

    NOTE(review): this notebook defines ``smape`` in more than one cell; the
    definition executed last is the one in effect.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(predicted) + np.abs(actual)
    # Only divide where the denominator is non-zero; the rest stay 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 1 / len(y_true) * np.sum(2 * ratio * 100)

def objective_xgb(trial):
    """
    Objective function to tune a `XGBRegressor` model.

    Samples one hyper-parameter set from ``trial``, evaluates it with a
    10-split ``TimeSeriesSplit`` over ``train[original_features]`` /
    ``train[target]`` and returns the average validation SMAPE (lower is
    better).

    The hyper-parameters are sampled once, before the fold loop: they are the
    same for every fold of a trial, so re-requesting them per fold only added
    overhead. ``suggest_float`` replaces the deprecated ``suggest_uniform`` /
    ``suggest_loguniform`` with the same ranges (``log=True`` where the
    original was log-uniform).
    """
    params = {
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 3500),
        'subsample': trial.suggest_float('subsample', 0.05, 1, log=True),
        'random_state': 42
    }

    features = train[original_features]
    labels = train[target]
    tscv = TimeSeriesSplit(n_splits=10)
    scores = []

    for train_index, test_index in tscv.split(features, labels):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        # NOTE(review): 'gpu_hist' needs a GPU runtime; newer XGBoost releases
        # spell this differently — confirm against the installed version.
        model = XGBRegressor(**params, tree_method='gpu_hist')
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scores.append(smape(y_test, pred))

    return sum(scores) / len(scores)


In [None]:
# study = optuna.create_study(direction="minimize")
# study.optimize(objective_xgb, n_trials=10)

In [None]:
# Fixed XGBoost hyper-parameters (presumably copied from an earlier Optuna
# study; the study cell above is commented out).
xgb_params = {
    'min_child_weight': 4,
    'gamma': 0.3759792177779915,
    'colsample_bytree': 0.7153976698720064,
    'reg_alpha': 0.42305051144928957,
    'reg_lambda': 1.1159900972673142,
    'max_depth': 12,
    'learning_rate': 0.014873233526991941,
    'n_estimators': 3500,
    'subsample': 0.4053585436167561,
}

# Cross-validate the configuration and record it under the "XGB" label.
score_model(
    XGBRegressor(random_state=42, tree_method='gpu_hist', **xgb_params),
    features_used=test.columns,
    label="XGB",
)

 Fold 0 : tscore = 0.488 score = 43.242
 Fold 1 : tscore = 0.805 score = 60.814
 Fold 2 : tscore = 1.009 score = 64.335


# LGBM

In [None]:
# Baseline LightGBM run (hand-set parameters), recorded under the "LGBM" label.
score_model(
    LGBMRegressor(
        learning_rate=0.099,
        n_estimators=1000,
        random_state=42,
        verbose=-1,
    ),
    features_used=test.columns,
    label="LGBM",
)

 Fold 0 : tscore = 3.062 score = 34.427
 Fold 1 : tscore = 3.843 score = 59.939
 Fold 2 : tscore = 4.094 score = 66.635
 Fold 3 : tscore = 3.764 score = 48.140
 Fold 4 : tscore = 4.650 score = 46.431
 Fold 5 : tscore = 4.883 score = 54.277
 Fold 6 : tscore = 4.874 score = 64.756
 Fold 7 : tscore = 5.149 score = 33.994
 Fold 8 : tscore = 5.327 score = 36.677
 Fold 9 : tscore = 5.652 score = 39.403
 Avg. smape score : 48.468


## OPTUNA

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Computes ``mean(2 * |y_pred - y_true| / (|y_pred| + |y_true|)) * 100``.
    Inputs are compared position by position after conversion to float arrays.

    A term where both the truth and the prediction are 0 contributes 0 (a
    perfect prediction) instead of the NaN produced by 0/0, which would
    otherwise turn the whole score into NaN.

    NOTE(review): this notebook defines ``smape`` in more than one cell; the
    definition executed last is the one in effect.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(predicted) + np.abs(actual)
    # Only divide where the denominator is non-zero; the rest stay 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 1 / len(y_true) * np.sum(2 * ratio * 100)

def objective_lgbm(trial):
    """
    Objective function to tune a `LGBMRegressor` model.

    Samples one hyper-parameter set from ``trial``, evaluates it with a
    10-split ``TimeSeriesSplit`` over ``train[original_features]`` /
    ``train[target]`` and returns the average validation SMAPE (lower is
    better).

    Changes relative to the earlier version of this objective:
      * The minimum split gain is passed as ``min_split_gain``. It used to be
        passed as ``gamma``, which is the XGBoost name and is not a LightGBM
        parameter, so that search dimension had no effect on the model.
      * ``subsample_freq=1`` is set on the model. LightGBM only applies
        ``subsample`` (row bagging) when the bagging frequency is non-zero, so
        without it the sampled ``subsample`` value was ignored. It is not a
        trial parameter: add it by hand when reusing ``study.best_params``.
      * Hyper-parameters are sampled once, before the fold loop, with
        ``suggest_float`` in place of the deprecated ``suggest_uniform`` /
        ``suggest_loguniform`` (same ranges).
    """
    params = {
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 8, 25),
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 4000),
        'subsample': trial.suggest_float('subsample', 0.05, 1, log=True),
    }

    features = train[original_features]
    labels = train[target]
    tscv = TimeSeriesSplit(n_splits=10)
    scores = []

    for train_index, test_index in tscv.split(features, labels):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        model = LGBMRegressor(**params, subsample_freq=1, random_state=42, verbose=-1)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scores.append(smape(y_test, pred))

    return sum(scores) / len(scores)

In [None]:
# %%time
# study = optuna.create_study(direction="minimize")
# study.optimize(objective_lgbm, n_trials=20)

In [None]:
# Fixed LightGBM hyper-parameters.
# The former 'gamma': 1.3655561121466593 entry is omitted: `gamma` is the
# XGBoost name and is not a LightGBM parameter, so it had no effect on the
# fitted model.
# NOTE(review): 'subsample' is only applied by LightGBM when `subsample_freq`
# is non-zero, which is not set here — confirm whether bagging was intended.
lgbm_params = {
    'min_child_weight': 1,
    'colsample_bytree': 0.91218315174515,
    'reg_alpha': 0.5629425361195333,
    'reg_lambda': 5.727596183557824,
    'max_depth': -1,
    'learning_rate': 0.048484317534138836,
    'n_estimators': 3000,
    'subsample': 0.352325899422433,
}
# Recorded as "LGBM_tuned" so the entry in `result_list` can be told apart from
# the baseline run, which is stored under "LGBM".
score_model(
    LGBMRegressor(random_state=42, **lgbm_params, verbose=-1),
    features_used=test.columns,
    label="LGBM_tuned",
)

# FINAL MODEL

# Single model

In [None]:
# Final XGBoost model, fitted on all training rows.
model1 = XGBRegressor(
    **xgb_params,
    tree_method='gpu_hist',
    random_state=42,
)
model1.fit(X, y)

In [None]:
# Final LightGBM model, fitted on all training rows.
model2 = LGBMRegressor(
    random_state=42,
    verbose=-1,
    **lgbm_params,
)
model2.fit(X, y)

## VOTING

In [None]:
# Ensemble of the XGBoost and LightGBM models, fitted on all training rows.
# `fit` returns the estimator itself, so `model` and `vot_model` refer to the
# same object.
# NOTE(review): the submission cell further down predicts with `model1`, not
# with this ensemble — confirm which one is meant to be submitted.
vot_model = VotingRegressor(
    estimators=[('xgb', model1), ('lgbm', model2)],
)

model = vot_model.fit(X, y)

In [None]:
# Fill the sample submission with predictions for the rows in `test`, floor
# them at 0, and display how often each predicted value occurs.
# NOTE(review): predictions come from `model1` (XGBoost only) although a voting
# ensemble `model` is fitted above — confirm which one should be submitted.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may reference it.
dir = "/content/drive/MyDrive/Datasets/데이콘/power_usage/open"
ss = pd.read_csv(f"{dir}/sample_submission.csv")
ss['answer'] = model1.predict(test)
ss['answer'] = ss['answer'].clip(lower=0)
ss['answer'].value_counts()

1100.166260    2
859.477051     2
2080.144775    2
1441.541016    2
1553.209595    1
              ..
3171.648682    1
3181.994629    1
3172.928223    1
3173.538330    1
568.382874     1
Name: answer, Length: 16796, dtype: int64

In [None]:
# Write the submission file to the current working directory, without the index column.
# NOTE(review): the file name mentions "xgb_lgbm" and estimator counts of
# 3700 / 3500, while the visible parameters use n_estimators 3500 (XGB) and
# 3000 (LGBM) and the predictions come from `model1` only — the name looks
# stale; confirm before submitting.
ss.to_csv("./xgb_lgbm_xgb_maxdepth_12_xgb_esti_3700_lr_lgbm_esti_3500.csv", index=False)