# Stacking

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from tabpfn import TabPFNRegressor
import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import argparse
# Run configuration. argparse is used here as a typed container for defaults:
# an explicit empty argument list is parsed, so only the defaults below apply.
parser = argparse.ArgumentParser(description="stacking_tabpfn")
parser.add_argument('--scaler', default="standard", type=str)
parser.add_argument('--cv', default=10, type=int)
parser.add_argument('--seed', default=4, type=int)
args = parser.parse_args([])

cv = args.cv      # number of K-fold splits
seed = args.seed  # random seed shared by the notebook

# Supported --scaler values mapped to the scikit-learn class they select.
SCALER_CLASSES = {
    "standard": StandardScaler,
    "minmax": MinMaxScaler,
    "robust": RobustScaler,
}
# Fail right here on an unknown name instead of leaving `scaler` bound to a
# plain string that would only break later, at the first fit_transform call.
if args.scaler not in SCALER_CLASSES:
    raise ValueError(
        f"Unknown scaler {args.scaler!r}; expected one of {sorted(SCALER_CLASSES)}"
    )
scaler = SCALER_CLASSES[args.scaler]()

def set_seeds(seed=seed):
    """Seed the Python and NumPy global random number generators.

    Args:
        seed: Seed value. The default is the notebook-level ``seed`` as it
            was at the moment this function was defined.
    """
    # Environment variables hold text, hence the str() conversion.
    # NOTE(review): hash randomisation is presumably fixed when the
    # interpreter starts, so this likely only affects processes launched
    # afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the RNGs before any stochastic work, then show the run identifier
# (taken from the parser description) as this cell's output.
set_seeds()
idx = str(parser.description)
idx

'stacking_tabpfn'

## Preprocessing

In [2]:
# Load the training and test feature tables.
KI_train = pd.read_csv('./data/x_full_trainV2.csv')
KI_test = pd.read_csv('./data/x_testV2.csv')

# Columns discarded from both frames.
drop_columns = ['building_type', 'total_area', 'cooling_area', 'date']
# English-named originals, superseded by the Korean-named copies added below.
# Both frames must contain all three, otherwise the final drop raises.
raw_columns = ['building_number', 'date_time', 'power_consumption']

# Append Korean-named copies of the key columns ('일시' parsed to datetime),
# then remove the discarded columns. Only the training frame gets the target copy.
KI_train = KI_train.assign(**{
    '건물번호': KI_train['building_number'],
    '일시': pd.to_datetime(KI_train['date_time'], format='%Y-%m-%d %H:%M:%S'),
    '전력소비량(kWh)': KI_train['power_consumption'],
}).drop(columns=drop_columns)

KI_test = KI_test.assign(**{
    '건물번호': KI_test['building_number'],
    '일시': pd.to_datetime(KI_test['date_time'], format='%Y-%m-%d %H:%M:%S'),
}).drop(columns=drop_columns)

# Modelling frames: everything except the English-named originals.
train = KI_train.drop(columns=raw_columns)
test = KI_test.drop(columns=raw_columns)

train.columns

Index(['temperature', 'rainfall', 'windspeed', 'humidity', 'day', 'month',
       'is_weekend', 'weekday', 'holiday', 'temperature_squared',
       'humidity_squared', 'summer_cos', 'summer_sin', 'sin_hour', 'cos_hour',
       'sin_doy', 'cos_doy', 'CDH', 'CDD', 'THI', 'WCT', 'avg_temp',
       'max_temp', 'min_temp', 'temp_diff', 'avg_humid', 'max_humid',
       'min_humid', 'humid_diff', 'dow_hour_mean', 'dow_hour_std',
       'holiday_mean', 'holiday_std', 'hour_mean', 'hour_std',
       'month_hour_mean', 'month_hour_std', 'power_week_slope6h', '건물번호', '일시',
       '전력소비량(kWh)'],
      dtype='object')

## Modelling

In [3]:
# Base regressor, seeded with the notebook-level seed.
tabpfn = TabPFNRegressor(
    n_jobs=-1,
    random_state=seed,
)
def get_stacking_ml_datasets(model, X_train_n, y_train_n, X_test_n, n_folds, fitting=True):
    """Build stacking features for one base model with shuffled K-fold.

    For every fold the model is (optionally) refit on the fold's training
    split, then used to predict both the held-out split and the full test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refit in place, so on return it holds the fit from the last fold.
        X_train_n: Training features, array-like of shape (n_train, n_features).
        y_train_n: Training targets, array-like of length n_train.
        X_test_n: Test features, array-like of shape (n_test, n_features).
        n_folds: Number of K-fold splits.
        fitting: When truthy the model is refit on each fold. When falsy the
            model is used as-is and must already be fitted.

    Returns:
        Tuple ``(train_fold_pred, test_pred_mean)``: out-of-fold predictions
        of shape (n_train, 1) and the test predictions averaged over the
        folds, of shape (n_test, 1).
    """
    # Coerce to ndarrays so the positional fold indices below work even when
    # the caller passes pandas objects.
    X_train_n = np.asarray(X_train_n)
    y_train_n = np.asarray(y_train_n)
    X_test_n = np.asarray(X_test_n)

    # The split is seeded with the notebook-level `seed`.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))

    for fold, (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
        if fitting:
            model.fit(X_train_n[train_index], y_train_n[train_index])

        # np.ravel accepts both (n,) and (n, 1) prediction shapes.
        train_fold_pred[valid_index, 0] = np.ravel(model.predict(X_train_n[valid_index]))
        test_pred[:, fold] = np.ravel(model.predict(X_test_n))

    # Average the per-fold test predictions, keeping a column shape.
    test_pred_mean = test_pred.mean(axis=1, keepdims=True)

    return train_fold_pred, test_pred_mean

### Metrics

In [4]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_pred| + |y_true|)``. A term
    whose true value and prediction are both zero counts as 0 error instead
    of producing a 0/0 NaN that would turn the whole score into NaN.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_pred) + np.abs(y_true)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 they were initialised with.
    score = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(score) * 100

## Training

In [5]:
# Base estimators for the stacking stage; currently just the TabPFN regressor.
best_ml = [tabpfn]

In [6]:
# Per-building prediction arrays, in the order the buildings are processed.
preds_total = []
# Final predictions, aligned row-for-row with `test` so that a later
# positional assignment does not depend on how the buildings are ordered.
prediction = np.full(len(test), np.nan)
# Tracks which rows of `test` have received a prediction.
rows_filled = np.zeros(len(test), dtype=bool)

for b_num in tqdm(train['건물번호'].unique()):

    # One independent model stack per building.
    test_mask = (test["건물번호"] == b_num).to_numpy()
    train_df = train[train["건물번호"] == b_num]
    test_df = test[test_mask]

    X_train = train_df.drop(['건물번호', '일시', '전력소비량(kWh)'], axis=1)
    y_train = train_df['전력소비량(kWh)'].values

    X_test = test_df.drop(['건물번호', '일시'], axis=1)

    # The scaler is refit on this building's training rows only.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    meta_X_train = []
    meta_X_test = []

    # One meta-feature column per base estimator. No loop index is bound, so
    # the notebook-level run identifier `idx` is not overwritten.
    for estimator in best_ml:
        temp_X_train, temp_X_test = get_stacking_ml_datasets(
            estimator, X_train, y_train, X_test, cv
        )
        meta_X_train.append(temp_X_train)
        meta_X_test.append(temp_X_test)

    meta_X_train = np.hstack(meta_X_train)
    meta_X_test = np.hstack(meta_X_test)

    # Meta-learner: linear regression on the base models' out-of-fold predictions.
    meta_clf = LinearRegression()
    meta_clf.fit(meta_X_train, y_train)
    preds_partial = meta_clf.predict(meta_X_test)

    preds_total.append(preds_partial)
    # Write into the rows this building occupies in `test`.
    prediction[test_mask] = preds_partial
    rows_filled |= test_mask

# Every test row must have a prediction; a gap means `test` contains a
# building that never appears in `train`.
if not rows_filled.all():
    raise ValueError(
        f"{int((~rows_filled).sum())} test rows belong to buildings missing from train"
    )

prediction.shape

100%|██████████| 100/100 [45:55<00:00, 27.55s/it]


(16800,)

### Submission

In [7]:
# The output file name combines the run name and the seed.
submission_path = f'{parser.description}_{args.seed}.csv'

# Fill the template's 'answer' column with predictions rounded to 2 decimals;
# the values are assigned by position.
submission = pd.read_csv('./data/sample_submission.csv').assign(
    answer=np.round(prediction, 2)
)
submission.to_csv(submission_path, index=False)
submission.head()

Unnamed: 0,num_date_time,answer
0,1_20240825 00,3937.53
1,1_20240825 01,3639.63
2,1_20240825 02,3511.07
3,1_20240825 03,3025.68
4,1_20240825 04,2754.15
