In [1]:
import sys
import sktime
import tqdm as tq
import xgboost as xgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

print("-------------------------- Python & library version --------------------------")
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("numpy version: {}".format(np.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("tqdm version: {}".format(tq.__version__))
print("sktime version: {}".format(sktime.__version__))
print("xgboost version: {}".format(xgb.__version__))
print("seaborn version: {}".format(sns.__version__))
print("scikit-learn version: {}".format(skl.__version__))
print("------------------------------------------------------------------------------")

-------------------------- Python & library version --------------------------
Python version: 3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]
pandas version: 1.5.3
numpy version: 1.22.4
matplotlib version: 3.5.3
tqdm version: 4.64.0
sktime version: 0.13.2
xgboost version: 1.7.1
seaborn version: 0.11.2
scikit-learn version: 1.1.3
------------------------------------------------------------------------------


In [2]:
current_path = os.getcwd()
parent_path = os.path.abspath(os.path.join(current_path, '..', '..','..'))
sys.path.append(parent_path)
pd.set_option('display.max_columns', 30)
train = pd.read_csv('../../../../train.csv')
train.drop(['num_date_time'],axis=1,inplace=True)
test = pd.read_csv('../../../../test.csv')
test.drop(['num_date_time'],axis=1,inplace=True)
building = pd.read_csv('../../../../building_info.csv')
test = pd.read_csv('../../../../merge_test_encoding.csv', encoding = "CP949")
train_loc = pd.read_csv('../../../../train_location.csv')
train = pd.concat([train,train_loc['location']],axis=1)
def SMAPE(true, pred):
    v = 2 * abs(pred - true) / (abs(pred) + abs(true))
    output = np.mean(v) * 100
    return output

In [3]:
from preprocessing import fillnan
from preprocessing import preprocessing_all

train, test = fillnan(train, test)
train, test = preprocessing_all(train, test, building)

## XGB

In [77]:
def weighted_mse(alpha = 1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        grad = np.where(residual>0, -2*alpha*residual, -2*residual)
        hess = np.where(residual>0, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

In [78]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def objective_xgb(trial: Trial, X_train, y_train, X_val, y_val):
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 500, 5000),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_int('gamma', 1, 3),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        #'alpha': trial.suggest_loguniform('alpha', 1e-3, 100.0),
        'alpha': trial.suggest_loguniform('alpha', 1, 100.0),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 724
    }

    model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0, seed=724)
    alpha_value = model.get_params()['alpha']
    model.set_params(**{'objective':weighted_mse(alpha_value)})
    model.fit(X_train, y_train, verbose = False, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
    y_pred = model.predict(X_val)
    score = SMAPE(y_val, y_pred)

    return score


### 34 / 5 따로 fit

In [79]:
# 3, 4
preds = np.array([]) 
best_params = {}
best_values = {}  # New dictionary to store the best values
with open('best_params_values_0724_34.txt', 'w') as f:
    # Select the data for the current num
    num_data = train.copy()

    # Combine 'hour', 'day', 'week' to a new feature as stratified target
    #num_data['stratified_target'] = num_data['hour'].astype(str) + '_' + num_data['day'].astype(str) + '_' + num_data['week'].astype(str) + '_' + num_data['month'].astype(str)
    num_data['stratified_target'] = num_data['hour'].astype(str) + '_' + num_data['day'].astype(str)

    # Split the data into training and validation set
    train_df, val_df = train_test_split(num_data, test_size=0.2, stratify=num_data['stratified_target'], random_state=724)

    # Drop the temporary feature
    train_df = train_df.drop(columns='stratified_target')
    val_df = val_df.drop(columns='stratified_target')

    # Validation set에서 day 값이 3, 4이 아닌 행을 training set에 추가
    train_df = pd.concat([train_df, val_df[~val_df['day'].isin([3, 4])]])

    # Validation set에서 day 값이 3, 4인 행만 남기기
    val_df = val_df[val_df['day'].isin([3, 4])]

    y_train = train_df['power']
    y_val = val_df['power']

    x_train, x_test = train_df.drop(['date_time'],axis=1), test.drop(['date_time'],axis=1)
    x_val = val_df.drop(['date_time'],axis=1)

    x_train.drop(['power'],axis=1,inplace=True)
    x_val.drop(['power'],axis=1,inplace=True)

    x_test = x_test[x_train.columns]

    study = optuna.create_study(direction='minimize', sampler=TPESampler())
    study.optimize(lambda trial: objective_xgb(trial, x_train, y_train, x_val, y_val), n_trials=60)
    param = study.best_trial.params
    best_params = param
    best_values = study.best_trial.value  # Store the best value
    f.write(f'Best Params: {best_params}, \nBest Values: {best_values}\n\n')
    f.flush()
    xgb = XGBRegressor(**param, tree_method='gpu_hist', gpu_id=0, seed=724)
    alpha_value2 = xgb.get_params()['alpha']
    xgb.set_params(**{'objective':weighted_mse(alpha_value2)})
    ##근데 저렇게 validation 하면 학습 셋도 좀 이상해지는데 이게 맞나?0.4
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    preds_34 = np.append(preds, y_pred)


[I 2023-07-24 18:39:10,376] A new study created in memory with name: no-name-572304af-00ac-4154-b446-352d1d451561
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1, 100.0),
[I 2023-07-24 18:41:31,021] Trial 0 finished with value: 3.6113088453639355 and parameters: {'n_estimators': 967, 'max_depth': 14, 'min_child_weight': 48, 'gamma': 2, 'learning_rate': 0.01, 'colsample_bytree': 0.9777535759338825, 'lambda': 1.0868376013446523, 'alpha': 4.689702568963343, 'subsample': 0.7}. Best is trial 0 with value: 3.6113088453639355.
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1, 100.0),
[I 2023-07-24 18:42:01,010] Trial 1 finished with value: 5.105624474367024 and parameters: {'n_estimators': 1391, 'max_depth': 10, 'min_child_weight': 254, 'gamma': 3, 'learning_rate': 0.018, 'colsample_bytree': 0.8977212703645685, 'lambda': 0.010416240662998975, 'alpha': 88.2422920389377, 'subsamp

In [80]:
# 5
preds = np.array([]) 
best_params = {}
best_values = {}  # New dictionary to store the best values
with open('best_params_values_0724_5.txt', 'w') as f:
    # Select the data for the current num
    num_data = train.copy()

    # Combine 'hour', 'day', 'week' to a new feature as stratified target
    #num_data['stratified_target'] = num_data['hour'].astype(str) + '_' + num_data['day'].astype(str) + '_' + num_data['week'].astype(str) + '_' + num_data['month'].astype(str)
    num_data['stratified_target'] = num_data['hour'].astype(str) + '_' + num_data['day'].astype(str)

    # Split the data into training and validation set
    train_df, val_df = train_test_split(num_data, test_size=0.2, stratify=num_data['stratified_target'], random_state=724)

    # Drop the temporary feature
    train_df = train_df.drop(columns='stratified_target')
    val_df = val_df.drop(columns='stratified_target')

    # Validation set에서 day 값이 5이 아닌 행을 training set에 추가
    train_df = pd.concat([train_df, val_df[~val_df['day'].isin([5])]])

    # Validation set에서 day 값이 5인 행만 남기기
    val_df = val_df[val_df['day'].isin([5])]

    y_train = train_df['power']
    y_val = val_df['power']

    x_train, x_test = train_df.drop(['date_time'],axis=1), test.drop(['date_time'],axis=1)
    x_val = val_df.drop(['date_time'],axis=1)

    x_train.drop(['power'],axis=1,inplace=True)
    x_val.drop(['power'],axis=1,inplace=True)

    x_test = x_test[x_train.columns]

    study = optuna.create_study(direction='minimize', sampler=TPESampler())
    study.optimize(lambda trial: objective_xgb(trial, x_train, y_train, x_val, y_val), n_trials=60)
    param = study.best_trial.params
    best_params = param
    best_values = study.best_trial.value  # Store the best value
    f.write(f'Best Params: {best_params}, \nBest Values: {best_values}\n\n')
    f.flush()
    xgb = XGBRegressor(**param, tree_method='gpu_hist', gpu_id=0, seed=724)
    alpha_value2 = xgb.get_params()['alpha']
    xgb.set_params(**{'objective':weighted_mse(alpha_value2)})
    ##근데 저렇게 validation 하면 학습 셋도 좀 이상해지는데 이게 맞나?0.4
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    preds_5 = np.append(preds, y_pred)


[I 2023-07-25 02:27:42,317] A new study created in memory with name: no-name-be96a9f2-5ca6-45d9-b5de-f6971ffcc369
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1, 100.0),
[I 2023-07-25 02:33:00,990] Trial 0 finished with value: 4.23528428409535 and parameters: {'n_estimators': 2592, 'max_depth': 11, 'min_child_weight': 202, 'gamma': 2, 'learning_rate': 0.016, 'colsample_bytree': 0.7215467980651388, 'lambda': 0.013316346820036856, 'alpha': 4.42876358474851, 'subsample': 0.7}. Best is trial 0 with value: 4.23528428409535.
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1, 100.0),
[I 2023-07-25 02:35:58,449] Trial 1 finished with value: 4.112433293870127 and parameters: {'n_estimators': 4597, 'max_depth': 15, 'min_child_weight': 292, 'gamma': 2, 'learning_rate': 0.01, 'colsample_bytree': 0.8044991376580062, 'lambda': 0.005536369093604251, 'alpha': 97.20166857341464, 'subsamp

In [81]:
submission = pd.read_csv('sample_submission.csv')
submission['answer_34'] = preds_34
submission['answer_5'] = preds_5
# 'num_date_time'에서 년월일 부분만 추출
submission['date'] = submission['num_date_time'].apply(lambda x: x.split(' ')[1])
# 'date'가 '20220826' 또는 '20220827'인 행의 'power_5' 값을 0으로 변경
submission.loc[submission['date'].isin(['20220826', '20220827']), 'power_5'] = submission.loc[submission['date'].isin(['20220826', '20220827']), 'answer_34']
submission.loc[submission['date'].isin(['20220828']), 'answer_34'] = submission.loc[submission['date'].isin(['20220828']), 'power_5']
submission['answer'] = (submission['answer_34'] + submission['answer_5']) / 2
z = submission.copy()
submission = pd.read_csv('sample_submission.csv')
submission['answer'] = z['answer']
submission.to_csv('submit_v18_xgb_724_seperate.csv', index = False)
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1966.514038
1,1_20220825 01,1944.657959
2,1_20220825 02,1690.885437
3,1_20220825 03,1543.995850
4,1_20220825 04,1582.993103
...,...,...
16795,100_20220831 19,914.295166
16796,100_20220831 20,807.005890
16797,100_20220831 21,737.411530
16798,100_20220831 22,657.121155


# ens

In [34]:
a = pd.read_csv('submission2.csv')
b = pd.read_csv('submit_v12_xgb_optuna_iter41_8888.csv')
c = a.copy()
c['answer'] = 0
c['answer'] = a['answer']*0.42185 + b['answer']*0.57815
#c['answer'] = (a['answer'] + b['answer']) / 2
c.to_csv('submit_v13_xgb_optuna_8512.csv',index=False)
c

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2032.590085
1,1_20220825 01,2001.954761
2,1_20220825 02,1804.802206
3,1_20220825 03,1708.447136
4,1_20220825 04,1699.192685
...,...,...
16795,100_20220831 19,943.590352
16796,100_20220831 20,848.670576
16797,100_20220831 21,764.608321
16798,100_20220831 22,661.657046


In [81]:
a = pd.read_csv('11_12_xgb_41_720_8888_ens.csv')
a['answer'] = a['answer'] * 1.05
a.to_csv('sotapp.csv',index=False)
a

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2201.706445
1,1_20220825 01,2157.875354
2,1_20220825 02,1942.385477
3,1_20220825 03,1818.851477
4,1_20220825 04,1827.483920
...,...,...
16795,100_20220831 19,923.047476
16796,100_20220831 20,850.975172
16797,100_20220831 21,780.236613
16798,100_20220831 22,650.756886


# 각 빌딩별 Validation값

In [123]:
import pandas as pd
import re

# 파일을 열어서 문자열을 읽어옵니다.
with open('best_params_values_0720v2.txt', 'r') as f:
    string = f.read()

# Extract the Iteration and Best Values
pattern = r"Iteration: (\d+),.*?Best Values: ([\d\.]+)"
matches = re.findall(pattern, string, re.DOTALL)

# Convert the matches into a DataFrame
df = pd.DataFrame(matches, columns=["Iteration", "Best Values"])
df["Iteration"] = df["Iteration"].astype(int)
df["Best Values"] = df["Best Values"].astype(float)

print(df)

    Iteration  Best Values
0           1     3.309706
1           2     6.319838
2           3     6.023522
3           4     3.278278
4           5     4.923040
..        ...          ...
95         96     2.608702
96         97     4.062261
97         98     6.098842
98         99     2.525203
99        100     4.073055

[100 rows x 2 columns]


In [124]:
info = pd.read_csv('building_info.csv')
concat = pd.concat([info,df['Best Values']],axis=1)
# Reorder the columns
cols = concat.columns.tolist()  # Convert the column names into a list
cols.insert(2, cols.pop(cols.index('Best Values')))  # Move 'Best Values' to the 3rd position (index 2)

concat = concat[cols]  # Reindex the DataFrame
concat

Unnamed: 0,건물번호,건물유형,Best Values,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,3.309706,110634.00,39570.00,-,-,-
1,2,건물기타,6.319838,122233.47,99000.00,-,-,-
2,3,건물기타,6.023522,171243.00,113950.00,40,-,-
3,4,건물기타,3.278278,74312.98,34419.62,60,-,-
4,5,건물기타,4.923040,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...,...
95,96,호텔및리조트,2.608702,93314.00,60500.00,-,-,-
96,97,호텔및리조트,4.062261,55144.67,25880.00,-,-,-
97,98,호텔및리조트,6.098842,53578.62,17373.75,-,-,-
98,99,호텔및리조트,2.525203,53499.00,40636.00,-,-,-


In [125]:
import pandas as pd
import re

# 파일을 열어서 문자열을 읽어옵니다.
with open('best_params_values_0721.txt', 'r') as f:
    string = f.read()

# Extract the Iteration and Best Values
pattern = r"Iteration: (\d+),.*?Best Values: ([\d\.]+)"
matches = re.findall(pattern, string, re.DOTALL)

# Convert the matches into a DataFrame
df = pd.DataFrame(matches, columns=["Iteration", "Best Values_day따로"])
df["Iteration"] = df["Iteration"].astype(int)
df["Best Values_day따로"] = df["Best Values_day따로"].astype(float)

print(df)

     Iteration  Best Values_day따로
0            1           3.958431
1            1           5.159142
2            2           7.236492
3            2           6.041059
4            3           5.471286
..         ...                ...
154         96           2.694907
155         97           3.705436
156         98           6.647394
157         99           2.348074
158        100           4.307969

[159 rows x 2 columns]


In [126]:
# Calculate the mean of 'Best Values' for each unique 'Iteration'
df_mean = df.groupby('Iteration')['Best Values_day따로'].mean().reset_index()
df_mean

Unnamed: 0,Iteration,Best Values_day따로
0,1,4.558786
1,2,6.638776
2,3,5.745692
3,4,3.593410
4,5,4.045336
...,...,...
95,96,2.694907
96,97,3.705436
97,98,6.647394
98,99,2.348074


In [127]:
concat = pd.concat([concat,df_mean['Best Values_day따로']],axis=1)
# Reorder the columns
cols = concat.columns.tolist()  # Convert the column names into a list
cols.insert(3, cols.pop(cols.index('Best Values_day따로')))  # Move 'Best Values' to the 3rd position (index 2)

concat = concat[cols]  # Reindex the DataFrame
concat

Unnamed: 0,건물번호,건물유형,Best Values,Best Values_day따로,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,3.309706,4.558786,110634.00,39570.00,-,-,-
1,2,건물기타,6.319838,6.638776,122233.47,99000.00,-,-,-
2,3,건물기타,6.023522,5.745692,171243.00,113950.00,40,-,-
3,4,건물기타,3.278278,3.593410,74312.98,34419.62,60,-,-
4,5,건물기타,4.923040,4.045336,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...,...,...
95,96,호텔및리조트,2.608702,2.694907,93314.00,60500.00,-,-,-
96,97,호텔및리조트,4.062261,3.705436,55144.67,25880.00,-,-,-
97,98,호텔및리조트,6.098842,6.647394,53578.62,17373.75,-,-,-
98,99,호텔및리조트,2.525203,2.348074,53499.00,40636.00,-,-,-


In [129]:
concat.to_csv('빌딩별_validation.csv',index=False, encoding = "CP949")