In [None]:
import pandas as pd
import numpy as np
import random
import os

import xgboost as xgb
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm


#############################################################################################################33
from sklearn.ensemble import RandomForestRegressor         # 회귀 (Regression)
from sklearn.linear_model import LinearRegression          # 선형 회귀 (Linear Regression)
from sklearn.svm import SVR                                # 서포트 벡터 머신 (Support Vector Machine, SVM)
from sklearn.neighbors import KNeighborsRegressor          # k-최근접 이웃 (k-Nearest Neighbors, k-NN)
from sklearn.neural_network import MLPRegressor            # 신경망 (Neural Networks)
from sklearn.tree import DecisionTreeRegressor             # 결정 트리 (Decision Tree)
from sklearn.ensemble import GradientBoostingRegressor     # 그래디언트 부스팅 (Gradient Boosting)

import warnings
warnings.filterwarnings(action='ignore')
#############################################################################################################33


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and CUDA), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; changing PYTHONHASHSEED at runtime does not
    # re-seed string hashing in the interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every visible GPU, no-op without CUDA
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the determinism requested above, so it must be off.
    torch.backends.cudnn.benchmark = False

seed_everything(42)  # fix the seed for the whole notebook


# Define SMAPE loss function
def SMAPE(true, pred):
    """Symmetric mean absolute percentage error used to score this notebook.

    Computes ``mean(|true - pred| / (|true| + |pred|)) * 100``. The denominator
    is not halved, so the result lies in [0, 100].

    Elements where both ``true`` and ``pred`` are exactly zero are a perfect
    prediction and contribute 0 instead of the NaN that 0/0 would produce.

    Args:
        true: Array-like of ground-truth values (NumPy array or pandas Series).
        pred: Array-like of predictions, broadcastable against ``true``.

    Returns:
        The score as a float.
    """
    abs_error = np.asarray(np.abs(true - pred), dtype=float)
    magnitude = np.asarray(np.abs(true) + np.abs(pred), dtype=float)
    # Divide only where the denominator is non-zero; the pre-filled zeros stay
    # in place for the 0/0 positions.
    ratio = np.divide(
        abs_error,
        magnitude,
        out=np.zeros_like(abs_error),
        where=magnitude != 0,
    )
    return np.mean(ratio) * 100


# 이슬점 계산 함수
def calculate_dew_point(temp_c, humidity):
    """Estimate the dew point from temperature and relative humidity.

    Uses a Magnus-type approximation with coefficients 17.27 and 237.7.

    Args:
        temp_c: Temperature; the name suggests degrees Celsius.
        humidity: Relative humidity as a percentage (it is divided by 100).

    Returns:
        The dew point, on the same scale as ``temp_c``.
    """
    a, b = 17.27, 237.7
    saturation_term = (a * temp_c) / (b + temp_c)
    humidity_term = np.log(humidity / 100.0)
    alpha = saturation_term + humidity_term
    return (b * alpha) / (a - alpha)


def CDH(xs):
    """Rolling sum of (temperature - 26) over a trailing window.

    For position ``i`` the window covers the current value plus up to 11
    preceding values (at most 12 in total); near the start it is shorter.

    Args:
        xs: NumPy array of temperatures (must support ``xs[a:b] - 26``).

    Returns:
        NumPy array with one accumulated value per input element.
    """
    window_sums = []
    for end in range(1, len(xs) + 1):
        start = max(end - 12, 0)  # clamp so early positions use a shorter window
        window_sums.append(np.sum(xs[start:end] - 26))
    return np.array(window_sums)


# Lift pandas' display truncation so every column and row of a displayed frame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Read the four competition CSV files from the working directory.
train_df, building_info, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './train.csv',
        './building_info.csv',
        './test.csv',
        './sample_submission.csv',
    )
)

## Building Data

In [None]:
# The raw file marks missing capacities with the string '-'; turn those into the number 0.
building_info = building_info.replace('-', 0)

# Label-encode the building type ('건물유형') and overwrite the column with the integer codes.
label_encoder = LabelEncoder()
building_info['건물유형'] = label_encoder.fit_transform(building_info['건물유형'])

# Convert the solar / ESS / PCS capacity columns to numeric dtype.
# Anything that still cannot be parsed becomes NaN (errors="coerce").
for capacity_col in ("태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)"):
    building_info[capacity_col] = pd.to_numeric(building_info[capacity_col], errors="coerce")

building_info.head(10)

## Train / Test Feature Engineering

In [None]:
# Split the '일시' timestamp string into integer year / month / day / hour parts
# (fixed character positions) so the model can use calendar features.
train_df['year'] = train_df['일시'].map(lambda stamp: int(stamp[:4]))
train_df['month'] = train_df['일시'].map(lambda stamp: int(stamp[4:6]))
train_df['day'] = train_df['일시'].map(lambda stamp: int(stamp[6:8]))
train_df['time'] = train_df['일시'].map(lambda stamp: int(stamp[9:11]))

# Day of week ('요일', Monday=0 ... Sunday=6); 'year' is only needed to build the date.
train_df['요일'] = pd.to_datetime(train_df[['year', 'month', 'day']]).dt.dayofweek
train_df = train_df.drop(columns=['year'])

# Holiday flag ('공휴일'): weekends plus three fixed public holidays.
train_df['공휴일'] = train_df['요일'].isin([5, 6]).astype('int64')
is_public_holiday = (
    ((train_df['month'] == 6) & train_df['day'].isin([1, 6]))   # Jun 1: 2022 local elections, Jun 6: Memorial Day
    | ((train_df['month'] == 8) & (train_df['day'] == 15))      # Aug 15: Liberation Day
)
train_df.loc[is_public_holiday, '공휴일'] = 1

# Working-hours flag ('근무시간'): 1.0 for hours 08-20 inclusive, 0.0 otherwise.
train_df['근무시간'] = train_df['time'].between(8, 20).astype(float)

# Discomfort index ('불쾌지수') from temperature and relative humidity.
scaled_temp = 1.8 * train_df['기온(C)']
dryness = 1 - train_df['습도(%)'] / 100
train_df['불쾌지수'] = scaled_temp - (0.55 * dryness * (scaled_temp - 26)) + 32

# Dew point (disabled experiment)
# train_df['이슬점'] = train_df.apply(lambda row: calculate_dew_point(row['기온(C)'], row['습도(%)']), axis=1)

# Cooling degree hours (disabled experiment)
# cdhs = np.array([])
# for num in range(1,101,1):
#     temp = train_df[train_df['건물번호'] == num]
#     cdh = CDH(temp['기온(C)'].values)
#     cdhs = np.concatenate([cdhs, cdh])
# train_df['냉방도일'] = cdhs

# Cyclical encoding of the hour so that 23h and 0h end up close together.
hour_angle = 2 * np.pi * train_df['time'] / 24
train_df['sin_time'] = np.sin(hour_angle)
train_df['cos_time'] = np.cos(hour_angle)


####################################################################################################


# Split the '일시' timestamp string into integer year / month / day / hour parts
# (fixed character positions), mirroring the features built for the training frame.
test_df['year'] = test_df['일시'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

# Day of week ('요일', Monday=0 ... Sunday=6)
test_df['date'] = pd.to_datetime(test_df[['year', 'month', 'day']])
test_df['요일'] = test_df['date'].dt.dayofweek
test_df = test_df.drop(columns=['date','year'])                                 # helper columns are no longer needed

# Holiday flag ('공휴일'): weekends plus three fixed public holidays
test_df['공휴일'] = test_df['요일'].apply(lambda x: 1 if x in [5, 6] else 0)
test_df.loc[(test_df['month'] == 6) & (test_df['day'] == 1), '공휴일'] = 1     # 2022 local elections
test_df.loc[(test_df['month'] == 6) & (test_df['day'] == 6), '공휴일'] = 1     # Memorial Day
test_df.loc[(test_df['month'] == 8) & (test_df['day'] == 15), '공휴일'] = 1    # Liberation Day

# Working-hours flag ('근무시간'): 1 for hours 08-20 inclusive, 0 otherwise.
# The mask must come from test_df itself; building it from train_df['time']
# only works by accidental index alignment and would mislabel rows as soon as
# the two frames stop sharing the same hour at the same row position.
test_df.loc[(test_df['time'] >= 8) & (test_df['time'] <= 20), '근무시간'] = 1
test_df.loc[(test_df['time'] < 8) | (test_df['time'] > 20), '근무시간'] = 0

# Discomfort index ('불쾌지수') from temperature and relative humidity
test_df['불쾌지수'] = (1.8*test_df['기온(C)']) - (0.55*(1-test_df['습도(%)']/100)*(1.8*test_df['기온(C)']-26)) + 32

# Dew point (disabled experiment)
# test_df['이슬점'] = test_df.apply(lambda row: calculate_dew_point(row['기온(C)'], row['습도(%)']), axis=1)


# Cooling degree hours (disabled experiment)
# cdhs = np.array([])
# for num in range(1,101,1):
#     temp = test_df[test_df['건물번호'] == num]
#     cdh = CDH(temp['기온(C)'].values)
#     cdhs = np.concatenate([cdhs, cdh])
# test_df['냉방도일'] = cdhs

# Cyclical (sin / cos) encoding of the hour of day
test_df['sin_time'] = np.sin(2*np.pi*test_df.time/24)
test_df['cos_time'] = np.cos(2*np.pi*test_df.time/24)


####################################################################################################


# Missing-value handling for wind speed ('풍속(m/s)') and humidity ('습도(%)').

# Alternative (disabled): fill with the rounded column mean
# train_df['풍속(m/s)'] = train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(),2))
# train_df['습도(%)'] = train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(),2))


# Forward-fill: carry the previous observation into each gap.
# The result is assigned back explicitly. `df[col].fillna(..., inplace=True)`
# acts on an intermediate Series and does not update the frame under pandas
# Copy-on-Write, and the `method=` argument of fillna is deprecated.
# NOTE(review): the fill runs over the whole frame, so if rows are grouped by
# building, a gap at the start of one building would be filled from the
# previous building — confirm whether a per-building fill is wanted.
train_df['풍속(m/s)'] = train_df['풍속(m/s)'].ffill()
train_df['습도(%)'] = train_df['습도(%)'].ffill()


# Alternative (disabled): linear interpolation between neighbouring values
# train_df['풍속(m/s)'] = train_df['풍속(m/s)'].interpolate()
# train_df['습도(%)'] = train_df['습도(%)'].interpolate()


# Any value still missing in any column is replaced with 0.
train_df = train_df.fillna(0)

train_df.head(10)

## Merge Building Info

In [None]:
# Attach the per-building attributes to every row via a left join on the building number ('건물번호').
# NOTE: running this cell twice merges again and produces suffixed duplicate columns.
train_df = train_df.merge(building_info, how='left', on='건물번호')
test_df = test_df.merge(building_info, how='left', on='건물번호')

In [None]:
# Build the training feature matrix: drop the row id, the raw timestamp, the sunshine and
# solar-radiation columns, and the target column.
non_feature_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']
train_x = train_df.drop(non_feature_cols, axis=1)
train_x.head()

In [None]:
# # '건물번호', '일시', '전력소비량(kWh)' 열을 제외한 열 선택
# feature_columns = train_x.columns.difference(['건물번호', '일시', '전력소비량(kWh)'])

# # Min-Max 스케일링을 위한 Scaler 생성
# scaler = MinMaxScaler()

# # 선택한 열들을 정규화
# train_x[feature_columns] = scaler.fit_transform(train_x[feature_columns])

# train_x.head()

In [None]:
# Target vector: the power consumption column ('전력소비량(kWh)').
train_y = train_df.loc[:, '전력소비량(kWh)']
train_y.head(n=10)

## Model Training

In [None]:
# Configure the XGBoost regressor.
# NOTE(review): max_depth=800 is far deeper than the depth-5 settings tried in the
# disabled configurations below — confirm this is intentional.
xgb_params = dict(
    n_estimators=2000,
    max_depth=800,
    learning_rate=0.1,
    min_child_weight=1,
)
model = xgb.XGBRegressor(**xgb_params)

# Previously tried configurations (disabled):
# model = xgb.XGBRegressor(n_estimators=800, max_depth=800, learning_rate=0.1, min_child_weight=1)

# model = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                          colsample_bynode=1, colsample_bytree=0.8, eta=0.01, gamma=0,
#                          gpu_id=-1, importance_type='gain', interaction_constraints='',
#                          learning_rate=0.00999999978, max_delta_step=0, max_depth=5,
#                          min_child_weight=6, monotone_constraints='()',
#                          n_estimators=10000, n_jobs=0, num_parallel_tree=1, random_state=0,
#                          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
#                          subsample=0.9, tree_method='exact', validate_parameters=1,
#                          verbosity=None)

# model= xgb.XGBRegressor(n_estimators =10000, eta = 0.01, min_child_weight = 6,
#                        max_depth = 5, colsample_bytree = 0.8,
#                        subsample = 0.9, seed=0) # ,  gpu_id = 1, tree_method = 'gpu_hist', predictor= 'gpu_predictor')

# Train on the full training set (no validation split is held out here).
model.fit(train_x, train_y)

In [None]:
import matplotlib.pyplot as plt

# Predictions on the training features, kept next to the true targets for the cells below.
plot_1 = model.predict(train_x)
plot_2 = train_y

# Scatter of predicted (x axis) against actual (y axis) values, with the identity line
# drawn in blue as the reference for a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(plot_1, plot_2, s=0.1, c='red', label='Predict_X')
ax.plot(plot_2, plot_2, color='blue', linewidth=1, label='y')

ax.set_xlabel("X")
ax.set_ylabel("y")
ax.set_title("Predict")
ax.legend()
plt.show()

In [None]:
# Coefficient of determination between the training targets and the predictions in plot_1.
# NOTE(review): plot_1 is expected to hold predictions on the training set itself, which
# would make this an in-sample score rather than an estimate of generalisation — confirm.
r2 = r2_score(y_true=train_y, y_pred=plot_1)
print(f"결정 계수 (R-squared): {r2:.8f}")

In [None]:
# SMAPE between train_y and the predictions in plot_1.
# print('SMAPE : {}'.format(SMAPE(y_valid, pred)))
train_smape = SMAPE(train_y, plot_1)
print('SMAPE : {}'.format(train_smape))

In [None]:
# from sklearn.metrics import mean_squared_error

# # MSE 계산
# mse = mean_squared_error(test_y, predictions)
# print(f"평균 제곱 오차 (MSE): {mse:.8f}")

In [None]:
# Test feature matrix: remove the row id and the raw timestamp string.
test_x = test_df.drop(['num_date_time', '일시'], axis=1)
test_x.head()

In [None]:
# Show the first rows of the training features.
train_x.head(n=5)

## Inference

In [None]:
# Run the fitted model on the test features and print the raw predictions.
preds = model.predict(test_x)
print(preds)

## Submission

In [None]:
# Load the submission template that the predictions will be written into.
submission = pd.read_csv(filepath_or_buffer='./sample_submission.csv')
submission.head()

In [None]:
# Store the predictions in the 'answer' column of the template.
submission = submission.assign(answer=preds)
submission.head()

In [None]:
# Write the submission to a CSV file, leaving out the DataFrame index.
submission.to_csv(path_or_buf='./XG_baseline_submission-final.csv', index=False)