- 2023 전력사용량 예측 AI 경진대회
- 알고리즘 | 정형 | 시계열 | 에너지 | SMAPE
- https://dacon.io/competitions/official/236125/overview/description

- 참고 baseline 코드: https://dacon.io/competitions/official/236125/codeshare/8661?page=3&dtype=recent

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, root_mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
import joblib

# Load the raw training and test tables.
train_data, test_data = map(pd.read_csv, ('train.csv', 'test.csv'))

# Buildings that were marked during EDA as having weak seasonality.
# NOTE: the filter that would keep only these buildings is disabled below, so
# check_data is loaded here but not applied to train_data / test_data.
check_data = pd.read_csv(filepath_or_buffer='building_small_seasonality.csv')
# train_data = train_data[train_data['건물번호'].isin(check_data.building_num.tolist())]
# test_data = test_data[test_data['건물번호'].isin(check_data.building_num.tolist())]
# display(train_data['건물번호'].unique())
# display(test_data['건물번호'].unique())

# Remove outliers (row labels that were marked during EDA).
# Only the part after the first '|' of every 'outlier' cell is used.
outlier = pd.read_csv('outlier.csv')
outlier['outlier'] = outlier['outlier'].apply(lambda x: x.split('|')[1])

# The per-row strings are concatenated without a separator (unchanged from the
# previous behaviour) and split on ':'.  Every blank token is discarded, not
# just the first one: list.remove('') raised ValueError when no blank token
# existed and left any further blanks behind to crash int('') later.
string = ''.join(outlier['outlier'])
outlier_list = [token for token in string.split(':') if token.strip()]

# Each token is the first index label of a run of `outlier_window` consecutive
# labels to drop.  Collect all labels first so train_data is filtered once
# instead of being copied once per outlier.
outlier_window = 20
outlier_rows = {
    row
    for start in map(int, outlier_list)
    for row in range(start, start + outlier_window)
}
rows_before = len(train_data)
train_data = train_data[~train_data.index.isin(sorted(outlier_rows))]

# Report the number of rows that were really removed; len(outlier_list) * 20
# over-counts when windows overlap or point at labels that do not exist.
removed_rows = rows_before - len(train_data)
print('총', removed_rows, '개 의 이상치에 해당하는 행이 삭제됐습니다')
# display(train_data)

# Missing-value handling.
# The sunshine and solar-radiation columns are removed from the training data.
for dropped_column in ('일조(hr)', '일사(MJ/m2)'):
    del train_data[dropped_column]

# Wind speed and humidity gaps in the training data are filled by linear
# interpolation between neighbouring rows.
for interpolated_column in ('풍속(m/s)', '습도(%)'):
    train_data[interpolated_column] = train_data[interpolated_column].interpolate(method='linear')

# A missing precipitation value is replaced by 0 in both data sets.
for frame in (train_data, test_data):
    frame['강수량(mm)'] = frame['강수량(mm)'].fillna(0)

# Derive calendar features from the timestamp column, then drop the identifier
# column and use the timestamp as the index.  The same steps are applied to the
# training and the test frame, so both end up with the same new columns in the
# same order (year, month, day, hour).
calendar_parts = (('연', 'year'), ('월', 'month'), ('일', 'day'), ('시간', 'hour'))
for frame in (train_data, test_data):
    # Timestamps are stored as 'YYYYMMDD HH'.
    frame['일시'] = pd.to_datetime(frame['일시'], format='%Y%m%d %H')
    for column_name, part in calendar_parts:
        frame[column_name] = getattr(frame['일시'].dt, part)

    # Remove the column that is not needed as a feature.
    frame.drop(columns=['num_date_time'], inplace=True)
    frame.set_index('일시', inplace=True)

### Additional derived weather features


def _add_weather_features(frame):
    """Add the 'THI' and 'windchill' columns to *frame* in place.

    Reads the '기온(C)', '습도(%)' and '풍속(m/s)' columns.

    THI (temperature-humidity / discomfort index) is computed as
        1.8*T - 0.55*(1 - RH)*(1.8*T - 26) + 32
    with T in degrees Celsius and RH as a fraction.  The humidity column holds
    percentages, so it is divided by 100, and the (1.8*T - 26) term is
    parenthesised; the previous expression did neither, which subtracted 26
    outside the product and used a relative humidity far above 1.

    Returns the same frame to allow chaining.
    """
    temperature = frame['기온(C)']
    humidity_fraction = frame['습도(%)'] / 100
    wind_speed = frame['풍속(m/s)']

    # Discomfort index.
    frame['THI'] = (
        (9/5)*temperature
        - 0.55*(1 - humidity_fraction)*((9/5)*temperature - 26)
        + 32
    )

    # Wind chill (apparent temperature); expression unchanged.
    # NOTE(review): these coefficients look like the wind-chill formula that
    # expects wind speed in km/h, while this column is in m/s - confirm whether
    # a *3.6 conversion is intended.
    wind_term = wind_speed**0.16
    frame['windchill'] = 13.12 + 0.6215*temperature - 11.37*wind_term + 0.3965*wind_term*temperature
    return frame


_add_weather_features(train_data)
_add_weather_features(test_data)

# Disabled experiments: daily mean / daily max temperature.
# train_data['DayAvgTemp'] = train_data.groupby(['일'])['기온(C)'].transform('mean')
# train_data['DayMaxTemp'] = train_data.groupby(['일'])['기온(C)'].transform('max')
# test_data['DayAvgTemp'] = test_data.groupby(['일'])['기온(C)'].transform('mean')
# test_data['DayMaxTemp'] = test_data.groupby(['일'])['기온(C)'].transform('max')
# display(train_data)
# display(test_data)


# Separate the label (power consumption) from the feature columns.
target_column = '전력소비량(kWh)'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

# Standardise the features: the scaler is fitted on the training features only
# and then applied to both the training features and the test frame.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)

# Hold-out split: the first 80% of the rows are used for training and the last
# 20% for validation (shuffle=False keeps the row order).
# NOTE(review): this splits by row position, not by date - see the disabled
# date-based alternative below.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

# Alternative kept for reference: split by date instead of by proportion to
# guard against data leakage.
# X_train_split = X_train_scaled[(X_train_scaled.index>='2022-06-01')&(X_train_scaled.index<'2022-08-15')]
# X_val_split = X_train_scaled[(X_train_scaled.index>='2022-08-15')]
# y_train_split = y_train[(y_train.index>='2022-06-01')&(y_train.index<'2022-08-15')]
# y_val_split = y_train[(y_train.index>='2022-08-15')]

# The feature splits are handed on as plain NumPy arrays.
X_train_split = X_train_split.to_numpy()
X_val_split = X_val_split.to_numpy()
# print(X_train_split)

# 하이퍼파라미터 그리드 정의
# param_grid = {
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'n_estimators': [5000],
#     'subsample': [0.7, 0.9],
#     'colsample_bytree': [0.7, 0.9],
# }

# SMAPE(성능지표) 정의
# def SMAPE(true, pred):
#     return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 100

# smape = make_scorer(SMAPE, greater_is_better=False)

# XGBoost 모델 생성
# xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# xgb = XGBRegressor(objective='reg:squarederror',
#                    max_depth=7,
#                    subsample=0.9,
#                    colsample_bytree=0.9,
#                    random_state=42)

# random search로 best parameter 찾기
# grid_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid,
#                            cv=3, n_jobs=3, verbose=2, scoring=smape)

# grid search로 best parameter 찾기
# param_grid = {
#     'n_estimators':[2000,3000,5000],
#     'learning_rate': [0.01,0.05,0.1],
# }

# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=2, scoring='neg_root_mean_squared_error',
#                            n_jobs=3, verbose=2)

# grid_search.fit(X_train_split, y_train_split,
#                 eval_set=[(X_val_split, y_val_split)],
#                 early_stopping_rounds=10,
#                 verbose=True
#                 )

# 최종 모델으로 학습
# print('Best Parameter:', grid_search.best_params_)
# model = grid_search.best_estimator_

# Fixed hyper-parameters for the final XGBoost regressor (squared-error
# objective, seeded for reproducibility).
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 2000,
    'max_depth': 7,
    'learning_rate': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)

# 교차검증 시행
# scores = cross_validate(model, X_train_split, y_train_split, cv=2,
#                         scoring='neg_root_mean_squared_error',
#                         n_jobs=3)

# print(scores)

# Fit the model on the training part of the hold-out split.
# NOTE: X_val_split / y_val_split are not used by this step.
model.fit(X=X_train_split, y=y_train_split)

# Persist the fitted model to disk.
filename = 'test-xgboost-model.sav'
joblib.dump(value=model, filename=filename)

# Predict on the scaled test features and write the submission file: the
# predictions replace the 'answer' column of the sample submission.
y_pred = model.predict(X_test_scaled)
submission = pd.read_csv('sample_submission.csv').assign(answer=y_pred)
submission.to_csv(path_or_buf='my_submission.csv', index=False)


총 1860 개 의 이상치에 해당하는 행이 삭제됐습니다
