In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1️⃣ 데이터 불러오기
print("▶ 데이터 불러오기")
df = pd.read_csv("../call119_train.csv")

# 2️⃣ 컬럼명 정리
print("▶ 컬럼명 정리")
df.columns = [col.replace("call119_train.", "") for col in df.columns]
df.columns = df.columns.str.strip()

# 3️⃣ 날짜 파싱
print("▶ 날짜 파싱 및 시간 feature 생성")
df['tm'] = pd.to_datetime(df['tm'], format='%Y%m%d')
df['year'] = df['tm'].dt.year
df['month'] = df['tm'].dt.month
df['day'] = df['tm'].dt.day
df['weekday'] = df['tm'].dt.weekday
df['is_weekend'] = df['weekday'].isin([5,6]).astype(int)

# 4️⃣ 습도 파생 feature
df['humidity_range'] = df['hm_max'] - df['hm_min']

# 5️⃣ 지역 원핫 인코딩
print("▶ 원핫 인코딩 수행")
df = pd.get_dummies(df, columns=['address_gu'])

# 6️⃣ 시간 순 정렬
df = df.sort_values(['sub_address', 'tm'])

# 7️⃣ 누적 강수량 feature
print("▶ 누적 강수량 feature 생성")
df['rn_day_lag1'] = df.groupby('sub_address')['rn_day'].shift(1).fillna(0)
df['rn_day_sum3'] = df.groupby('sub_address')['rn_day'].rolling(window=3, min_periods=1).sum().reset_index(0,drop=True)
df['rn_day_sum7'] = df.groupby('sub_address')['rn_day'].rolling(window=7, min_periods=1).sum().reset_index(0,drop=True)

# 8️⃣ 신고건수 lag & rolling
print("▶ 신고건수 feature 생성")
df['call_count_lag1'] = df.groupby('sub_address')['call_count'].shift(1).fillna(0)
df['call_count_sum3'] = df.groupby('sub_address')['call_count'].rolling(window=3, min_periods=1).sum().reset_index(0,drop=True)
df['call_count_sum7'] = df.groupby('sub_address')['call_count'].rolling(window=7, min_periods=1).sum().reset_index(0,drop=True)
df['call_count_mean7'] = df.groupby('sub_address')['call_count'].rolling(window=7, min_periods=1).mean().reset_index(0,drop=True)

# 9️⃣ 극한 기상 flag
df['is_heavy_rain'] = (df['rn_day'] >= 50).astype(int)
df['is_heatwave'] = (df['ta_max'] >= 33).astype(int)

# 🔟 공휴일 feature (최종 안정화 버전)
print("▶ 공휴일 feature 생성")
holidays = [
    '2020-01-01','2020-01-25','2020-01-26','2020-01-27','2020-03-01','2020-05-05','2020-05-08',
    '2020-06-06','2020-08-15','2020-08-17','2020-09-30','2020-10-01','2020-10-02','2020-10-03','2020-10-09','2020-12-25',
    '2021-01-01','2021-02-11','2021-02-12','2021-02-13','2021-03-01','2021-05-05','2021-05-19',
    '2021-06-06','2021-08-15','2021-08-16','2021-09-20','2021-09-21','2021-09-22','2021-10-03','2021-10-04','2021-10-09','2021-10-11','2021-12-25',
    '2022-01-01','2022-01-31','2022-02-01','2022-02-02','2022-03-01','2022-05-05','2022-05-08',
    '2022-06-06','2022-08-15','2022-09-09','2022-09-10','2022-09-11','2022-10-03','2022-10-09','2022-12-25',
    '2023-01-01','2023-01-21','2023-01-22','2023-01-23','2023-01-24','2023-03-01','2023-05-05','2023-05-27',
    '2023-06-06','2023-08-15','2023-09-28','2023-09-29','2023-09-30','2023-10-02','2023-10-03','2023-10-09','2023-12-25'
]
holidays = pd.to_datetime(holidays)
df['is_holiday'] = df['tm'].isin(holidays).astype(int)

# 최종 feature 리스트
feature_cols = [
    'ta_max', 'ta_min', 'ta_max_min', 
    'hm_min', 'hm_max', 'humidity_range',
    'ws_max', 'ws_ins_max',
    'rn_day', 'rn_day_lag1', 'rn_day_sum3', 'rn_day_sum7',
    'call_count_lag1', 'call_count_sum3', 'call_count_sum7', 'call_count_mean7',
    'is_heavy_rain', 'is_heatwave',
    'year', 'month', 'day', 'weekday', 'is_weekend', 'is_holiday'
] + [col for col in df.columns if col.startswith('address_gu_')]

X = df[feature_cols]
y = df['call_count']

# Train/Test 분할
print("▶ 데이터 분할")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GridSearchCV 하이퍼파라미터 튜닝
print("▶ GridSearchCV 튜닝 시작")

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [15, 20, 25],
    'min_samples_leaf': [1, 5],
    'max_features': ['sqrt']
}

model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# 최적 파라미터 및 성능 출력
print("✔ GridSearch 완료")
print("Best parameters found: ", grid_search.best_params_)
print("Best CV RMSE: ", np.sqrt(-grid_search.best_score_))

# 테스트셋 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test Set RMSE: {rmse:.3f}")

# 최종 모델 저장
# print("▶ 최적 모델 저장")
# joblib.dump(best_model, "best_randomforest_model.pkl")
# print("✔ 모델 저장 완료: best_randomforest_model.pkl")


▶ 데이터 불러오기
▶ 컬럼명 정리
▶ 날짜 파싱 및 시간 feature 생성
▶ 원핫 인코딩 수행
▶ 누적 강수량 feature 생성
▶ 신고건수 feature 생성
▶ 공휴일 feature 생성
▶ 데이터 분할
▶ GridSearchCV 튜닝 시작
Fitting 3 folds for each of 18 candidates, totalling 54 fits
✔ GridSearch 완료
Best parameters found:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 300}
Best CV RMSE:  0.9862936924638674
Test Set RMSE: 1.205
