In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import matplotlib.dates as mdates
from glob import glob
from math import sqrt

# Scikit-learn 관련 라이브러리
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# 추가적인 머신러닝 모델 및 최적화 라이브러리
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from skopt import BayesSearchCV


In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def data_preprocessing(path):
    # 데이터 로드
    df = pd.read_excel(path, engine='openpyxl')

    # IQR 계산
    Q1_Mosquitoe = df['mosquito'].quantile(0.25)
    Q3_Mosquitoe = df['mosquito'].quantile(0.75)
    IQR_Mosquitoe = Q3_Mosquitoe - Q1_Mosquitoe
    lower_bound_Mosquitoe = Q1_Mosquitoe - 1.5 * IQR_Mosquitoe
    upper_bound_Mosquitoe = Q3_Mosquitoe + 1.5 * IQR_Mosquitoe

    # 이상치 제거
    df_iqr = df[(df['mosquito'] >= lower_bound_Mosquitoe) & (df['mosquito'] <= upper_bound_Mosquitoe)]

    # 결과 출력
    print(f"Q1: {Q1_Mosquitoe}, Q3: {Q3_Mosquitoe}")
    print(f"이상치 제거 전 데이터 개수: {len(df)}, 이상치 제거 후 데이터 개수: {len(df_iqr)}")

    # Train-Test 분리
    train = df_iqr[df_iqr['DATE'] <= '2023-10-31']
    test = df_iqr[df_iqr['DATE'] >= '2024-04-01']

    # 날짜(datetime64) 컬럼 제거
    if 'DATE' in train.columns:
        train = train.drop(columns=['DATE'])
        test = test.drop(columns=['DATE'])

    # 범주형(object) 데이터 제거
    train = train.select_dtypes(exclude=['object'])
    test = test.select_dtypes(exclude=['object'])

    # MinMaxScaler 적용
    scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns)

    # 독립 변수(X)와 종속 변수(y) 분리
    target_col = 'mosquito'
    X_train = train_scaled.drop(columns=[target_col])
    y_train = train_scaled[target_col]
    X_test = test_scaled.drop(columns=[target_col])
    y_test = test_scaled[target_col]

    # 데이터 크기 확인
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    # 데이터 반환
    return X_train, y_train, X_test, y_test, train, test


In [16]:
# 공통 모델 학습 함수
def train_model(model, param_space, path, n_iter_count, save_path, region, model_name):
    X_train, y_train, X_test, y_test, train, test = data_preprocessing(path)

    # Bayesian Optimization 수행
    bayes_cv = BayesSearchCV(
        model,
        param_space,
        n_iter=n_iter_count,
        cv=3,
        n_jobs=-1,
        verbose=1,
        random_state=42
    )
    bayes_cv.fit(X_train, y_train)

    # 최적 모델 저장
    os.makedirs(save_path, exist_ok=True)
    joblib.dump(bayes_cv.best_estimator_, os.path.join(save_path, f"{model_name}.pkl"))

    # 성능 평가
    y_pred_train = bayes_cv.best_estimator_.predict(X_train)
    y_pred_test = bayes_cv.best_estimator_.predict(X_test)
    
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_rmse = root_mean_squared_error(y_train, y_pred_train)
    test_rmse = root_mean_squared_error(y_test, y_pred_test)

    print(f"📊 Best Hyperparameters for {model_name}: {bayes_cv.best_params_}")
    print(f"✅ {model_name} RMSE (Train): {train_rmse:.4f}, RMSE (Test): {test_rmse:.4f}")
    print(f"✅ {model_name} R² (Train): {train_r2:.4f}, R² (Test): {test_r2:.4f}")

    return bayes_cv.best_params_, bayes_cv.best_estimator_, train_r2, test_r2, train_rmse, test_rmse, model_name



# 모델 실행 함수
def run_models(path, n_iter_count, save_path, region):
    models = {
        "rf": (RandomForestRegressor(), {'n_estimators': (10, 300), 'max_depth': (1, 200), 'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20)}),
        "gb": (GradientBoostingRegressor(), {'n_estimators': (10, 300), 'max_depth': (1, 200), 'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'learning_rate': (0.01, 1.0, 'log-uniform')}),
        "xgb": (XGBRegressor(), {'n_estimators': (10, 300), 'max_depth': (1, 200), 'learning_rate': (0.01, 1.0, 'log-uniform'), 'subsample': (0.1, 1.0), 'colsample_bytree': (0.1, 1.0)}),
        "lgbm": (LGBMRegressor(), {'n_estimators': (10, 300), 'max_depth': (1, 200), 'learning_rate': (0.01, 1.0, 'log-uniform'), 'num_leaves': (2, 100), 'min_child_samples': (1, 50)})
    }

    results = {}
    for model_name, (model, param_space) in models.items():
        results[model_name] = train_model(model, param_space, path, n_iter_count, save_path, region, model_name)

    return results

# 모델 로드 및 예측 함수
def load_and_predict(path, save_path, model_name):
    _, _, _, _, train, test = data_preprocessing(path)
    model = joblib.load(os.path.join(save_path, f"{model_name}.pkl"))
    
    y_test_pred = model.predict(test.drop(columns=['DATE', 'mosquito']))
    
    # 예측 결과 시각화
    plt.figure(figsize=(10, 5))
    plt.plot(test['DATE'], test['mosquito'], label='Actual', color='black')
    plt.plot(test['DATE'], y_test_pred, label='Predicted', color='red')
    plt.legend()
    plt.xlabel("Date")
    plt.ylabel("Mosquito Count")
    plt.title(f"{model_name} Prediction")
    plt.grid()
    plt.show()


In [None]:
path = r'F:\박정현\ML\Mosquito\data\2015_2024_total.xlsx'
save_path = r'F:\박정현\ML\Mosquito\models'

n_iter_count = 5
region = "Seoul"

# 모델 학습 실행
results = run_models(path, n_iter_count, save_path, region)


Q1: 12.0, Q3: 53.0
이상치 제거 전 데이터 개수: 78909, 이상치 제거 후 데이터 개수: 72171
X_train: (64600, 17), y_train: (64600,)
X_test: (7571, 17), y_test: (7571,)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
📊 Best Hyperparameters for rf: OrderedDict([('max_depth', 90), ('min_samples_leaf', 18), ('min_samples_split', 4), ('n_estimators', 136)])
✅ rf RMSE (Train): 0.1333, RMSE (Test): 0.2101
✅ rf R² (Train): 0.6517, R² (Test): -0.0709
Q1: 12.0, Q3: 53.0
이상치 제거 전 데이터 개수: 78909, 이상치 제거 후 데이터 개수: 72171
X_train: (64600, 17), y_train: (64600,)
X_test: (7571, 17), y_test: (7571,)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 fold