<a href="https://colab.research.google.com/github/jylee2930/Basic_BIgDataAnalysis/blob/main/RandomForest%2B_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# 머신러닝 라이브러리
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 딥러닝 라이브러리
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Font setup.
# NOTE(review): the original comment described this as a Korean font setup,
# but the plot labels in this file are English; presumably DejaVu Sans is
# sufficient here - confirm before adding Korean labels.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

# GPU setup
print("TensorFlow 버전:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU 사용 가능:", gpu_devices)
# Enable memory growth on every visible GPU rather than only the first one:
# TensorFlow expects the setting to be the same across all GPUs.
for gpu_device in gpu_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPUs are initialised
        # (e.g. when this cell is re-run late in a session); report and go on.
        print(err)


class RandomForestLSTMPredictor:
    """
    Random Forest + LSTM 하이브리드 기온 예측 모델
    """

    def __init__(self, sequence_length=30, rf_n_estimators=100, lstm_units=64):
        """
        Create an untrained predictor.

        Args:
            sequence_length (int): Number of consecutive rows in one LSTM input window.
            rf_n_estimators (int): Number of trees in the Random Forest.
            lstm_units (int): Base LSTM layer width used when building the network.
        """
        # Hyper-parameters
        self.sequence_length, self.rf_n_estimators, self.lstm_units = (
            sequence_length,
            rf_n_estimators,
            lstm_units,
        )

        # Scalers: standardisation for the RF inputs, min-max for the LSTM target
        self.scaler_rf_features = StandardScaler()
        self.scaler_lstm_target = MinMaxScaler()

        # Models and the LSTM training history are filled in by the train_* methods
        self.rf_model = self.lstm_model = None
        self.history = None

        # Loaded feature table and the feature column lists chosen per model
        self.data = None
        self.rf_feature_cols = self.lstm_feature_cols = None

    def load_and_preprocess_data(self, filepath):
        """
        데이터 로드 및 전처리
        """
        print("=" * 60)
        print("Random Forest + LSTM 하이브리드 기온 예측 모델")
        print("=" * 60)
        print("데이터 로딩 및 전처리 시작")

        # CSV 파일 읽기
        df = pd.read_csv(filepath)

        # 날짜 컬럼 정리
        df['date'] = df['date'].str.replace('\t', '').str.strip()
        df = df.dropna(subset=['date', 'mean_tmp', 'min_tmp', 'max_tmp'])
        df = df[df['date'] != '']
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True)

        print(f"데이터 로드 완료: {len(df)}일의 기온 데이터")
        print(f"데이터 기간: {df['date'].min().strftime('%Y-%m-%d')} ~ {df['date'].max().strftime('%Y-%m-%d')}")

        # 기본 날짜 특성
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['day_of_year'] = df['date'].dt.dayofyear
        df['day_of_week'] = df['date'].dt.dayofweek
        df['quarter'] = df['date'].dt.quarter

        # 계절성 특성 (순환적)
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
        df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
        df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

        # 기온 관련 특성
        df['temp_range'] = df['max_tmp'] - df['min_tmp']
        df['temp_mid'] = (df['max_tmp'] + df['min_tmp']) / 2

        # 이동평균 특성
        for window in [3, 7, 14, 30]:
            df[f'mean_tmp_ma_{window}'] = df['mean_tmp'].rolling(window=window, min_periods=1).mean()
            df[f'temp_range_ma_{window}'] = df['temp_range'].rolling(window=window, min_periods=1).mean()
            df[f'min_tmp_ma_{window}'] = df['min_tmp'].rolling(window=window, min_periods=1).mean()
            df[f'max_tmp_ma_{window}'] = df['max_tmp'].rolling(window=window, min_periods=1).mean()

        # 이동표준편차
        for window in [7, 14]:
            df[f'mean_tmp_std_{window}'] = df['mean_tmp'].rolling(window=window, min_periods=1).std()

        # 기온 변화율
        df['temp_change_1d'] = df['mean_tmp'].diff(1)
        df['temp_change_3d'] = df['mean_tmp'].diff(3)
        df['temp_change_7d'] = df['mean_tmp'].diff(7)

        # 계절별 이상값
        monthly_mean = df.groupby('month')['mean_tmp'].transform('mean')
        df['temp_seasonal_anomaly'] = df['mean_tmp'] - monthly_mean

        # 과거 기온 패턴 (Random Forest용)
        for lag in [1, 2, 3, 7, 14]:
            df[f'mean_tmp_lag_{lag}'] = df['mean_tmp'].shift(lag)
            df[f'temp_range_lag_{lag}'] = df['temp_range'].shift(lag)

        # 결측값 처리
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(method='bfill').fillna(method='ffill')

        # 남은 결측값 제거
        df = df.dropna().reset_index(drop=True)

        print(f"✅ 특성 생성 완료: 총 {len(df.columns)}개 특성")

        self.data = df
        return df

    def prepare_rf_features(self, df):
        """
        Random Forest를 위한 특성 준비
        """
        self.rf_feature_cols = [
            'month', 'day_of_year', 'day_of_week', 'quarter',
            'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos',
            'day_of_week_sin', 'day_of_week_cos',
            'min_tmp', 'max_tmp', 'temp_range', 'temp_mid',
            'mean_tmp_ma_3', 'mean_tmp_ma_7', 'mean_tmp_ma_14', 'mean_tmp_ma_30',
            'temp_range_ma_3', 'temp_range_ma_7', 'temp_range_ma_14',
            'min_tmp_ma_3', 'min_tmp_ma_7', 'max_tmp_ma_3', 'max_tmp_ma_7',
            'mean_tmp_std_7', 'mean_tmp_std_14',
            'temp_change_1d', 'temp_change_3d', 'temp_change_7d',
            'temp_seasonal_anomaly',
            'mean_tmp_lag_1', 'mean_tmp_lag_2', 'mean_tmp_lag_3',
            'mean_tmp_lag_7', 'mean_tmp_lag_14',
            'temp_range_lag_1', 'temp_range_lag_3', 'temp_range_lag_7'
        ]

        return df[self.rf_feature_cols].values

    def prepare_lstm_data(self, df, target_col='mean_tmp'):
        """
        LSTM을 위한 시퀀스 데이터 준비
        """
        # LSTM용 특성 선택
        self.lstm_feature_cols = [
            'mean_tmp', 'min_tmp', 'max_tmp', 'temp_range', 'temp_mid',
            'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos',
            'mean_tmp_ma_3', 'mean_tmp_ma_7', 'mean_tmp_ma_14',
            'temp_change_1d', 'temp_change_3d', 'temp_seasonal_anomaly'
        ]

        # 특성 데이터 추출
        feature_data = df[self.lstm_feature_cols].values

        # 타겟 데이터
        target_data = df[target_col].values.reshape(-1, 1)
        target_scaled = self.scaler_lstm_target.fit_transform(target_data)

        # 시퀀스 데이터 생성
        X, y = [], []

        for i in range(self.sequence_length, len(feature_data)):
            X.append(feature_data[i-self.sequence_length:i])
            y.append(target_scaled[i])

        X = np.array(X)
        y = np.array(y)

        print(f"LSTM 데이터 준비 완료:")
        print(f"X shape: {X.shape} (samples, timesteps, features)")
        print(f"y shape: {y.shape}")

        return X, y

    def split_data(self, df, train_ratio=0.7, val_ratio=0.15):
        """
        시계열 데이터 분할
        """
        n_samples = len(df)
        train_size = int(n_samples * train_ratio)
        val_size = int(n_samples * val_ratio)

        train_df = df[:train_size].copy()
        val_df = df[train_size:train_size + val_size].copy()
        test_df = df[train_size + val_size:].copy()

        print(f"데이터 분할:")
        print(f"훈련: {len(train_df)}일, 검증: {len(val_df)}일, 테스트: {len(test_df)}일")

        return train_df, val_df, test_df

    def train_random_forest(self, train_df, val_df):
        """
        Fit the Random Forest on the training split and score it on validation.

        The feature scaler is fitted on the training features only. Validation
        metrics and the ten most important features are printed.

        Returns:
            tuple: ``(rmse, mae, r2)`` measured on ``val_df``.
        """
        print("\nRandom Forest 모델 훈련 중...")

        # Features are standardised with statistics from the training split
        features_train = self.scaler_rf_features.fit_transform(self.prepare_rf_features(train_df))
        features_val = self.scaler_rf_features.transform(self.prepare_rf_features(val_df))
        target_train = train_df['mean_tmp'].values
        target_val = val_df['mean_tmp'].values

        # Build and fit the forest
        forest_params = dict(
            n_estimators=self.rf_n_estimators,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1,
            verbose=0,
        )
        self.rf_model = RandomForestRegressor(**forest_params)
        self.rf_model.fit(features_train, target_train)

        # Validation metrics
        val_pred = self.rf_model.predict(features_val)
        val_rmse = np.sqrt(mean_squared_error(target_val, val_pred))
        val_mae = mean_absolute_error(target_val, val_pred)
        val_r2 = r2_score(target_val, val_pred)

        print(f"Random Forest 검증 성능:")
        print(f"RMSE: {val_rmse:.3f}°C, MAE: {val_mae:.3f}°C, R²: {val_r2:.3f}")

        # Feature importances, largest first
        ranking = pd.DataFrame({
            'feature': self.rf_feature_cols,
            'importance': self.rf_model.feature_importances_,
        }).sort_values('importance', ascending=False)

        print(f"상위 10개 중요 특성:")
        top = ranking.head(10)
        for rank, (name, weight) in enumerate(zip(top['feature'], top['importance']), start=1):
            print(f"      {rank:2d}. {name:<20} ({weight:.4f})")

        return val_rmse, val_mae, val_r2

    def build_lstm_model(self, input_shape):
        """
        Assemble and compile the stacked LSTM regressor.

        Args:
            input_shape: ``(timesteps, features)`` of one input window.

        Returns:
            A compiled Keras ``Sequential`` model with a single linear output,
            trained with MSE loss and reporting MAE.
        """
        # Same dropout settings on every recurrent layer
        recurrent_dropout_cfg = {'dropout': 0.2, 'recurrent_dropout': 0.2}

        layers = [
            # Three LSTM layers, each followed by batch normalisation;
            # only the last one collapses the sequence to a single vector.
            LSTM(self.lstm_units * 2, return_sequences=True, input_shape=input_shape,
                 **recurrent_dropout_cfg),
            BatchNormalization(),
            LSTM(self.lstm_units, return_sequences=True, **recurrent_dropout_cfg),
            BatchNormalization(),
            LSTM(self.lstm_units // 2, return_sequences=False, **recurrent_dropout_cfg),
            BatchNormalization(),
            # Dense head
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(1),
        ]
        network = Sequential(layers)

        # Compile
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
        return network

    def train_lstm(self, train_df, val_df, epochs=30, batch_size=32):
        """
        Train the LSTM on the training split and score it on validation.

        The target scaler is fitted on the training split only, and that same
        scale is used for the validation labels, so ``val_loss`` (which drives
        early stopping, LR reduction and checkpointing) is measured on the
        scale the network is trained on.

        Args:
            train_df: Training rows (chronologically first).
            val_df: Validation rows.
            epochs (int): Maximum number of epochs.
            batch_size (int): Mini-batch size.

        Returns:
            tuple: ``(rmse, mae, r2)`` on ``val_df`` in degrees Celsius.

        Raises:
            ValueError: If either split has no more rows than
                ``sequence_length`` and therefore yields no samples.
        """
        print("\nLSTM 모델 훈련 중...")

        seq_len = self.sequence_length

        # Start from an unfitted scaler so that a repeated call refits it on
        # the current training split.
        self.scaler_lstm_target = MinMaxScaler()

        # Build the input windows
        X_train, _ = self.prepare_lstm_data(train_df)
        X_val, _ = self.prepare_lstm_data(val_df)
        if len(X_train) == 0 or len(X_val) == 0:
            raise ValueError(
                "train_df and val_df must each contain more than "
                f"sequence_length={seq_len} rows"
            )

        # Preparing the validation frame may have refit the scaler on it;
        # pin the scaler to the training targets and derive both label sets
        # from that single fit.
        train_target = train_df['mean_tmp'].values.reshape(-1, 1)
        val_target = val_df['mean_tmp'].values.reshape(-1, 1)
        self.scaler_lstm_target.fit(train_target)
        y_train = self.scaler_lstm_target.transform(train_target)[seq_len:]
        y_val = self.scaler_lstm_target.transform(val_target)[seq_len:]

        # Build the network
        self.lstm_model = self.build_lstm_model((X_train.shape[1], X_train.shape[2]))

        print(f"LSTM 모델 구조:")
        self.lstm_model.summary()

        # Callbacks: all of them watch the validation loss
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-7, verbose=1),
            ModelCheckpoint('best_lstm_model.h5', monitor='val_loss', save_best_only=True, verbose=0)
        ]

        # Train
        print(f"모델 학습 시작 (최대 {epochs} 에폭)...")
        self.history = self.lstm_model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        # Validation metrics in the original temperature units
        y_val_pred_scaled = self.lstm_model.predict(X_val, verbose=0)
        y_val_pred = self.scaler_lstm_target.inverse_transform(y_val_pred_scaled).flatten()
        y_val_actual = val_target[seq_len:].flatten()

        val_rmse = np.sqrt(mean_squared_error(y_val_actual, y_val_pred))
        val_mae = mean_absolute_error(y_val_actual, y_val_pred)
        val_r2 = r2_score(y_val_actual, y_val_pred)

        print(f"LSTM 검증 성능:")
        print(f"RMSE: {val_rmse:.3f}°C, MAE: {val_mae:.3f}°C, R²: {val_r2:.3f}")

        return val_rmse, val_mae, val_r2

    def predict_hybrid(self, test_df, rf_weight=0.4, lstm_weight=0.6):
        """
        하이브리드 모델 예측 (Random Forest + LSTM)
        """
        print(f"\n🔮 하이브리드 예측 수행 중 (RF:{rf_weight:.1f}, LSTM:{lstm_weight:.1f})...")

        # Random Forest 예측
        X_test_rf = self.prepare_rf_features(test_df)
        X_test_rf_scaled = self.scaler_rf_features.transform(X_test_rf)
        rf_predictions = self.rf_model.predict(X_test_rf_scaled)

        # LSTM 예측
        X_test_lstm, y_test_lstm = self.prepare_lstm_data(test_df)
        lstm_pred_scaled = self.lstm_model.predict(X_test_lstm, verbose=0)
        lstm_predictions = self.scaler_lstm_target.inverse_transform(lstm_pred_scaled).flatten()

        # 실제값 (LSTM 기준으로 맞춤)
        actual_values = self.scaler_lstm_target.inverse_transform(y_test_lstm).flatten()

        # Random Forest 예측값을 LSTM과 길이 맞추기
        rf_aligned = rf_predictions[self.sequence_length:]

        # 하이브리드 예측 (가중 평균)
        hybrid_predictions = rf_weight * rf_aligned + lstm_weight * lstm_predictions

        results = {
            'actual': actual_values,
            'rf': rf_aligned,
            'lstm': lstm_predictions,
            'hybrid': hybrid_predictions,
            'test_dates': test_df['date'].iloc[self.sequence_length:].values
        }

        return results

    def evaluate_predictions(self, results):
        """
        Score each model's predictions against the actual values.

        Prints one line per model plus the model with the lowest RMSE.

        Returns:
            dict: ``{model_key: {'RMSE': ..., 'MAE': ..., 'R²': ...}}`` for
            ``'rf'``, ``'lstm'`` and ``'hybrid'``.
        """
        print("\n모델 성능 평가:")
        print("=" * 50)

        display_names = {
            'rf': 'Random Forest',
            'lstm': 'LSTM',
            'hybrid': 'RF+LSTM Hybrid',
        }
        truth = results['actual']

        scores = {}
        for key, label in display_names.items():
            forecast = results[key]
            metrics = {
                'RMSE': np.sqrt(mean_squared_error(truth, forecast)),
                'MAE': mean_absolute_error(truth, forecast),
                'R²': r2_score(truth, forecast),
            }
            scores[key] = metrics
            print(f"{label:<20} RMSE: {metrics['RMSE']:.3f}°C  MAE: {metrics['MAE']:.3f}°C  R²: {metrics['R²']:.3f}")

        # Model with the smallest RMSE
        winner = min(scores, key=lambda key: scores[key]['RMSE'])
        print("=" * 50)
        print(f"최고 성능: {display_names[winner]} (RMSE: {scores[winner]['RMSE']:.3f}°C)")

        return scores

    def plot_training_history(self):
        """
        LSTM 훈련 과정 시각화
        """
        if self.history is None:
            print("훈련 기록이 없습니다.")
            return

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Loss 그래프
        axes[0].plot(self.history.history['loss'], label='Training Loss', linewidth=2, color='blue')
        axes[0].plot(self.history.history['val_loss'], label='Validation Loss', linewidth=2, color='red')
        axes[0].set_title('LSTM Model Training Loss', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss (MSE)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # MAE 그래프
        axes[1].plot(self.history.history['mae'], label='Training MAE', linewidth=2, color='blue')
        axes[1].plot(self.history.history['val_mae'], label='Validation MAE', linewidth=2, color='red')
        axes[1].set_title('LSTM Model Training MAE', fontsize=14, fontweight='bold')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('MAE')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def visualize_predictions(self, results, n_days=180):
        """
        Draw a 2x2 diagnostic figure for the prediction results.

        Panels: time-series comparison of the last ``n_days`` points, hybrid
        actual-vs-predicted scatter, hybrid residuals, and the error
        distribution of each model. Residual mean and standard deviation of
        the hybrid model are printed afterwards.

        Args:
            results (dict): Output of ``predict_hybrid``.
            n_days (int): Maximum number of trailing points in the time-series
                panel.
        """
        print("\n예측 결과 시각화 중...")

        # Colour per series
        palette = {
            'actual': '#2c3e50',
            'rf': '#e74c3c',
            'lstm': '#9b59b6',
            'hybrid': '#27ae60',
        }
        actual = results['actual']
        hybrid = results['hybrid']

        _, grid = plt.subplots(2, 2, figsize=(20, 12))
        (series_ax, scatter_ax), (resid_ax, hist_ax) = grid

        # Only the most recent n_days points are drawn
        n_show = min(n_days, len(actual))

        # 1. Time-series comparison: (key, legend label, line width, opacity)
        traces = (
            ('actual', 'Actual', 2, 0.9),
            ('rf', 'Random Forest', 1.5, 0.7),
            ('lstm', 'LSTM', 1.5, 0.7),
            ('hybrid', 'RF+LSTM Hybrid', 2, 0.8),
        )
        for key, label, width, opacity in traces:
            series_ax.plot(results[key][-n_show:], color=palette[key],
                           linewidth=width, label=label, alpha=opacity)
        series_ax.set_title(f'Temperature Prediction Comparison (Last {n_show} days)',
                            fontsize=14, fontweight='bold')
        series_ax.set_xlabel('Days')
        series_ax.set_ylabel('Temperature (°C)')
        series_ax.legend()
        series_ax.grid(True, alpha=0.3)

        # 2. Hybrid model scatter with the identity line
        scatter_ax.scatter(actual, hybrid, alpha=0.6, s=20, color=palette['hybrid'])
        lower = min(actual.min(), hybrid.min())
        upper = max(actual.max(), hybrid.max())
        scatter_ax.plot([lower, upper], [lower, upper], 'r--', lw=2, alpha=0.8)
        r2_hybrid = r2_score(actual, hybrid)
        scatter_ax.set_title(f'RF+LSTM Hybrid: Actual vs Predicted (R² = {r2_hybrid:.3f})',
                             fontsize=14, fontweight='bold')
        scatter_ax.set_xlabel('Actual Temperature (°C)')
        scatter_ax.set_ylabel('Predicted Temperature (°C)')
        scatter_ax.grid(True, alpha=0.3)

        # 3. Residuals of the hybrid model
        residuals = actual - hybrid
        resid_ax.scatter(hybrid, residuals, alpha=0.6, s=20, color='green')
        resid_ax.axhline(y=0, color='r', linestyle='--', lw=2, alpha=0.8)
        resid_ax.set_title('Hybrid Model Residual Analysis', fontsize=14, fontweight='bold')
        resid_ax.set_xlabel('Predicted Temperature (°C)')
        resid_ax.set_ylabel('Residual (Actual - Predicted)')
        resid_ax.grid(True, alpha=0.3)

        # 4. Error histograms, one per model
        for key, label in (('rf', 'Random Forest'), ('lstm', 'LSTM'), ('hybrid', 'RF+LSTM Hybrid')):
            errors = actual - results[key]
            hist_ax.hist(errors, bins=30, alpha=0.6, label=f'{label} (σ={np.std(errors):.2f})',
                         color=palette[key], density=True)
        hist_ax.set_title('Prediction Error Distribution', fontsize=14, fontweight='bold')
        hist_ax.set_xlabel('Error (°C)')
        hist_ax.set_ylabel('Density')
        hist_ax.legend()
        hist_ax.grid(True, alpha=0.3)
        hist_ax.axvline(x=0, color='red', linestyle='--', alpha=0.8)

        plt.tight_layout()
        plt.show()

        # Summary of the hybrid residuals
        print("\n상세 성능 분석:")
        print(f"잔차 평균: {residuals.mean():.3f}°C")
        print(f"잔차 표준편차: {residuals.std():.3f}°C")

    def predict_future(self, days_ahead=30):
        """
        미래 기온 예측
        """
        if self.data is None:
            print("데이터가 로드되지 않았습니다.")
            return

        print(f"\n미래 {days_ahead}일 기온 예측 중...")

        # 최근 데이터 준비
        recent_data = self.data.tail(max(60, self.sequence_length + 10)).copy()
        future_predictions = []
        future_dates = []

        for day in range(days_ahead):
            # 다음 날짜
            last_date = recent_data['date'].iloc[-1]
            next_date = last_date + pd.Timedelta(days=1)
            future_dates.append(next_date)

            # Random Forest 예측
            X_rf = self.prepare_rf_features(recent_data.tail(1))
            X_rf_scaled = self.scaler_rf_features.transform(X_rf)
            rf_pred = self.rf_model.predict(X_rf_scaled)[0]

            # LSTM 예측
            lstm_sequence = recent_data.tail(self.sequence_length)
            lstm_features = lstm_sequence[self.lstm_feature_cols].values
            X_lstm = lstm_features.reshape(1, self.sequence_length, -1)
            lstm_pred_scaled = self.lstm_model.predict(X_lstm, verbose=0)
            lstm_pred = self.scaler_lstm_target.inverse_transform(lstm_pred_scaled)[0, 0]

            # 하이브리드 예측
            hybrid_pred = 0.4 * rf_pred + 0.6 * lstm_pred
            future_predictions.append(hybrid_pred)

            # 다음 예측을 위해 데이터 업데이트
            new_row = pd.DataFrame({
                'date': [next_date],
                'area': [114],  # 원주 지역 코드
                'mean_tmp': [hybrid_pred],
                'min_tmp': [hybrid_pred - 5],  # 간단한 추정
                'max_tmp': [hybrid_pred + 5]   # 간단한 추정
            })

            # 새로운 행에 필요한 특성들 계산
            new_row['year'] = next_date.year
            new_row['month'] = next_date.month
            new_row['day'] = next_date.day
            new_row['day_of_year'] = next_date.dayofyear
            new_row['day_of_week'] = next_date.dayofweek
            new_row['quarter'] = next_date.quarter

            new_row['month_sin'] = np.sin(2 * np.pi * next_date.month / 12)
            new_row['month_cos'] = np.cos(2 * np.pi * next_date.month / 12)
            new_row['day_of_year_sin'] = np.sin(2 * np.pi * next_date.dayofyear / 365)
            new_row['day_of_year_cos'] = np.cos(2 * np.pi * next_date.dayofyear / 365)
            new_row['day_of_week_sin'] = np.sin(2 * np.pi * next_date.dayofweek / 7)
            new_row['day_of_week_cos'] = np.cos(2 * np.pi * next_date.dayofweek / 7)

            new_row['temp_range'] = 10  # 추정값
            new_row['temp_mid'] = hybrid_pred

            # 데이터 추가
            recent_data = pd.concat([recent_data, new_row], ignore_index=True)

            # 이동평균 등 특성 재계산
            for window in [3, 7, 14, 30]:
                recent_data[f'mean_tmp_ma_{window}'] = recent_data['mean_tmp'].rolling(window=window, min_periods=1).mean()
                recent_data[f'temp_range_ma_{window}'] = recent_data['temp_range'].rolling(window=window, min_periods=1).mean()
                recent_data[f'min_tmp_ma_{window}'] = recent_data['min_tmp'].rolling(window=window, min_periods=1).mean()
                recent_data[f'max_tmp_ma_{window}'] = recent_data['max_tmp'].rolling(window=window, min_periods=1).mean()

            for window in [7, 14]:
                recent_data[f'mean_tmp_std_{window}'] = recent_data['mean_tmp'].rolling(window=window, min_periods=1).std()

            recent_data['temp_change_1d'] = recent_data['mean_tmp'].diff(1)
            recent_data['temp_change_3d'] = recent_data['mean_tmp'].diff(3)
            recent_data['temp_change_7d'] = recent_data['mean_tmp'].diff(7)

            monthly_mean = recent_data.groupby('month')['mean_tmp'].transform('mean')
            recent_data['temp_seasonal_anomaly'] = recent_data['mean_tmp'] - monthly_mean

            for lag in [1, 2, 3, 7, 14]:
                recent_data[f'mean_tmp_lag_{lag}'] = recent_data['mean_tmp'].shift(lag)
                recent_data[f'temp_range_lag_{lag}'] = recent_data['temp_range'].shift(lag)

            # 결측값 처리
            numeric_cols = recent_data.select_dtypes(include=[np.number]).columns
            recent_data[numeric_cols] = recent_data[numeric_cols].fillna(method='bfill').fillna(method='ffill')

            if (day + 1) % 7 == 0:
                print(f"   {day+1:2d}일 후 ({next_date.strftime('%Y-%m-%d')}): {hybrid_pred:.1f}°C")

        # 미래 예측 시각화
        plt.figure(figsize=(15, 8))

        # 최근 실제 데이터
        recent_actual = self.data.tail(60)
        plt.plot(recent_actual['date'], recent_actual['mean_tmp'],
                'b-', label='Recent Actual', linewidth=2, alpha=0.8)

        # 미래 예측 데이터
        plt.plot(future_dates, future_predictions,
                'r--', label='RF+LSTM Hybrid Prediction', linewidth=2, marker='o', markersize=4)

        plt.axvline(x=self.data['date'].iloc[-1], color='gray', linestyle=':', alpha=0.7,
                   label='Prediction Start')
        plt.xlabel('Date')
        plt.ylabel('Mean Temperature (°C)')
        plt.title(f'Wonju Temperature Prediction - RF+LSTM Hybrid Model ({days_ahead} days)',
                 fontsize=14, fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        print(f"\n 미래 예측 요약:")
        print(f"   평균 예측 기온: {np.mean(future_predictions):.1f}°C")
        print(f"   최고 예측 기온: {np.max(future_predictions):.1f}°C")
        print(f"   최저 예측 기온: {np.min(future_predictions):.1f}°C")
        print(f"   기온 변동 범위: {np.max(future_predictions) - np.min(future_predictions):.1f}°C")

        return future_predictions, future_dates

    def run_complete_pipeline(self, filepath):
        """
        Run every stage end to end: load, split, train, evaluate, plot, forecast.

        Args:
            filepath: CSV path handed to ``load_and_preprocess_data``.

        Returns:
            dict: ``'data'``, ``'results'``, ``'evaluation'``,
            ``'future_predictions'`` and ``'future_dates'``.
        """
        print("Random Forest + LSTM 하이브리드 모델 파이프라인 시작")

        # 1-2. Load the data and cut it chronologically
        frame = self.load_and_preprocess_data(filepath)
        train_part, val_part, test_part = self.split_data(frame)

        # 3-5. Train both models (each prints its own validation metrics)
        #      and show the LSTM learning curves
        self.train_random_forest(train_part, val_part)
        self.train_lstm(train_part, val_part)
        self.plot_training_history()

        # 6-8. Predict on the test split, score and visualise
        outcome = {'data': frame}
        outcome['results'] = self.predict_hybrid(test_part)
        outcome['evaluation'] = self.evaluate_predictions(outcome['results'])
        self.visualize_predictions(outcome['results'])

        # 9. Forecast beyond the end of the data
        outcome['future_predictions'], outcome['future_dates'] = self.predict_future(days_ahead=30)

        print("\n파이프라인 완료!")

        return outcome




In [None]:
# Usage example
if __name__ == "__main__":
    # Create the predictor
    predictor = RandomForestLSTMPredictor(
        sequence_length=30,      # LSTM window length
        rf_n_estimators=100,     # number of Random Forest trees
        lstm_units=64           # base LSTM width
    )

    # Run the whole pipeline.
    # Change the path below to where the CSV actually lives.
    file_path = '/content/OBS_ASOS_wonju(10y).csv'

    try:
        pipeline_results = predictor.run_complete_pipeline(file_path)
    except FileNotFoundError:
        print(f"파일을 찾을 수 없습니다: {file_path}")
        print(" 파일 경로를 확인하고 다시 시도하세요.")
    except Exception as e:
        import traceback

        print(f" 오류 발생: {str(e)}")
        # The message alone does not show where the failure happened
        # (a KeyError, for instance, prints only the key), so keep the trace.
        traceback.print_exc()
    else:
        print("\n" + "="*60)
        print("Random Forest + LSTM 하이브리드 모델 실행 완료!")
        print("="*60)

        # For further analysis or a longer forecast, e.g.:
        # predictor.predict_future(days_ahead=60)


# 개별 모델 성능 비교를 위한 추가 함수들
def compare_individual_models(predictor, test_df):
    """
    Score the Random Forest and the LSTM separately on ``test_df``.

    The Random Forest is scored on every row; the LSTM only on the rows after
    the first ``predictor.sequence_length`` ones, because it needs that much
    history per prediction.

    Args:
        predictor: A trained ``RandomForestLSTMPredictor``.
        test_df: Feature rows to evaluate on.

    Returns:
        dict: ``{'rf_only': {...}, 'lstm_only': {...}}`` with ``'RMSE'``,
        ``'MAE'`` and ``'R²'`` for each model.
    """
    import copy

    print("\n개별 모델 성능 상세 분석")
    print("="*50)

    # Random Forest alone
    X_test_rf = predictor.prepare_rf_features(test_df)
    X_test_rf_scaled = predictor.scaler_rf_features.transform(X_test_rf)
    rf_only_pred = predictor.rf_model.predict(X_test_rf_scaled)

    # LSTM alone. Building the windows must not change the predictor's fitted
    # target scaler, so a throw-away copy is exposed while the data is prepared
    # and the fitted one is restored afterwards.
    fitted_scaler = predictor.scaler_lstm_target
    predictor.scaler_lstm_target = copy.deepcopy(fitted_scaler)
    try:
        X_test_lstm, _ = predictor.prepare_lstm_data(test_df)
    finally:
        predictor.scaler_lstm_target = fitted_scaler
    lstm_pred_scaled = predictor.lstm_model.predict(X_test_lstm, verbose=0)
    lstm_only_pred = fitted_scaler.inverse_transform(lstm_pred_scaled).flatten()

    # Ground truth: all rows for the forest, post-warm-up rows for the LSTM
    actual_full = test_df['mean_tmp'].values
    actual_lstm = actual_full[predictor.sequence_length:]

    # Random Forest metrics (full test data)
    rf_rmse = np.sqrt(mean_squared_error(actual_full, rf_only_pred))
    rf_mae = mean_absolute_error(actual_full, rf_only_pred)
    rf_r2 = r2_score(actual_full, rf_only_pred)

    # LSTM metrics
    lstm_rmse = np.sqrt(mean_squared_error(actual_lstm, lstm_only_pred))
    lstm_mae = mean_absolute_error(actual_lstm, lstm_only_pred)
    lstm_r2 = r2_score(actual_lstm, lstm_only_pred)

    print(f"Random Forest 단독:")
    print(f"  RMSE: {rf_rmse:.3f}°C, MAE: {rf_mae:.3f}°C, R²: {rf_r2:.3f}")
    print(f"LSTM 단독:")
    print(f"  RMSE: {lstm_rmse:.3f}°C, MAE: {lstm_mae:.3f}°C, R²: {lstm_r2:.3f}")

    return {
        'rf_only': {'RMSE': rf_rmse, 'MAE': rf_mae, 'R²': rf_r2},
        'lstm_only': {'RMSE': lstm_rmse, 'MAE': lstm_mae, 'R²': lstm_r2}
    }


def analyze_seasonal_performance(results):
    """
    Print the hybrid model's RMSE and MAE for each season.

    Seasons are assigned from the month of ``results['test_dates']``. Nothing
    beyond the header is printed when that key is absent, and seasons without
    any matching date are skipped.
    """
    print("\n 계절별 성능 분석")
    print("="*40)

    # Without dates there is nothing to group by
    if 'test_dates' not in results:
        return

    months = pd.to_datetime(results['test_dates']).month
    season_table = (
        ('Spring (3-5월)', [3, 4, 5]),
        ('Summer (6-8월)', [6, 7, 8]),
        ('Autumn (9-11월)', [9, 10, 11]),
        ('Winter (12-2월)', [12, 1, 2]),
    )

    for season_name, season_months in season_table:
        in_season = months.isin(season_months)
        n_days = np.sum(in_season)
        if not n_days > 0:
            continue

        season_actual = results['actual'][in_season]
        season_hybrid = results['hybrid'][in_season]
        rmse = np.sqrt(mean_squared_error(season_actual, season_hybrid))
        mae = mean_absolute_error(season_actual, season_hybrid)

        print(f"{season_name:<15} RMSE: {rmse:.3f}°C  MAE: {mae:.3f}°C  ({n_days:3d}일)")


# 하이퍼파라미터 최적화를 위한 함수
def optimize_hybrid_weights(predictor, val_df, weight_range=np.arange(0.1, 1.0, 0.1)):
    """
    Search for the RF/LSTM blend with the lowest RMSE on ``val_df``.

    Every candidate Random Forest weight ``w`` is paired with an LSTM weight
    of ``1 - w`` and scored through ``predictor.predict_hybrid``.

    Returns:
        tuple: ``(best_weights, results_list)`` where ``best_weights`` is
        ``(rf_weight, lstm_weight)`` (``(0.5, 0.5)`` if no candidate was
        better than infinity) and ``results_list`` holds one
        ``(rf_weight, lstm_weight, rmse)`` tuple per candidate.
    """
    print("\n하이브리드 모델 가중치 최적화")
    print("="*40)

    trials = []
    top_rmse, top_pair = float('inf'), (0.5, 0.5)

    for w_rf in weight_range:
        w_lstm = 1.0 - w_rf

        # Score this blend on the validation data
        outcome = predictor.predict_hybrid(val_df, w_rf, w_lstm)
        score = np.sqrt(mean_squared_error(outcome['actual'], outcome['hybrid']))
        trials.append((w_rf, w_lstm, score))

        # Strict comparison: the first of several equal scores wins
        if score < top_rmse:
            top_rmse, top_pair = score, (w_rf, w_lstm)

        print(f"RF:{w_rf:.1f}, LSTM:{w_lstm:.1f} -> RMSE: {score:.3f}°C")

    print(f"\n최적 가중치: RF {top_pair[0]:.1f}, LSTM {top_pair[1]:.1f}")
    print(f"   최적 RMSE: {top_rmse:.3f}°C")

    return top_pair, trials