<a href="https://colab.research.google.com/github/jylee2930/Basic_BIgDataAnalysis/blob/main/Teampreture(LSTM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade TensorFlow (the leading "!" is an IPython/Colab shell escape, so this
# line only works inside a notebook cell, not in a plain .py script).
!pip install --upgrade tensorflow

# After restarting the runtime, import TensorFlow and print the loaded version.
import tensorflow as tf
print("TensorFlow 버전:", tf.__version__)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

# Matplotlib font configuration.
# NOTE(review): this was originally labelled as a Korean-font setting, but
# DejaVu Sans presumably has no Hangul glyphs — confirm; Korean plot labels may
# render as empty boxes unless a Hangul-capable font is installed and selected.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

# GPU configuration (when GPUs are present): allocate GPU memory on demand.
# The device list is queried once, and memory growth is enabled on every GPU
# rather than only the first one, so the setting stays consistent across devices.
_gpus = tf.config.list_physical_devices('GPU')
print("GPU 사용 가능:", _gpus)
for _gpu in _gpus:
    try:
        tf.config.experimental.set_memory_growth(_gpu, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised (e.g. the cell is re-run);
        # report it instead of aborting the whole cell.
        print(f"GPU memory growth could not be enabled for {_gpu.name}: {err}")

class TemperatureLSTMPredictor:
    def __init__(self, sequence_length=30, prediction_horizon=1):
        """
        LSTM 기온 예측 모델

        Args:
            sequence_length (int): 과거 몇 일의 데이터를 사용할지
            prediction_horizon (int): 몇 일 후를 예측할지
        """
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.model = None
        self.scaler_X = None
        self.scaler_y = None
        self.history = None

    def load_and_preprocess_data(self, file_path):
        """
        데이터 로드 및 전처리
        """
        # CSV 파일 읽기
        df = pd.read_csv('/content/wonju_temp.csv')

        # 날짜 컬럼 정리
        df['date'] = df['date'].str.replace('\t', '').str.strip()
        df = df.dropna(subset=['date', 'mean_tmp', 'min_tmp', 'max_tmp'])
        df = df[df['date'] != '']
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True)

        print(f"데이터 로드 완료: {len(df)}일의 기온 데이터")
        print(f"데이터 기간: {df['date'].min().strftime('%Y-%m-%d')} ~ {df['date'].max().strftime('%Y-%m-%d')}")

        return df

    def create_features(self, df):
        """
        LSTM을 위한 특성 생성
        """
        df = df.copy()

        # 기본 날짜 특성
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['day_of_year'] = df['date'].dt.dayofyear
        df['day_of_week'] = df['date'].dt.dayofweek
        df['quarter'] = df['date'].dt.quarter

        # 계절성 특성 (순환적)
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
        df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
        df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

        # 기온 관련 특성
        df['temp_range'] = df['max_tmp'] - df['min_tmp']
        df['temp_mid'] = (df['max_tmp'] + df['min_tmp']) / 2

        # 이동평균 특성
        for window in [3, 7, 14, 30]:
            df[f'mean_tmp_ma_{window}'] = df['mean_tmp'].rolling(window=window, min_periods=1).mean()
            df[f'temp_range_ma_{window}'] = df['temp_range'].rolling(window=window, min_periods=1).mean()

        # 이동표준편차
        for window in [7, 14]:
            df[f'mean_tmp_std_{window}'] = df['mean_tmp'].rolling(window=window, min_periods=1).std()

        # 기온 변화율
        df['temp_change_1d'] = df['mean_tmp'].diff(1)
        df['temp_change_7d'] = df['mean_tmp'].diff(7)

        # 계절별 이상값
        monthly_mean = df.groupby('month')['mean_tmp'].transform('mean')
        df['temp_seasonal_anomaly'] = df['mean_tmp'] - monthly_mean

        # 결측값 처리
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(method='bfill').fillna(method='ffill')

        print(f"특성 생성 완료: 총 {len(df.columns)}개 특성")
        return df

    def prepare_lstm_data(self, df, target_col='mean_tmp'):
        """
        LSTM을 위한 시퀀스 데이터 준비
        """
        # 특성 선택 (날짜, area 제외)
        feature_cols = [col for col in df.columns if col not in ['date', 'area']]

        # 데이터 추출
        data = df[feature_cols].values

        # 타겟 컬럼 인덱스 찾기
        target_idx = df.columns.get_loc(target_col)

        # 정규화
        self.scaler_X = MinMaxScaler()
        self.scaler_y = MinMaxScaler()

        # 전체 특성 정규화
        data_scaled = self.scaler_X.fit_transform(data)

        # 타겟만 따로 정규화
        target_data = df[target_col].values.reshape(-1, 1)
        target_scaled = self.scaler_y.fit_transform(target_data)

        # 시퀀스 데이터 생성
        X, y = [], []

        for i in range(self.sequence_length, len(data_scaled) - self.prediction_horizon + 1):
            # 과거 sequence_length 일의 모든 특성
            X.append(data_scaled[i-self.sequence_length:i])
            # prediction_horizon 일 후의 타겟
            y.append(target_scaled[i + self.prediction_horizon - 1])

        X = np.array(X)
        y = np.array(y)

        print(f"LSTM 데이터 준비 완료:")
        print(f"X shape: {X.shape} (samples, timesteps, features)")
        print(f"y shape: {y.shape}")

        return X, y, feature_cols

    def split_data(self, X, y, train_ratio=0.7, val_ratio=0.15):
        """
        시계열 데이터 분할 (시간 순서 유지)
        """
        n_samples = len(X)
        train_size = int(n_samples * train_ratio)
        val_size = int(n_samples * val_ratio)

        X_train = X[:train_size]
        y_train = y[:train_size]

        X_val = X[train_size:train_size + val_size]
        y_val = y[train_size:train_size + val_size]

        X_test = X[train_size + val_size:]
        y_test = y[train_size + val_size:]

        print(f"데이터 분할 완료:")
        print(f"훈련: {len(X_train)}, 검증: {len(X_val)}, 테스트: {len(X_test)}")

        return X_train, X_val, X_test, y_train, y_val, y_test

    def build_model(self, input_shape):
        """
        Assemble and compile the stacked-LSTM regression network.

        Architecture: three LSTM layers (128, 64, 32 units), each followed by
        batch normalisation, then a dense head (64, 32, 16 units with ReLU and
        dropout after the first two) ending in a single linear output unit.

        Args:
            input_shape: shape of one input sample, passed to the first LSTM layer.

        Returns:
            The compiled model (Adam with learning rate 0.001, MSE loss, MAE metric).
        """
        net = Sequential()

        # Recurrent stack: only the last LSTM collapses the sequence to a vector.
        lstm_units = (128, 64, 32)
        last_depth = len(lstm_units) - 1
        for depth, units in enumerate(lstm_units):
            first_layer_args = {'input_shape': input_shape} if depth == 0 else {}
            net.add(LSTM(
                units,
                return_sequences=depth < last_depth,
                dropout=0.2,
                recurrent_dropout=0.2,
                **first_layer_args,
            ))
            net.add(BatchNormalization())

        # Dense head: (units, dropout rate applied afterwards or None).
        for units, drop_rate in ((64, 0.3), (32, 0.2), (16, None)):
            net.add(Dense(units, activation='relu'))
            if drop_rate is not None:
                net.add(Dropout(drop_rate))

        # Output layer.
        net.add(Dense(1))

        # Compile.
        net.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

        print("\n=== 모델 구조 ===")
        net.summary()

        return net

    def train_model(self, X_train, y_train, X_val, y_val, epochs=200, batch_size=32):
        """
        Fit ``self.model`` on the training data while monitoring validation loss.

        Args:
            X_train, y_train: training inputs and targets.
            X_val, y_val: validation inputs and targets.
            epochs (int): maximum number of epochs.
            batch_size (int): mini-batch size.

        Returns:
            The object returned by ``self.model.fit``; it is also stored in
            ``self.history``.
        """
        # All three callbacks watch the validation loss.
        early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
        lr_decay = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-7, verbose=1)
        checkpoint = ModelCheckpoint('best_lstm_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

        print(f"\n모델 학습 시작 (최대 {epochs} 에폭)...")

        # Run the training loop.
        fit_options = dict(
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stop, lr_decay, checkpoint],
            verbose=1,
        )
        self.history = self.model.fit(X_train, y_train, **fit_options)

        print("모델 학습 완료!")

        return self.history

    def evaluate_model(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """
        Score the trained model on the train, validation and test splits.

        Targets and predictions are mapped back to the original scale with
        ``self.scaler_y`` before RMSE, MAE and R² are computed and printed.

        Returns:
            dict: ``{'train' | 'val' | 'test': (actual, predicted)}`` holding the
            inverse-transformed arrays of each split.
        """
        unscale = self.scaler_y.inverse_transform
        splits = {
            'train': (X_train, y_train),
            'val': (X_val, y_val),
            'test': (X_test, y_test),
        }

        # Predict every split first ...
        scaled_predictions = {
            name: self.model.predict(inputs, verbose=0)
            for name, (inputs, _) in splits.items()
        }
        # ... then bring the targets and the predictions back to the original scale.
        actuals = {name: unscale(targets) for name, (_, targets) in splits.items()}
        outcome = {
            name: (actuals[name], unscale(scaled_predictions[name]))
            for name in splits
        }

        # RMSE / MAE / R² per split.
        scores = {}
        for name, (actual, predicted) in outcome.items():
            scores[name] = (
                np.sqrt(mean_squared_error(actual, predicted)),
                mean_absolute_error(actual, predicted),
                r2_score(actual, predicted),
            )
        train_rmse, train_mae, train_r2 = scores['train']
        val_rmse, val_mae, val_r2 = scores['val']
        test_rmse, test_mae, test_r2 = scores['test']

        print("\n=== 모델 성능 평가 ===")
        print(f"훈련   - RMSE: {train_rmse:.3f}°C, MAE: {train_mae:.3f}°C, R²: {train_r2:.3f}")
        print(f"검증   - RMSE: {val_rmse:.3f}°C, MAE: {val_mae:.3f}°C, R²: {val_r2:.3f}")
        print(f"테스트 - RMSE: {test_rmse:.3f}°C, MAE: {test_mae:.3f}°C, R²: {test_r2:.3f}")

        return outcome

    def plot_training_history(self):
        """
        학습 과정 시각화
        """
        if self.history is None:
            print("학습 기록이 없습니다.")
            return

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Loss 그래프
        axes[0].plot(self.history.history['loss'], label='Training Loss', linewidth=2)
        axes[0].plot(self.history.history['val_loss'], label='Validation Loss', linewidth=2)
        axes[0].set_title('Model Loss', fontsize=14)
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss (MSE)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # MAE 그래프
        axes[1].plot(self.history.history['mae'], label='Training MAE', linewidth=2)
        axes[1].plot(self.history.history['val_mae'], label='Validation MAE', linewidth=2)
        axes[1].set_title('Model MAE', fontsize=14)
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('MAE')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def visualize_predictions(self, results, df_dates, train_size, val_size):
        """
        Draw a 2x2 diagnostic figure for the predictions and print residual statistics.

        Panels: (1) actual series of all splits plus the test predictions,
        (2) actual vs. predicted scatter on the test split, (3) residuals vs.
        predictions, (4) close-up of the last (up to 60) test points.

        Args:
            results (dict): mapping with 'train', 'val' and 'test' keys, each an
                ``(actual, predicted)`` pair of arrays in the original scale.
            df_dates: dates of the feature frame the samples were built from,
                in row order.
            train_size (int): number of training samples.
            val_size (int): number of validation samples.
        """
        fig, axes = plt.subplots(2, 2, figsize=(18, 12))

        y_train_orig, y_pred_train_orig = results['train']
        y_val_orig, y_pred_val_orig = results['val']
        y_test_orig, y_pred_test_orig = results['test']

        # Date of each sample's target. The first target sits
        # ``sequence_length + prediction_horizon - 1`` rows into the frame
        # (the horizon was previously ignored, which misaligned the dates
        # for horizons larger than one day).
        first_target = self.sequence_length + self.prediction_horizon - 1
        val_start = first_target + train_size
        test_start = val_start + val_size
        train_dates = df_dates[first_target:val_start]
        val_dates = df_dates[val_start:test_start]
        test_dates = df_dates[test_start:test_start + len(y_test_orig)]

        # 1. Whole time series with the test predictions.
        axes[0, 0].plot(train_dates, y_train_orig.flatten(), label='train', alpha=0.7, linewidth=1)
        axes[0, 0].plot(val_dates, y_val_orig.flatten(), label='validation', alpha=0.8, linewidth=1.5)
        axes[0, 0].plot(test_dates, y_test_orig.flatten(), label='test', linewidth=2)
        axes[0, 0].plot(test_dates, y_pred_test_orig.flatten(), label='predict(test)',
                       linestyle='--', linewidth=2)
        axes[0, 0].set_title('Time Series Predict Result', fontsize=14)
        axes[0, 0].set_xlabel('date')
        axes[0, 0].set_ylabel('Mean_temp (°C)')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)

        # 2. Actual vs. predicted (test split); the dashed line marks a perfect fit.
        axes[0, 1].scatter(y_test_orig, y_pred_test_orig, alpha=0.6, s=20)
        min_val, max_val = y_test_orig.min(), y_test_orig.max()
        axes[0, 1].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
        axes[0, 1].set_title('Real vs Predict (Test)', fontsize=14)
        axes[0, 1].set_xlabel('Real_mean_temp (°C)')
        axes[0, 1].set_ylabel('Predict_mean_temp (°C)')
        axes[0, 1].grid(True, alpha=0.3)

        # 3. Residual analysis (actual minus predicted).
        residuals = y_test_orig.flatten() - y_pred_test_orig.flatten()
        axes[1, 0].scatter(y_pred_test_orig, residuals, alpha=0.6, s=20, color='green')
        axes[1, 0].axhline(y=0, color='r', linestyle='--', lw=2)
        axes[1, 0].set_title('Residual Analysis', fontsize=14)
        axes[1, 0].set_xlabel('Predict Mean temp (°C)')
        axes[1, 0].set_ylabel('resudual (real-predict)')
        axes[1, 0].grid(True, alpha=0.3)

        # 4. Close-up of the most recent test predictions.
        recent_period = min(60, len(test_dates))
        axes[1, 1].plot(test_dates[-recent_period:], y_test_orig[-recent_period:].flatten(),
                       label='real', linewidth=2, marker='o', markersize=3)
        axes[1, 1].plot(test_dates[-recent_period:], y_pred_test_orig[-recent_period:].flatten(),
                       label='predict', linewidth=2, marker='s', markersize=3, linestyle='--')
        axes[1, 1].set_title('Predict Result', fontsize=14)
        axes[1, 1].set_xlabel('date')
        axes[1, 1].set_ylabel('Mean-temp (°C)')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        plt.setp(axes[1, 1].xaxis.get_majorticklabels(), rotation=45)

        plt.tight_layout()
        plt.show()

        # Residual summary.
        print("\n=== 상세 성능 분석 ===")
        print(f"잔차 평균: {residuals.mean():.3f}°C")
        print(f"잔차 표준편차: {residuals.std():.3f}°C")
        print(f"절대 오차 중앙값: {np.median(np.abs(residuals)):.3f}°C")
        # This figure is the 95th percentile of the absolute residuals.
        print(f"95% 예측 구간: ±{np.percentile(np.abs(residuals), 95):.3f}°C")

    def predict_future(self, df, days_ahead=30):
        """
        미래 기온 예측
        """
        print(f"\n미래 {days_ahead}일간 기온 예측 중...")

        # 최근 데이터 가져오기
        recent_data = df.tail(self.sequence_length + days_ahead).copy()

        predictions = []
        prediction_dates = []

        for i in range(days_ahead):
            # 다음 날짜
            last_date = recent_data['date'].iloc[-1]
            next_date = last_date + pd.Timedelta(days=1)
            prediction_dates.append(next_date)

            # 예측을 위한 시퀀스 데이터 준비
            sequence_data = recent_data.tail(self.sequence_length)

            # 특성 생성
            sequence_data_with_features = self.create_features(sequence_data)
            feature_cols = [col for col in sequence_data_with_features.columns if col not in ['date', 'area']]

            # 정규화
            sequence_scaled = self.scaler_X.transform(sequence_data_with_features[feature_cols].values)
            X_pred = sequence_scaled.reshape(1, self.sequence_length, -1)

            # 예측
            pred_scaled = self.model.predict(X_pred, verbose=0)
            pred_temp = self.scaler_y.inverse_transform(pred_scaled)[0, 0]
            predictions.append(pred_temp)

            # 다음 예측을 위해 데이터 업데이트
            new_row = pd.DataFrame({
                'date': [next_date],
                'area': [114],  # 원주 지역 코드
                'mean_tmp': [pred_temp],
                'min_tmp': [pred_temp - 5],  # 간단한 추정
                'max_tmp': [pred_temp + 5]   # 간단한 추정
            })

            recent_data = pd.concat([recent_data, new_row], ignore_index=True)

            if i % 7 == 0:
                print(f"{i+1}일 후 ({next_date.strftime('%Y-%m-%d')}): {pred_temp:.1f}°C")

        # 예측 결과 시각화
        plt.figure(figsize=(15, 8))

        # 최근 실제 데이터
        recent_actual = df.tail(60)
        plt.plot(recent_actual['date'], recent_actual['mean_tmp'],
                'b-', label='Reeal temp', linewidth=2, alpha=0.8)

        # 예측 데이터
        plt.plot(prediction_dates, predictions,
                'r--', label='LSTM Predict', linewidth=2, marker='o', markersize=4)

        plt.axvline(x=df['date'].iloc[-1], color='gray', linestyle=':', alpha=0.7, label='예측 시작점')
        plt.xlabel('date')
        plt.ylabel('Mean_temp (°C)')
        plt.title(f'Wonju LSTM Predict ({days_ahead}day)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        print(f"\n예측 요약:")
        print(f"평균 예측 기온: {np.mean(predictions):.1f}°C")
        print(f"최고 예측 기온: {np.max(predictions):.1f}°C")
        print(f"최저 예측 기온: {np.min(predictions):.1f}°C")

        return predictions, prediction_dates

    def fit(self, file_path, epochs=200, batch_size=32):
        """
        Run the whole workflow: load, featurise, train, evaluate, plot, forecast.

        Args:
            file_path: CSV location handed to ``load_and_preprocess_data``.
            epochs (int): maximum number of training epochs.
            batch_size (int): mini-batch size used for training.

        Returns:
            tuple: ``(df_with_features, results, predictions, dates)`` — the
            feature frame, the per-split evaluation results, and the 30-day
            forecast values with their dates.
        """
        print("=== 원주 기온 LSTM 예측 시스템 ===\n")

        # Step 1: load the raw data and derive the features.
        raw_frame = self.load_and_preprocess_data(file_path)
        df_with_features = self.create_features(raw_frame)

        # Step 2: scaled sliding windows (the feature-name list is not needed here).
        X, y, _ = self.prepare_lstm_data(df_with_features)

        # Step 3: chronological train / validation / test split.
        splits = self.split_data(X, y)
        X_train, X_val, X_test, y_train, y_val, y_test = splits

        # Step 4: build the network for (timesteps, features) inputs.
        n_timesteps, n_features = X.shape[1], X.shape[2]
        self.model = self.build_model((n_timesteps, n_features))

        # Step 5: train, then Step 6: show the learning curves.
        self.train_model(X_train, y_train, X_val, y_val, epochs, batch_size)
        self.plot_training_history()

        # Step 7: evaluate on all three splits.
        results = self.evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test)

        # Step 8: prediction diagnostics plotted against the dates.
        self.visualize_predictions(
            results,
            df_with_features['date'].values,
            len(X_train),
            len(X_val),
        )

        # Step 9: forecast the next 30 days.
        predictions, dates = self.predict_future(df_with_features, days_ahead=30)

        return df_with_features, results, predictions, dates



In [None]:
# 사용 예제
def main():
    """
    Train the predictor on 'wonju_temp.csv' and return everything it produced.

    Returns:
        tuple: ``(predictor, df, results, predictions, dates)``.
    """
    # Use the past 30 days as input and predict 1 day ahead.
    window_config = {'sequence_length': 30, 'prediction_horizon': 1}
    predictor = TemperatureLSTMPredictor(**window_config)

    # Train for 30 epochs with a batch size of 32, then evaluate and forecast.
    outputs = predictor.fit('wonju_temp.csv', epochs=30, batch_size=32)
    df, results, predictions, dates = outputs

    return predictor, df, results, predictions, dates

# Entry point: run the example when this file/cell is executed directly.
if __name__ == "__main__":
    # Install the required libraries before running:
    # pip install tensorflow pandas numpy matplotlib seaborn scikit-learn

    predictor, df, results, predictions, dates = main()