In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import copy

In [2]:
# Colab only: mount Google Drive at /content/drive so the CSV files read
# further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 1. 라이브러리
import pandas as pd, numpy as np
from tqdm import tqdm

#──────────────────────────────────────────────────────────────
# 2. Load and preprocess the data (observations + forecasts)
# Both CSVs are read from the mounted Google Drive; their columns are renamed
# positionally, so the file column order must match the lists below.
obs  = pd.read_csv('/content/drive/MyDrive/산업부/데이터_분석과제_7_기상관측데이터_2401_2503.csv')
pred = pd.read_csv('/content/drive/MyDrive/산업부/데이터_분석과제_7_기상예측데이터_2401_2503.csv')
obs.columns  = ['datetime', 'humidity_obs', 'temp_obs', 'pressure_obs']
pred.columns = ['datetime', 'solar_rad_pred', 'humidity_pred', 'absolute_humidity_pred', 'temp_pred', 'pressure_pred']

# Unify units (pressure: mmHg → hPa).
# NOTE(review): assumes the observed pressure is stored in mmHg and the
# forecast pressure already in hPa — confirm against the data dictionary.
obs['pressure_obs'] = obs['pressure_obs'] * 1.33322
obs['datetime']     = pd.to_datetime(obs['datetime'])
pred['datetime']    = pd.to_datetime(pred['datetime'])

# pd.merge defaults to an inner join: only timestamps present in both frames survive.
df = pd.merge(obs, pred, on='datetime')

# Error variables: forecast minus observation
df['temp_error']     = df['temp_pred']     - df['temp_obs']
df['humidity_error'] = df['humidity_pred'] - df['humidity_obs']
df['pressure_error'] = df['pressure_pred'] - df['pressure_obs']

# Time-derived features
df['hour']  = df['datetime'].dt.hour
df['month'] = df['datetime'].dt.month
df['day']   = df['datetime'].dt.day
df['year']  = df['datetime'].dt.year

# Cyclical sine/cosine encodings of hour (period 24) and month (period 12).
df['hour_sin']  = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos']  = np.cos(2 * np.pi * df['hour'] / 24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

#──────────────────────────────────────────────────────────────
# 3. 이상치 제거
def get_outlier_indices(series):
    """Return the index labels of values lying outside the 1.5 x IQR fences.

    A value is flagged when it is strictly below ``Q1 - 1.5 * IQR`` or strictly
    above ``Q3 + 1.5 * IQR``. NaN compares False on both sides, so missing
    values are never flagged.

    Args:
        series: numeric pandas Series.

    Returns:
        pandas Index holding the labels of the flagged entries.
    """
    q_low = series.quantile(0.25)
    q_high = series.quantile(0.75)
    fence = 1.5 * (q_high - q_low)

    too_low = series < q_low - fence
    too_high = series > q_high + fence
    return series[too_low | too_high].index

# Union of the row labels flagged as outliers in any of the three error columns.
# Note: the IQR fences are computed over the whole merged frame `df`.
out_idx = set(get_outlier_indices(df['pressure_error']))
out_idx |= set(get_outlier_indices(df['temp_error']))
out_idx |= set(get_outlier_indices(df['humidity_error']))

# `drop_outliers`: flagged rows removed and the index renumbered from 0.
# `ori_data`: independent copy of the full frame with outliers kept.
drop_outliers = df.drop(index=out_idx).reset_index(drop=True)
ori_data = df.copy()

In [4]:
# Make sure the datetime columns have datetime dtype
drop_outliers["datetime"] = pd.to_datetime(drop_outliers["datetime"])
ori_data["datetime"] = pd.to_datetime(ori_data["datetime"])

# Chronological split.
# Train: calendar year 2024, taken from the outlier-filtered frame.
train_df = drop_outliers[
    (drop_outliers["datetime"] >= "2024-01-01") & (drop_outliers["datetime"] < "2025-01-01")
]

# Validation: two months (January-February 2025), taken from the unfiltered frame.
val_df = ori_data[
    (ori_data["datetime"] >= "2025-01-01") & (ori_data["datetime"] < "2025-03-01")
]
# Test: March 2025, also taken from the unfiltered frame.
test_df = ori_data[
    (ori_data["datetime"] >= "2025-03-01") & (ori_data["datetime"] < "2025-04-01")
]

In [5]:
# Model input columns: forecast variables, cyclical time encodings,
# then the raw calendar fields.
input_cols = [
    # forecast variables
    "temp_pred",
    "humidity_pred",
    "pressure_pred",
    "absolute_humidity_pred",
    "solar_rad_pred",
    # cyclical time encodings
    "hour_sin",
    "hour_cos",
    "month_sin",
    "month_cos",
    # raw calendar fields
    "month",
    "day",
    "hour",
]

In [6]:
# 필요한 함수/클래스 정의
class ForecastDataset(Dataset):
    """In-memory dataset pairing feature rows with target rows.

    Both inputs are converted to float32 tensors once, at construction time;
    indexing returns an ``(X[idx], y[idx])`` tuple.
    """

    def __init__(self, X, y):
        features, targets = (
            torch.tensor(array, dtype=torch.float32) for array in (X, y)
        )
        self.X = features
        self.y = targets

    def __len__(self):
        # Number of samples = length of the feature tensor's first dimension.
        return len(self.X)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

class ForecastMLP(nn.Module):
    """Three-layer fully connected regressor with ReLU activations.

    Layer widths: ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.
    The layers live in ``self.net`` (an ``nn.Sequential``), so parameters are
    named ``net.0.*``, ``net.2.*`` and ``net.4.*`` in the state dict.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=1):
        super().__init__()
        bottleneck_dim = hidden_dim // 2
        # Layers are constructed in forward order so parameter initialisation
        # draws from the RNG in the same sequence as the layers are applied.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Linear(bottleneck_dim, output_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        output = self.net(x)
        return output

class RMSE_MAE_Loss(nn.Module):
    """Loss equal to the arithmetic mean of the batch RMSE and the batch MAE."""

    def forward(self, pred, target):
        residual = pred - target
        rmse = residual.pow(2).mean().sqrt()
        mae = residual.abs().mean()
        return 0.5 * (rmse + mae)

def evaluate_single_target(y_true, y_pred, scaler_y, weight=1.0):
    """Score predictions in original units as the weighted mean of RMSE and MAE.

    Both arrays are first mapped back through ``scaler_y.inverse_transform``,
    so the returned score is expressed in the target's unscaled units.

    Args:
        y_true: scaled ground-truth values.
        y_pred: scaled predictions, same shape as ``y_true``.
        scaler_y: object exposing ``inverse_transform``.
        weight: multiplier applied to both the RMSE and the MAE term.

    Returns:
        ``(weight * RMSE + weight * MAE) / 2``.
    """
    actual = scaler_y.inverse_transform(y_true)
    forecast = scaler_y.inverse_transform(y_pred)
    residual = actual - forecast

    rmse = np.sqrt(np.mean(np.square(residual)))
    mae = np.mean(np.abs(residual))
    return (weight * rmse + weight * mae) / 2

In [7]:
def _collect_predictions(model, loader):
    """Run ``model`` over ``loader`` in eval mode, without gradients.

    Returns:
        (y_true, y_pred): NumPy arrays stacked row-wise over all batches.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in loader:
            preds.append(model(xb).numpy())
            trues.append(yb.numpy())
    return np.vstack(trues), np.vstack(preds)


def train_and_evaluate_model(train_df, val_df, test_df, input_cols, target_col, lr, factor,
                             return_val_score=False):
    """Train one ForecastMLP for a single target and score it on the test split.

    Features and target are standardised with scalers fitted on the training
    split only. Training uses Adam with a ReduceLROnPlateau schedule and early
    stopping; both are driven by the validation score returned by
    ``evaluate_single_target`` (i.e. measured after inverse-scaling). The
    weights with the lowest validation score are restored before the test
    evaluation.

    Args:
        train_df, val_df, test_df: DataFrames containing ``input_cols`` and
            ``target_col``.
        input_cols: list of feature column names.
        target_col: name of the single target column.
        lr: initial Adam learning rate.
        factor: multiplicative LR decay passed to ReduceLROnPlateau.
        return_val_score: when True, also return the best validation score so
            that callers can select hyperparameters without looking at the
            test split. Defaults to False (original return value).

    Returns:
        ``test_score``, or ``(test_score, best_val_score)`` when
        ``return_val_score`` is True.

    Raises:
        RuntimeError: if the validation score never improved on ``inf``
            (for example because it was NaN every epoch), so there are no
            weights to restore.
    """
    # Split features / target (target kept 2-D for the scaler)
    target_cols = [target_col]
    X_train, y_train = train_df[input_cols], train_df[target_cols]
    X_val, y_val     = val_df[input_cols], val_df[target_cols]
    X_test, y_test   = test_df[input_cols], test_df[target_cols]

    # Standardise: fit on train only, then apply to val / test
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_val   = scaler_X.transform(X_val)
    X_test  = scaler_X.transform(X_test)
    y_train = scaler_y.fit_transform(y_train)
    y_val   = scaler_y.transform(y_val)
    y_test  = scaler_y.transform(y_test)

    # Data loaders (only the training loader shuffles)
    train_loader = DataLoader(ForecastDataset(X_train, y_train), batch_size=64, shuffle=True)
    val_loader   = DataLoader(ForecastDataset(X_val, y_val), batch_size=64)
    test_loader  = DataLoader(ForecastDataset(X_test, y_test), batch_size=64)

    # Model / loss / optimiser / scheduler
    model = ForecastMLP(input_dim=len(input_cols), output_dim=1)
    criterion = RMSE_MAE_Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # `verbose` is intentionally not passed: it is deprecated in recent PyTorch
    # releases and False was already the default.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', patience=20, factor=factor
    )

    # Training loop with early stopping on the validation score
    num_epochs = 1000
    patience = 50
    best_val_score = float("inf")
    best_model_state = None
    epochs_no_improve = 0

    for _ in range(num_epochs):
        model.train()
        for xb, yb in train_loader:
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        val_trues, val_preds = _collect_predictions(model, val_loader)
        val_score = evaluate_single_target(val_trues, val_preds, scaler_y)
        scheduler.step(val_score)

        if val_score < best_val_score:
            best_val_score = val_score
            # deepcopy: state_dict() returns references to the live tensors
            best_model_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                break

    if best_model_state is None:
        # A NaN validation score never compares less than inf.
        raise RuntimeError(
            f"Validation score never improved for target={target_col!r} "
            f"(lr={lr}, factor={factor}); last val_score={val_score}"
        )

    # Test evaluation with the best validation checkpoint
    model.load_state_dict(best_model_state)
    test_trues, test_preds = _collect_predictions(model, test_loader)
    test_score = evaluate_single_target(test_trues, test_preds, scaler_y)

    if return_val_score:
        return test_score, best_val_score
    return test_score

# Hyperparameter search
def search_best_hyperparams(train_df, val_df, test_df, input_cols, target_col, lr_list, factor_list):
    """Grid-search ``lr`` x ``factor`` for one target.

    The winning combination is the one with the lowest *validation* score;
    the test score is recorded for reporting only, so the reported test error
    of the selected configuration is not biased by the selection itself.

    Args:
        train_df, val_df, test_df: data splits forwarded to
            ``train_and_evaluate_model``.
        input_cols: list of feature column names.
        target_col: name of the target column.
        lr_list: iterable of learning rates to try.
        factor_list: iterable of ReduceLROnPlateau factors to try.

    Returns:
        (results_df, best_params): ``results_df`` has one row per combination
        with columns ``target``, ``lr``, ``factor``, ``sAError`` (test score)
        and ``val_sAError`` (best validation score). ``best_params`` holds
        ``lr``, ``factor``, ``sAError`` and ``val_sAError`` of the selected
        combination, or is empty when no combination was evaluated.
    """
    best_val_score = float("inf")
    best_params = {}
    results = []

    for lr in lr_list:
        for factor in factor_list:
            score, val_score = train_and_evaluate_model(
                train_df, val_df, test_df, input_cols, target_col, lr, factor,
                return_val_score=True,
            )
            results.append({
                "target": target_col, "lr": lr, "factor": factor,
                "sAError": score, "val_sAError": val_score,
            })
            # Model selection uses the validation score, never the test score.
            if val_score < best_val_score:
                best_val_score = val_score
                best_params = {"lr": lr, "factor": factor, "sAError": score, "val_sAError": val_score}
            print(f"target={target_col} | lr={lr:.5f} | factor={factor:.1f} | sAError={score:.5f}")

    return pd.DataFrame(results), best_params

In [8]:
# Full run
def run_full_search(train_df, val_df, test_df, input_cols,
                    lr_list=None, factor_list=None, target_cols=None):
    """Run the hyperparameter grid search for every target and save the results.

    For each target, ``search_best_hyperparams`` is called with the same
    learning-rate and factor grids. All per-run results are concatenated and
    written to ``search_results.csv``; the best parameters per target are
    written to ``best_params.csv`` (both in the current working directory).

    Args:
        train_df, val_df, test_df: data splits forwarded to the search.
        input_cols: list of feature column names.
        lr_list: learning rates to try. Defaults to the original 8-value grid.
        factor_list: ReduceLROnPlateau factors to try. Defaults to 0.3 ... 0.9
            in steps of 0.1.
        target_cols: target column names to search over. Defaults to
            ``["temp_obs", "humidity_obs", "pressure_obs"]``.

    Returns:
        (final_df, best_params_dict): the concatenated results frame and a
        dict mapping each target name to its best-parameter dict.
    """
    if lr_list is None:
        lr_list = [0.001, 0.0005, 0.0003, 0.0001, 0.00007, 0.00005, 0.00003, 0.00001]
    if factor_list is None:
        # Explicit literals rather than np.arange with a float step, whose
        # length can be off by one because of rounding at the end point.
        factor_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    if target_cols is None:
        target_cols = ["temp_obs", "humidity_obs", "pressure_obs"]

    all_results = []
    best_params_dict = {}

    for target_col in target_cols:
        df_result, best_params = search_best_hyperparams(
            train_df, val_df, test_df, input_cols, target_col, lr_list, factor_list
        )
        all_results.append(df_result)
        best_params_dict[target_col] = best_params

    # pd.concat raises on an empty list, so fall back to an empty frame.
    final_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
    final_df.to_csv("search_results.csv", index=False)

    # Save the best parameters (one row per target)
    pd.DataFrame.from_dict(best_params_dict, orient='index').to_csv("best_params.csv")
    return final_df, best_params_dict

In [9]:
# Launch the search for all targets. One model is trained per
# (target, lr, factor) combination, and each run prints one progress line.
final_results, best_param_dict = run_full_search(train_df, val_df, test_df, input_cols)



target=temp_obs | lr=0.00100 | factor=0.3 | sAError=1.38207




target=temp_obs | lr=0.00100 | factor=0.4 | sAError=1.46564




target=temp_obs | lr=0.00100 | factor=0.5 | sAError=1.45282




target=temp_obs | lr=0.00100 | factor=0.6 | sAError=1.53549




target=temp_obs | lr=0.00100 | factor=0.7 | sAError=1.46896




target=temp_obs | lr=0.00100 | factor=0.8 | sAError=1.54958




target=temp_obs | lr=0.00100 | factor=0.9 | sAError=1.44874




target=temp_obs | lr=0.00050 | factor=0.3 | sAError=1.46587




target=temp_obs | lr=0.00050 | factor=0.4 | sAError=1.47281




target=temp_obs | lr=0.00050 | factor=0.5 | sAError=1.45907




target=temp_obs | lr=0.00050 | factor=0.6 | sAError=1.48211




target=temp_obs | lr=0.00050 | factor=0.7 | sAError=1.40576




target=temp_obs | lr=0.00050 | factor=0.8 | sAError=1.44156




target=temp_obs | lr=0.00050 | factor=0.9 | sAError=1.45020




target=temp_obs | lr=0.00030 | factor=0.3 | sAError=1.32918




target=temp_obs | lr=0.00030 | factor=0.4 | sAError=1.42523




target=temp_obs | lr=0.00030 | factor=0.5 | sAError=1.53235




target=temp_obs | lr=0.00030 | factor=0.6 | sAError=1.51624




target=temp_obs | lr=0.00030 | factor=0.7 | sAError=1.45937




target=temp_obs | lr=0.00030 | factor=0.8 | sAError=1.52388




target=temp_obs | lr=0.00030 | factor=0.9 | sAError=1.46213




target=temp_obs | lr=0.00010 | factor=0.3 | sAError=1.49279




target=temp_obs | lr=0.00010 | factor=0.4 | sAError=1.35854




target=temp_obs | lr=0.00010 | factor=0.5 | sAError=1.46831




target=temp_obs | lr=0.00010 | factor=0.6 | sAError=1.34447




target=temp_obs | lr=0.00010 | factor=0.7 | sAError=1.39802




target=temp_obs | lr=0.00010 | factor=0.8 | sAError=1.57105




target=temp_obs | lr=0.00010 | factor=0.9 | sAError=1.47428




target=temp_obs | lr=0.00007 | factor=0.3 | sAError=1.47118




target=temp_obs | lr=0.00007 | factor=0.4 | sAError=1.52212




target=temp_obs | lr=0.00007 | factor=0.5 | sAError=1.40937




target=temp_obs | lr=0.00007 | factor=0.6 | sAError=1.48167




target=temp_obs | lr=0.00007 | factor=0.7 | sAError=1.45277




target=temp_obs | lr=0.00007 | factor=0.8 | sAError=1.45572




target=temp_obs | lr=0.00007 | factor=0.9 | sAError=1.49858




target=temp_obs | lr=0.00005 | factor=0.3 | sAError=1.50324




target=temp_obs | lr=0.00005 | factor=0.4 | sAError=1.46634




target=temp_obs | lr=0.00005 | factor=0.5 | sAError=1.46648




target=temp_obs | lr=0.00005 | factor=0.6 | sAError=1.42609




target=temp_obs | lr=0.00005 | factor=0.7 | sAError=1.50509




target=temp_obs | lr=0.00005 | factor=0.8 | sAError=1.35568




target=temp_obs | lr=0.00005 | factor=0.9 | sAError=1.37948




target=temp_obs | lr=0.00003 | factor=0.3 | sAError=1.42267




target=temp_obs | lr=0.00003 | factor=0.4 | sAError=1.62119




target=temp_obs | lr=0.00003 | factor=0.5 | sAError=1.38477




target=temp_obs | lr=0.00003 | factor=0.6 | sAError=1.50488




target=temp_obs | lr=0.00003 | factor=0.7 | sAError=1.38072




target=temp_obs | lr=0.00003 | factor=0.8 | sAError=1.51784




target=temp_obs | lr=0.00003 | factor=0.9 | sAError=1.42987




target=temp_obs | lr=0.00001 | factor=0.3 | sAError=1.43982




target=temp_obs | lr=0.00001 | factor=0.4 | sAError=1.48084




target=temp_obs | lr=0.00001 | factor=0.5 | sAError=1.51156




target=temp_obs | lr=0.00001 | factor=0.6 | sAError=1.59418




target=temp_obs | lr=0.00001 | factor=0.7 | sAError=1.47278




target=temp_obs | lr=0.00001 | factor=0.8 | sAError=1.51659




target=temp_obs | lr=0.00001 | factor=0.9 | sAError=1.45147




target=humidity_obs | lr=0.00100 | factor=0.3 | sAError=10.13262




target=humidity_obs | lr=0.00100 | factor=0.4 | sAError=8.98321




target=humidity_obs | lr=0.00100 | factor=0.5 | sAError=9.12175




target=humidity_obs | lr=0.00100 | factor=0.6 | sAError=8.92950




target=humidity_obs | lr=0.00100 | factor=0.7 | sAError=9.53282




target=humidity_obs | lr=0.00100 | factor=0.8 | sAError=9.68229




target=humidity_obs | lr=0.00100 | factor=0.9 | sAError=8.74517




target=humidity_obs | lr=0.00050 | factor=0.3 | sAError=9.29687




target=humidity_obs | lr=0.00050 | factor=0.4 | sAError=9.29001




target=humidity_obs | lr=0.00050 | factor=0.5 | sAError=9.61695




target=humidity_obs | lr=0.00050 | factor=0.6 | sAError=9.37066




target=humidity_obs | lr=0.00050 | factor=0.7 | sAError=9.00107




target=humidity_obs | lr=0.00050 | factor=0.8 | sAError=9.64162




target=humidity_obs | lr=0.00050 | factor=0.9 | sAError=9.25881




target=humidity_obs | lr=0.00030 | factor=0.3 | sAError=9.24933




target=humidity_obs | lr=0.00030 | factor=0.4 | sAError=9.15879




target=humidity_obs | lr=0.00030 | factor=0.5 | sAError=9.29624




target=humidity_obs | lr=0.00030 | factor=0.6 | sAError=9.38312




target=humidity_obs | lr=0.00030 | factor=0.7 | sAError=9.83311




target=humidity_obs | lr=0.00030 | factor=0.8 | sAError=9.24898




target=humidity_obs | lr=0.00030 | factor=0.9 | sAError=9.10218




target=humidity_obs | lr=0.00010 | factor=0.3 | sAError=8.90978




target=humidity_obs | lr=0.00010 | factor=0.4 | sAError=9.27101




target=humidity_obs | lr=0.00010 | factor=0.5 | sAError=9.61477




target=humidity_obs | lr=0.00010 | factor=0.6 | sAError=9.76665




target=humidity_obs | lr=0.00010 | factor=0.7 | sAError=9.21149




target=humidity_obs | lr=0.00010 | factor=0.8 | sAError=9.00414




target=humidity_obs | lr=0.00010 | factor=0.9 | sAError=9.58563




target=humidity_obs | lr=0.00007 | factor=0.3 | sAError=9.71790




target=humidity_obs | lr=0.00007 | factor=0.4 | sAError=9.49193




target=humidity_obs | lr=0.00007 | factor=0.5 | sAError=9.44413




target=humidity_obs | lr=0.00007 | factor=0.6 | sAError=9.11985




target=humidity_obs | lr=0.00007 | factor=0.7 | sAError=9.16022




target=humidity_obs | lr=0.00007 | factor=0.8 | sAError=9.46694




target=humidity_obs | lr=0.00007 | factor=0.9 | sAError=8.81136




target=humidity_obs | lr=0.00005 | factor=0.3 | sAError=9.70844




target=humidity_obs | lr=0.00005 | factor=0.4 | sAError=9.69984




target=humidity_obs | lr=0.00005 | factor=0.5 | sAError=9.07948




target=humidity_obs | lr=0.00005 | factor=0.6 | sAError=9.72158




target=humidity_obs | lr=0.00005 | factor=0.7 | sAError=9.55192




target=humidity_obs | lr=0.00005 | factor=0.8 | sAError=9.43870




target=humidity_obs | lr=0.00005 | factor=0.9 | sAError=9.99806




target=humidity_obs | lr=0.00003 | factor=0.3 | sAError=9.58692




target=humidity_obs | lr=0.00003 | factor=0.4 | sAError=9.10363




target=humidity_obs | lr=0.00003 | factor=0.5 | sAError=9.32959




target=humidity_obs | lr=0.00003 | factor=0.6 | sAError=9.43410




target=humidity_obs | lr=0.00003 | factor=0.7 | sAError=9.08614




target=humidity_obs | lr=0.00003 | factor=0.8 | sAError=9.60167




target=humidity_obs | lr=0.00003 | factor=0.9 | sAError=9.81491




target=humidity_obs | lr=0.00001 | factor=0.3 | sAError=9.05279




target=humidity_obs | lr=0.00001 | factor=0.4 | sAError=9.78399




target=humidity_obs | lr=0.00001 | factor=0.5 | sAError=9.53695




target=humidity_obs | lr=0.00001 | factor=0.6 | sAError=9.68965




target=humidity_obs | lr=0.00001 | factor=0.7 | sAError=9.62562




target=humidity_obs | lr=0.00001 | factor=0.8 | sAError=9.47740




target=humidity_obs | lr=0.00001 | factor=0.9 | sAError=9.31168




target=pressure_obs | lr=0.00100 | factor=0.3 | sAError=0.72242




target=pressure_obs | lr=0.00100 | factor=0.4 | sAError=0.78508




target=pressure_obs | lr=0.00100 | factor=0.5 | sAError=0.76861




target=pressure_obs | lr=0.00100 | factor=0.6 | sAError=0.83583




target=pressure_obs | lr=0.00100 | factor=0.7 | sAError=0.79955




target=pressure_obs | lr=0.00100 | factor=0.8 | sAError=0.80178




target=pressure_obs | lr=0.00100 | factor=0.9 | sAError=0.79998




target=pressure_obs | lr=0.00050 | factor=0.3 | sAError=0.78649




target=pressure_obs | lr=0.00050 | factor=0.4 | sAError=0.72724




target=pressure_obs | lr=0.00050 | factor=0.5 | sAError=0.84203




target=pressure_obs | lr=0.00050 | factor=0.6 | sAError=0.82915




target=pressure_obs | lr=0.00050 | factor=0.7 | sAError=0.76342




target=pressure_obs | lr=0.00050 | factor=0.8 | sAError=0.70753




target=pressure_obs | lr=0.00050 | factor=0.9 | sAError=0.80280




target=pressure_obs | lr=0.00030 | factor=0.3 | sAError=0.75470




target=pressure_obs | lr=0.00030 | factor=0.4 | sAError=0.73242




target=pressure_obs | lr=0.00030 | factor=0.5 | sAError=0.76580




target=pressure_obs | lr=0.00030 | factor=0.6 | sAError=0.72983




target=pressure_obs | lr=0.00030 | factor=0.7 | sAError=0.73521




target=pressure_obs | lr=0.00030 | factor=0.8 | sAError=0.73132




target=pressure_obs | lr=0.00030 | factor=0.9 | sAError=0.76939




target=pressure_obs | lr=0.00010 | factor=0.3 | sAError=0.66902




target=pressure_obs | lr=0.00010 | factor=0.4 | sAError=0.78054




target=pressure_obs | lr=0.00010 | factor=0.5 | sAError=0.80494




target=pressure_obs | lr=0.00010 | factor=0.6 | sAError=0.81257




target=pressure_obs | lr=0.00010 | factor=0.7 | sAError=0.85187




target=pressure_obs | lr=0.00010 | factor=0.8 | sAError=0.90231




target=pressure_obs | lr=0.00010 | factor=0.9 | sAError=0.86601




target=pressure_obs | lr=0.00007 | factor=0.3 | sAError=0.81318




target=pressure_obs | lr=0.00007 | factor=0.4 | sAError=0.73167




target=pressure_obs | lr=0.00007 | factor=0.5 | sAError=0.73716




target=pressure_obs | lr=0.00007 | factor=0.6 | sAError=0.74179




target=pressure_obs | lr=0.00007 | factor=0.7 | sAError=0.69468




target=pressure_obs | lr=0.00007 | factor=0.8 | sAError=0.75674




target=pressure_obs | lr=0.00007 | factor=0.9 | sAError=0.78822




target=pressure_obs | lr=0.00005 | factor=0.3 | sAError=0.83780




target=pressure_obs | lr=0.00005 | factor=0.4 | sAError=0.78358




target=pressure_obs | lr=0.00005 | factor=0.5 | sAError=0.89449




target=pressure_obs | lr=0.00005 | factor=0.6 | sAError=0.79326




target=pressure_obs | lr=0.00005 | factor=0.7 | sAError=0.78097




target=pressure_obs | lr=0.00005 | factor=0.8 | sAError=0.80755




target=pressure_obs | lr=0.00005 | factor=0.9 | sAError=0.82047




target=pressure_obs | lr=0.00003 | factor=0.3 | sAError=0.69500




target=pressure_obs | lr=0.00003 | factor=0.4 | sAError=0.79786




target=pressure_obs | lr=0.00003 | factor=0.5 | sAError=0.76584




target=pressure_obs | lr=0.00003 | factor=0.6 | sAError=0.86658




target=pressure_obs | lr=0.00003 | factor=0.7 | sAError=0.74778




target=pressure_obs | lr=0.00003 | factor=0.8 | sAError=0.76845




target=pressure_obs | lr=0.00003 | factor=0.9 | sAError=0.79542




target=pressure_obs | lr=0.00001 | factor=0.3 | sAError=0.85633




target=pressure_obs | lr=0.00001 | factor=0.4 | sAError=0.76813




target=pressure_obs | lr=0.00001 | factor=0.5 | sAError=0.73355




target=pressure_obs | lr=0.00001 | factor=0.6 | sAError=0.82002




target=pressure_obs | lr=0.00001 | factor=0.7 | sAError=0.78314




target=pressure_obs | lr=0.00001 | factor=0.8 | sAError=0.74218




target=pressure_obs | lr=0.00001 | factor=0.9 | sAError=0.79467
