# In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
############### Set the paths below before running this code. ####################
#--------------------------------------------------------
#                <Training data paths>
#--------------------------------------------------------

# Weather observation data (train)
obs_path = './데이터_분석과제_7_기상관측데이터_2401_2503.csv'
# Weather forecast data (train)
pred_path = './데이터_분석과제_7_기상예측데이터_2401_2503.csv'



#--------------------------------------------------------
#               <Validation (test) data path>
#--------------------------------------------------------

# Weather forecast data (test)
test_input_path = './pred_dummy.csv'

#################################################################################


#────────────────────────────
# 1. Fix the random seeds
torch.manual_seed(42)
np.random.seed(42)

#────────────────────────────
# 2. Load and preprocess the data (observations + forecasts)
test_input = f'{test_input_path}'

# Read each CSV and map its raw headers onto the internal column names.
obs = pd.read_csv(obs_path).rename(columns={
    '기상관측일시': 'datetime',
    '습도(%)': 'humidity_obs',
    '기온(degC)': 'temp_obs',
    '대기압(mmHg)': 'pressure_obs',
})
pred = pd.read_csv(pred_path).rename(columns={
    '기상관측일시': 'datetime',
    '습도(%)': 'humidity_pred',
    '기온(degC)': 'temp_pred',
    '대기압(hPa)': 'pressure_pred',
    '절대습도': 'absolute_humidity_pred',
    '일사량(w/m^2)': 'solar_rad_pred',
})

# Unify units (pressure: mmHg -> hPa) so observations match the forecasts.
obs['pressure_obs'] = obs['pressure_obs'].mul(1.33322)
for _frame in (obs, pred):
    _frame['datetime'] = pd.to_datetime(_frame['datetime'])

# Keep only timestamps present in both tables (inner join on 'datetime').
df = obs.merge(pred, on='datetime')

# Error columns: forecast minus observation.
for _quantity in ('temp', 'humidity', 'pressure'):
    df[f'{_quantity}_error'] = df[f'{_quantity}_pred'] - df[f'{_quantity}_obs']

# Calendar features derived from the timestamp.
_clock = df['datetime'].dt
df['hour'] = _clock.hour
df['month'] = _clock.month
df['day'] = _clock.day
df['year'] = _clock.year

# Cyclic sin/cos encodings of hour (period 24) and month (period 12).
for _unit, _period in (('hour', 24), ('month', 12)):
    _angle = 2 * np.pi * df[_unit] / _period
    df[f'{_unit}_sin'] = np.sin(_angle)
    df[f'{_unit}_cos'] = np.cos(_angle)

#────────────────────────────
# 3. 이상치 제거 (train 구간만)
def get_outlier_indices(series):
    """Return the index labels of values outside the 1.5 * IQR fences.

    Args:
        series: pandas Series of numeric values.

    Returns:
        The index of the entries strictly below ``Q1 - 1.5 * IQR`` or strictly
        above ``Q3 + 1.5 * IQR``.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    is_outlier = (series < low_fence) | (series > high_fence)
    return series[is_outlier].index

# Inclusive calendar-day ranges (first day, last day) for each split.
train_range = ("2024-01-01", "2024-12-31")
val_range   = ("2025-01-01", "2025-03-31")


def _select_date_range(frame, date_range):
    """Return the rows of ``frame`` whose 'datetime' falls in an inclusive day range.

    Args:
        frame: DataFrame with a datetime-typed 'datetime' column.
        date_range: ``(first_day, last_day)`` pair of date strings.

    Returns:
        A new DataFrame holding every row from 00:00 of ``first_day`` up to,
        but not including, 00:00 of the day after ``last_day``.
    """
    first_day, last_day = (pd.Timestamp(day) for day in date_range)
    # A bare date parses to midnight, so comparing with `<= last_day` would
    # keep only the 00:00 row of the final day. Use a half-open upper bound
    # one day later so the whole last day is included.
    upper_bound = last_day.normalize() + pd.Timedelta(days=1)
    stamps = frame['datetime']
    return frame[(stamps >= first_day) & (stamps < upper_bound)]


train_full = _select_date_range(df, train_range)

# Collect every training row that is an IQR outlier in any error column.
out_idx = set()
for _error_col in ('pressure_error', 'temp_error', 'humidity_error'):
    out_idx |= set(get_outlier_indices(train_full[_error_col]))

# Outliers are removed from the training rows only; validation rows are kept as-is.
drop_out = train_full.drop(index=sorted(out_idx)).reset_index(drop=True)
val = _select_date_range(df, val_range).reset_index(drop=True)

#────────────────────────────
# 4. Feature set
input_cols = (
    # forecast variables
    ["temp_pred", "humidity_pred", "pressure_pred",
     "absolute_humidity_pred", "solar_rad_pred"]
    # cyclic time encodings
    + ["hour_sin", "hour_cos", "month_sin", "month_cos"]
    # raw calendar fields
    + ["month", "day", "hour"]
)
#────────────────────────────
# 5. Dataset & Model 정의
class ForecastDataset(Dataset):
    """Dataset pairing feature rows with target rows as float32 tensors."""

    def __init__(self, X, y):
        # Both inputs are copied into float32 tensors once, up front.
        self.X, self.y = (
            torch.tensor(X, dtype=torch.float32),
            torch.tensor(y, dtype=torch.float32),
        )

    def __len__(self):
        """Number of samples, taken from the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

class ForecastMLP(nn.Module):
    """Three-layer fully connected network with ReLU activations.

    Layer widths are ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=1):
        super().__init__()
        bottleneck_dim = hidden_dim // 2
        # The layers are created in forward order and stored under `net`.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Linear(bottleneck_dim, output_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.net(x)

class RMSE_MAE_Loss(nn.Module):
    """Loss equal to the average of RMSE and MAE over all elements."""

    def forward(self, pred, target):
        """Return ``(RMSE + MAE) / 2`` between ``pred`` and ``target``."""
        residual = pred - target
        root_mean_square = residual.pow(2).mean().sqrt()
        mean_absolute = residual.abs().mean()
        return (root_mean_square + mean_absolute) / 2

def evaluate_single_target(y_true, y_pred, scaler_y):
    """Score predictions as the average of RMSE and MAE.

    Args:
        y_true: array-like of reference values.
        y_pred: array-like of predicted values, same shape as ``y_true``.
        scaler_y: object exposing ``inverse_transform``; both inputs are passed
            through it before scoring. ``None`` means the inputs are scored
            as given.

    Returns:
        ``(RMSE + MAE) / 2`` computed over all elements.
    """
    # `train_model` defaults scaler_y to None and forwards it unchanged, so
    # None must be accepted here instead of failing on `.inverse_transform`.
    if scaler_y is not None:
        y_true = scaler_y.inverse_transform(y_true)
        y_pred = scaler_y.inverse_transform(y_pred)
    error = np.asarray(y_true) - np.asarray(y_pred)
    rmse = np.sqrt(np.mean(error ** 2))
    mae = np.mean(np.abs(error))
    return (rmse + mae) / 2

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                scaler_y=None, num_epochs=1000, patience=50, verbose=True):
    """Train ``model`` with early stopping and return the best weights seen.

    Every epoch performs one optimisation pass over ``train_loader`` and then
    scores the model on ``val_loader`` with ``evaluate_single_target``. That
    validation score is fed to ``scheduler.step`` and drives early stopping.

    Args:
        model: network to train; updated in place.
        train_loader: iterable of ``(features, target)`` training batches.
        val_loader: iterable of ``(features, target)`` validation batches.
        criterion: callable ``criterion(pred, target)`` returning the loss.
        optimizer: optimizer that updates ``model``'s parameters.
        scheduler: scheduler whose ``step`` takes the validation score.
        scaler_y: forwarded unchanged to ``evaluate_single_target``.
        num_epochs: maximum number of epochs.
        patience: number of consecutive non-improving epochs after which
            training stops.
        verbose: accepted for interface compatibility; not used.

    Returns:
        A deep copy of ``model.state_dict()`` taken at the epoch with the
        lowest validation score, or ``None`` if no epoch ever scored below
        the initial value of infinity.
    """
    import copy  # local import keeps this block self-contained

    best_val_score = float("inf")
    best_model_state = None
    epochs_no_improve = 0
    for _epoch in range(1, num_epochs + 1):
        # Training pass
        model.train()
        for xb, yb in train_loader:
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation pass
        model.eval()
        val_preds, val_trues = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                pred = model(xb)
                val_preds.append(pred.numpy())
                val_trues.append(yb.numpy())
        val_score = evaluate_single_target(np.vstack(val_trues), np.vstack(val_preds), scaler_y)
        scheduler.step(val_score)

        if val_score < best_val_score:
            best_val_score = val_score
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Deep-copy so the
            # snapshot really holds this epoch's weights.
            best_model_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                break
    return best_model_state

def run_forecast_pipeline(target_col, lr, sched_patience, factor,
                          hidden_dim=128, train_patience=50, batch_size=64):
    """Fit one MLP for ``target_col`` and return it with its fitted scalers.

    Training rows come from the module-level ``drop_out`` frame and validation
    rows from ``val``; the features are the ``input_cols`` columns.

    Args:
        target_col: name of the column to predict.
        lr: learning rate for Adam.
        sched_patience: ``patience`` passed to ``ReduceLROnPlateau``.
        factor: ``factor`` passed to ``ReduceLROnPlateau``.
        hidden_dim: width of the first hidden layer of ``ForecastMLP``.
        train_patience: early-stopping patience passed to ``train_model``.
        batch_size: batch size used by both data loaders.

    Returns:
        ``(model, scaler_X, scaler_y)``: the model in eval mode, loaded with
        the state returned by ``train_model``, plus the feature and target
        scalers that were fitted on the training rows.
    """
    # 1. Feature / target frames for each split
    features_train = drop_out[input_cols].copy()
    features_val = val[input_cols].copy()
    target_train = drop_out[[target_col]].copy()
    target_val = val[[target_col]].copy()

    # 2. Standardise: fit on the training rows, reuse on validation
    scaler_X, scaler_y = StandardScaler(), StandardScaler()
    features_train = scaler_X.fit_transform(features_train)
    features_val = scaler_X.transform(features_val)
    target_train = scaler_y.fit_transform(target_train)
    target_val = scaler_y.transform(target_val)

    # 3. Data loaders (only the training loader shuffles)
    train_loader = DataLoader(
        ForecastDataset(features_train, target_train),
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        ForecastDataset(features_val, target_val),
        batch_size=batch_size,
    )

    # 4. Model, loss, optimizer and LR scheduler
    model = ForecastMLP(input_dim=len(input_cols), hidden_dim=hidden_dim, output_dim=1)
    criterion = RMSE_MAE_Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', patience=sched_patience, factor=factor,
    )

    # 5. Train, then restore the state returned by train_model
    best_state = train_model(
        model, train_loader, val_loader, criterion, optimizer, scheduler,
        scaler_y=scaler_y, patience=train_patience,
    )
    model.load_state_dict(best_state)
    model.eval()

    # 6. Hand back the scalers too so inference can reuse them
    return model, scaler_X, scaler_y

#────────────────────────────
# 6. [★제출용★] 예측 데이터 기반 추론 (관측데이터 X, test set only)
def make_submission(test_pred_path, output_csv_path):
    """Train the three target models, predict on the test forecasts, write a CSV.

    Args:
        test_pred_path: path of the forecast CSV to run inference on.
        output_csv_path: path the submission CSV is written to.

    The output has the columns ``datetime``, ``temp``, ``humidity`` and
    ``pressure``; the pressure predictions are divided by 1.33322 (hPa -> mmHg)
    before being written.
    """
    # 1. Train one model per target, in the order temp -> humidity -> pressure
    training_specs = (
        ("temp", "temp_obs", dict(lr=0.0001, sched_patience=20, factor=0.3)),
        ("humidity", "humidity_obs", dict(lr=0.00001, sched_patience=20, factor=0.9)),
        ("pressure", "pressure_obs", dict(lr=0.00003, sched_patience=20, factor=0.4)),
    )
    fitted = {
        label: run_forecast_pipeline(target, **hyper)
        for label, target, hyper in training_specs
    }

    # 2. Load the test forecasts and build the same derived features
    test_pred = pd.read_csv(test_pred_path).rename(columns={
        '기상관측일시': 'datetime',
        '습도(%)': 'humidity_pred',
        '기온(degC)': 'temp_pred',
        '대기압(hPa)': 'pressure_pred',
        '절대습도': 'absolute_humidity_pred',
        '일사량(w/m^2)': 'solar_rad_pred',
    })
    test_pred['datetime'] = pd.to_datetime(test_pred['datetime'])
    clock = test_pred['datetime'].dt
    test_pred['hour'] = clock.hour
    test_pred['month'] = clock.month
    test_pred['day'] = clock.day
    test_pred['year'] = clock.year
    for unit, period in (('hour', 24), ('month', 12)):
        angle = 2 * np.pi * test_pred[unit] / period
        test_pred[f'{unit}_sin'] = np.sin(angle)
        test_pred[f'{unit}_cos'] = np.cos(angle)

    # 3. Predict: scale features, run the model, undo the target scaling
    def infer(label):
        model, scaler_in, scaler_out = fitted[label]
        scaled = scaler_in.transform(test_pred[input_cols])
        batch = torch.tensor(scaled, dtype=torch.float32)
        with torch.no_grad():
            raw = model(batch).numpy()
        return scaler_out.inverse_transform(raw).flatten()

    temp_pred = infer("temp")
    humi_pred = infer("humidity")
    pres_pred = infer("pressure")
    # pressure: hPa -> mmHg (required by the submission format)
    pres_pred_mmHg = pres_pred / 1.33322

    # 4. Write the submission file
    submit = pd.DataFrame({
        "datetime": test_pred["datetime"],
        "temp": temp_pred,
        "humidity": humi_pred,
        "pressure": pres_pred_mmHg,
    })
    submit.to_csv(output_csv_path, index=False)
    print(f"✅ 제출 파일 저장 완료: {output_csv_path}")

# Train the models and write the submission for the configured test file.
make_submission(test_pred_path=f'{test_input}', output_csv_path='./mlp_output.csv')

# Output: ✅ 제출 파일 저장 완료: ./mlp_output.csv
