In [3]:
import pandas as pd, numpy as np, xgboost as xgb
from tqdm import tqdm
# ==============================================================================
# Set the paths below before running this script.
# ==============================================================================

# ---- Training data -----------------------------------------------------------
# Weather observation data (train)
obs_path = "./데이터_분석과제_7_기상관측데이터_2401_2503.csv"
# Weather forecast data (train)
pred_path = "./데이터_분석과제_7_기상예측데이터_2401_2503.csv"

# ---- Test data ---------------------------------------------------------------
# Weather forecast data used for inference (test)
test_input_path = "./pred_dummy.csv"

# ==============================================================================
# Load the raw CSVs and map their Korean headers onto the English column
# names used by the rest of the script.
obs = pd.read_csv(obs_path).rename(columns={
    '기상관측일시': 'datetime',
    '습도(%)': 'humidity_obs',
    '기온(degC)': 'temp_obs',
    '대기압(mmHg)': 'pressure_obs',
})
pred = pd.read_csv(pred_path).rename(columns={
    '기상관측일시': 'datetime',
    '습도(%)': 'humidity_pred',
    '기온(degC)': 'temp_pred',
    '대기압(hPa)': 'pressure_pred',
    '절대습도': 'absolute_humidity_pred',
    '일사량(w/m^2)': 'solar_rad_pred',
})

# Parse the timestamps so both sources can be joined and decomposed.
obs['datetime'] = pd.to_datetime(obs['datetime'])
pred['datetime'] = pd.to_datetime(pred['datetime'])

# Unify units: observed pressure is in mmHg, the forecast is in hPa.
obs['pressure_obs'] = obs['pressure_obs'].mul(1.33322)

# Pair observation and forecast rows that share a timestamp.
df = obs.merge(pred, on='datetime')

# Error targets: forecast minus observation.
# Column order matters here because the feature list is later derived from
# df.columns, so new columns are appended in the same order as before.
df = df.assign(
    temp_error=lambda d: d['temp_pred'] - d['temp_obs'],
    humidity_error=lambda d: d['humidity_pred'] - d['humidity_obs'],
    pressure_error=lambda d: d['pressure_pred'] - d['pressure_obs'],
)

# Calendar features, plus cyclic (sin/cos) encodings of hour and month so
# that 23h/0h and December/January end up close to each other.
df = df.assign(
    hour=lambda d: d['datetime'].dt.hour,
    month=lambda d: d['datetime'].dt.month,
    day=lambda d: d['datetime'].dt.day,
    year=lambda d: d['datetime'].dt.year,
    hour_sin=lambda d: np.sin(2 * np.pi * d['hour'] / 24),
    hour_cos=lambda d: np.cos(2 * np.pi * d['hour'] / 24),
    month_sin=lambda d: np.sin(2 * np.pi * d['month'] / 12),
    month_cos=lambda d: np.cos(2 * np.pi * d['month'] / 12),
)

#──────────────────────────────────────────────────────────────
# 3. 이상치 제거 (train만 적용)
def get_outlier_indices(series):
    """Return the index labels of values outside the 1.5 * IQR fences.

    Args:
        series: Numeric pandas Series to screen.

    Returns:
        The pandas Index of the entries that are strictly below
        ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
        NaN entries compare False on both sides and are never flagged.
    """
    first_q, third_q = series.quantile(0.25), series.quantile(0.75)
    margin = 1.5 * (third_q - first_q)
    too_low = series.lt(first_q - margin)
    too_high = series.gt(third_q + margin)
    return series.loc[too_low | too_high].index

# Calendar-day ranges (both ends inclusive) for the training and validation
# periods.
train_range = ("2024-01-01", "2024-12-31")
val_range   = ("2025-01-01", "2025-03-31")


def _rows_in_period(frame, period):
    """Return the rows of *frame* whose ``datetime`` lies inside *period*.

    Args:
        frame: DataFrame with a datetime64 ``datetime`` column.
        period: ``(first_day, last_day)`` pair of date strings.

    Returns:
        The matching rows, keeping their original index labels.

    The last day is included in full. Comparing ``datetime <= last_day``
    directly would stop at midnight and silently drop every later row of
    that day, so a half-open interval ending at the next day is used.
    """
    first_day = pd.Timestamp(period[0]).normalize()
    day_after_last = pd.Timestamp(period[1]).normalize() + pd.Timedelta(days=1)
    stamps = frame['datetime']
    return frame[(stamps >= first_day) & (stamps < day_after_last)]


# Outliers are detected on, and removed from, the training period only.
train_full = _rows_in_period(df, train_range)
out_idx = set().union(*(
    get_outlier_indices(train_full[col])
    for col in ('pressure_error', 'temp_error', 'humidity_error')
))

# train_full keeps df's index labels, so the collected labels can be dropped
# directly before renumbering.
drop_out = train_full.drop(index=out_idx).reset_index(drop=True)
val = _rows_in_period(df, val_range).reset_index(drop=True)

#──────────────────────────────────────────────────────────────
# 4. 피처셋
# The targets are the forecast errors. They, the raw observations and the
# timestamp are excluded from the model inputs.
targets = ["humidity_error", "pressure_error", "temp_error"]
drop_cols = ["datetime", "humidity_obs", "pressure_obs", "temp_obs", *targets]
all_feats = [col for col in df.columns if col not in drop_cols]

# Cyclic time encodings, which can be left out per target.
sincos = ["hour_sin", "hour_cos", "month_sin", "month_cos"]

# Named feature subsets.
feature_sets = dict(
    all_features=all_feats,
    drop_sincos=[col for col in all_feats if col not in sincos],
)

# Which feature subset each target's model is trained and evaluated on.
# The insertion order here is also the training / prediction order.
target_fs_map = dict(
    humidity_error="all_features",
    temp_error="all_features",
    pressure_error="drop_sincos",
)

#──────────────────────────────────────────────────────────────
# 5. 확정 파라미터 (Grid Search 결과)
# Fixed hyper-parameters, one set per target model.
# "n_estimators" is read back later as the number of boosting rounds.
best_params = {
    "humidity_error": {
        "max_depth": 2,
        "learning_rate": 0.05,
        "subsample": 0.4,
        "colsample_bytree": 1.0,
        "n_estimators": 100,
        "reg_lambda": 0,
        "reg_alpha": 0.0,
    },
    "temp_error": {
        "max_depth": 4,
        "learning_rate": 0.10,
        "subsample": 0.4,
        "colsample_bytree": 0.8,
        "n_estimators": 100,
        "reg_lambda": 10,
        "reg_alpha": 0.5,
    },
    "pressure_error": {
        "max_depth": 3,
        "learning_rate": 0.05,
        "subsample": 0.4,
        "colsample_bytree": 1.0,
        "n_estimators": 100,
        "reg_lambda": 0,
        "reg_alpha": 0.0,
    },
}

# Tree construction method and predictor passed to every model.
# NOTE(review): both values select GPU execution; confirm the installed
# XGBoost build and version still accept them (newer releases appear to
# configure this through a `device` parameter instead).
tree_m, pred_m = "gpu_hist", "gpu_predictor"

#──────────────────────────────────────────────────────────────
# 6. 모델 학습 (TRAIN만 사용)
# Train one booster per target on the outlier-filtered training rows and keep
# them keyed by target name for the inference step.
final_models = {}
for tgt, fs_name in target_fs_map.items():
    feats = feature_sets[fs_name]

    # "n_estimators" is the scikit-learn wrapper's name for the number of
    # boosting rounds. The native xgb.train API takes that count through
    # num_boost_round, so the key is kept out of the booster parameters
    # rather than passed along as an unrecognised parameter.
    booster_params = {
        key: value
        for key, value in best_params[tgt].items()
        if key != "n_estimators"
    }
    booster_params.update(tree_method=tree_m, predictor=pred_m, verbosity=0)

    dtr = xgb.DMatrix(drop_out[feats], label=drop_out[tgt])
    bst = xgb.train(booster_params, dtr,
                    num_boost_round=best_params[tgt]["n_estimators"])
    final_models[tgt] = bst
    print(f"✅ Finished {tgt}")

#──────────────────────────────────────────────────────────────
# 7. [★제출용★] 예측 데이터 기반 추론 (관측데이터 X, test set only)
def make_submission(test_pred_path, output_csv_path):
    """Predict observations from a forecast-only CSV and write the submission.

    The forecast CSV is loaded, given the same time-derived columns as the
    training frame, and passed through each trained error model. Since the
    models predict ``forecast - observation``, the observation estimate is
    ``forecast - predicted_error``.

    Args:
        test_pred_path: Path of the forecast CSV (same Korean headers as the
            training forecast file).
        output_csv_path: Destination path of the submission CSV.

    Returns:
        None. Writes a CSV with the columns ``datetime``, ``temp``,
        ``humidity`` and ``pressure`` (pressure in mmHg) and prints a
        confirmation line.

    Uses the module-level ``target_fs_map``, ``feature_sets`` and
    ``final_models``.
    """
    # 1. Load the forecast data and switch to the English column names.
    header_map = {
        '기상관측일시': 'datetime',
        '습도(%)': 'humidity_pred',
        '기온(degC)': 'temp_pred',
        '대기압(hPa)': 'pressure_pred',
        '절대습도': 'absolute_humidity_pred',
        '일사량(w/m^2)': 'solar_rad_pred',
    }
    forecast = pd.read_csv(test_pred_path).rename(columns=header_map)
    forecast['datetime'] = pd.to_datetime(forecast['datetime'])

    # 2. Derived time features (calendar parts and cyclic encodings).
    forecast = forecast.assign(
        hour=lambda d: d['datetime'].dt.hour,
        month=lambda d: d['datetime'].dt.month,
        day=lambda d: d['datetime'].dt.day,
        year=lambda d: d['datetime'].dt.year,
        hour_sin=lambda d: np.sin(2 * np.pi * d['hour'] / 24),
        hour_cos=lambda d: np.cos(2 * np.pi * d['hour'] / 24),
        month_sin=lambda d: np.sin(2 * np.pi * d['month'] / 12),
        month_cos=lambda d: np.cos(2 * np.pi * d['month'] / 12),
    )

    # 3. Predict the forecast error per target with that target's feature set.
    predicted_error = {}
    for tgt, fs_name in target_fs_map.items():
        inputs = xgb.DMatrix(forecast[feature_sets[fs_name]])
        predicted_error[tgt] = final_models[tgt].predict(inputs)

    # 4. Recover observation estimates and build the submission:
    #    datetime, temp, humidity, pressure (converted back from hPa to mmHg).
    pressure_hpa = forecast["pressure_pred"] - predicted_error["pressure_error"]
    submit = pd.DataFrame({
        "datetime": forecast["datetime"],
        "temp": forecast["temp_pred"] - predicted_error["temp_error"],
        "humidity": forecast["humidity_pred"] - predicted_error["humidity_error"],
        "pressure": pressure_hpa / 1.33322,
    })
    submit.to_csv(output_csv_path, index=False)
    print(f"✅ 제출 파일 저장 완료: {output_csv_path}")


# Run inference on the configured test forecast file and write the submission.
make_submission(test_input_path, "./xgboost_output.csv")


✅ Finished humidity_error
✅ Finished temp_error
✅ Finished pressure_error
✅ 제출 파일 저장 완료: ./xgboost_output.csv
