In [None]:
!pip install catboost

In [None]:
# Scratch cell: prints a fixed message (Korean for "Don't die now").
# NOTE(review): presumably a manual check that the runtime is still alive —
# confirm, or delete the cell from the final notebook.
print("지금 죽지 마")

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

scaler = StandardScaler()


# -------------------------------------------------
# 1. Load the CSV and split it by date (no row-count cap is applied)
# -------------------------------------------------
# Drop DataFrames left over from a previous run of this cell before loading the
# file again (presumably to lower peak memory). Each name is removed on its own
# and only NameError (name not defined yet, e.g. on a fresh kernel) is ignored,
# so a missing `df` no longer prevents `df_test` from being released and
# unrelated errors are not silently swallowed.
try:
    del df
except NameError:
    pass
try:
    del df_test
except NameError:
    pass

file_path = "/content/final_heatmap_lag_without_leakage.csv"
df_raw = pd.read_csv(file_path)

# Parse the timestamps so the 2021 hold-out can be separated by date.
df_raw['datetime'] = pd.to_datetime(df_raw['datetime'])
print(f"전체 로드된 데이터셋: {len(df_raw)}")
print(f"날짜 범위: {df_raw['datetime'].min()} ~ {df_raw['datetime'].max()}")

# Rows dated 2021-01-01 or later form the final test set; everything earlier
# is used for training and validation.
df_test = df_raw[df_raw['datetime'] >= '2021-01-01'].copy().reset_index(drop=True)
df = df_raw[df_raw['datetime'] < '2021-01-01'].copy().reset_index(drop=True)

# Discard the first 168 pre-2021 rows.
# NOTE(review): presumably the warm-up window of the lag features
# (168 = 7 * 24 hourly rows) — confirm against the feature pipeline.
df = df.iloc[168:].reset_index(drop=True)
print(f"전체 train&eval 데이터 개수: {len(df)}")
print(f"test 데이터셋 개수: {len(df_test)}")

# -------------------------------------------------
# 2. Target = first column, features = columns from index 2 onward
# -------------------------------------------------
# Column index 1 is deliberately left out of the feature matrix.
y, X = df.iloc[:, 0].values, df.iloc[:, 2:].values
y_2021, X_2021 = df_test.iloc[:, 0].values, df_test.iloc[:, 2:].values

# Report the shape of every array so a column/row mismatch is visible early.
for label, array in (("X", X), ("y", y), ("X_2021", X_2021), ("y_2021", y_2021)):
    print(f"{label} shape:", array.shape)

# -------------------------------------------------
# 3. Train/validation split (first quarter of the rows = validation)
# -------------------------------------------------
test_size = len(df) // 4

# The first `test_size` rows are held out; the remaining rows train the model.
X_test, y_test = X[:test_size], y[:test_size]
X_train, y_train = X[test_size:], y[test_size:]

# Fit the scaler on the training rows only, then apply that same transform to
# the training, validation and 2021 feature matrices.
scaler.fit(X_train)
X_train, X_test, X_2021 = map(scaler.transform, (X_train, X_test, X_2021))

# -------------------------------------------------
# 4. CatBoost model (RMSE objective, MAE monitoring, early stopping)
# -------------------------------------------------
catboost_params = dict(
    # Boosting budget and tree shape.
    iterations=5000,
    learning_rate=0.03,
    depth=8,
    # Trees are fitted with the RMSE loss; MAE is only the evaluation metric
    # reported on the eval set and used by the overfitting detector.
    loss_function='RMSE',
    eval_metric='MAE',
    # Reproducibility, L2 leaf regularisation and Bernoulli row subsampling.
    random_seed=42,
    l2_leaf_reg=3,
    subsample=0.8,
    bootstrap_type='Bernoulli',
    # Early stopping: give up once 200 iterations pass without a better
    # eval-metric value.
    od_type='Iter',
    od_wait=200,
    # Print a log line every 200 iterations.
    verbose=200,
)
model = CatBoostRegressor(**catboost_params)

# -------------------------------------------------
# 5. Fit
# -------------------------------------------------
# Wrap both splits in CatBoost Pools; the validation pool is the eval set that
# early stopping and best-model selection are measured against.
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test)

model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# -------------------------------------------------
# 6. Predict on the 2021 hold-out and report its MAE
# -------------------------------------------------
# X_2021 was already scaled in the split/scale step above.
y_pred = model.predict(X_2021)
mae = mean_absolute_error(y_2021, y_pred)

print("Sample predictions:", y_pred[:5])
print("MAE:", mae)


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
# Imported here as well so this cell does not depend on the CatBoost cell
# having been executed first (it uses both names further down).
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# -------------------------------------------------
# 1. Load the CSV and split it by date (no row-count cap is applied)
# -------------------------------------------------
# Drop DataFrames left over from a previous cell/run before loading the file
# again (presumably to lower peak memory). Each name is removed on its own and
# only NameError (name not defined yet) is ignored, so a missing `df` no longer
# prevents `df_test` from being released and other errors are not swallowed.
try:
    del df
except NameError:
    pass
try:
    del df_test
except NameError:
    pass

file_path = "/content/final_heatmap_lag_without_leakage.csv"
df_raw = pd.read_csv(file_path)

# Parse the timestamps so the 2021 hold-out can be separated by date.
df_raw['datetime'] = pd.to_datetime(df_raw['datetime'])
print(f"전체 로드된 데이터셋: {len(df_raw)}")
print(f"날짜 범위: {df_raw['datetime'].min()} ~ {df_raw['datetime'].max()}")

# Rows dated 2021-01-01 or later form the final test set; everything earlier
# is used for training and validation.
df_test = df_raw[df_raw['datetime'] >= '2021-01-01'].copy().reset_index(drop=True)
df = df_raw[df_raw['datetime'] < '2021-01-01'].copy().reset_index(drop=True)

# Discard the first 168 pre-2021 rows. This was `iloc[169:]`, which dropped one
# row too many and made this cell train on different data than the CatBoost
# cell, which uses 168.
df = df.iloc[168:].reset_index(drop=True)
print(f"전체 train&eval 데이터 개수: {len(df)}")
print(f"test 데이터셋 개수: {len(df_test)}")

# -------------------------------------------------
# 2. Target = first column, features = columns from index 2 onward
# -------------------------------------------------
# Column index 1 is deliberately left out of the feature matrix.
target_col, feature_cols = 0, slice(2, None)

y = df.iloc[:, target_col].values
X = df.iloc[:, feature_cols].values

y_2021 = df_test.iloc[:, target_col].values
X_2021 = df_test.iloc[:, feature_cols].values

# -------------------------------------------------
# 3. Train/validation split (first quarter of the rows = validation)
# -------------------------------------------------
test_size = len(df) // 4
X_test, X_train = X[:test_size], X[test_size:]
y_test, y_train = y[:test_size], y[test_size:]

# Learn the scaling statistics from the training rows only.
scaler2 = StandardScaler().fit(X_train)

# Apply the fitted scaler to the training and validation splits...
X_train, X_test = scaler2.transform(X_train), scaler2.transform(X_test)

# ...and to the 2021 features, kept under a separate name so the unscaled
# X_2021 stays available.
X_2021_scaled = scaler2.transform(X_2021)

# -------------------------------------------------
# 4. Build the LightGBM datasets
# -------------------------------------------------
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_test, label=y_test)

# -------------------------------------------------
# 5. Parameters
# -------------------------------------------------
# The model is trained with the squared-error (L2) objective; "l1" (MAE) is
# only the metric evaluated on the validation set.
params = dict(
    objective="regression_l2",
    metric="l1",
    learning_rate=0.03,
    num_leaves=64,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=2.0,
    random_state=32,
)

# -------------------------------------------------
# 6. Train (early stopping handled through callbacks)
# -------------------------------------------------
lgb_callbacks = [
    early_stopping(stopping_rounds=200),  # stop after 200 rounds without improvement
    log_evaluation(period=200),           # print the metric every 200 iterations
]

# Note: this rebinds `model`, which the CatBoost cell also assigns.
model = lgb.train(
    params,
    train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],
    callbacks=lgb_callbacks,
)

# -------------------------------------------------
# 7. Predict on the 2021 hold-out and report its MAE
# -------------------------------------------------
y_pred = model.predict(X_2021_scaled)
mae = mean_absolute_error(y_2021, y_pred)

print("Sample predictions:", y_pred[:5])
print("MAE:", mae)


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error  # 이미 위에 있으면 중복 import는 무시됨

# -------------------------------------------------
# Build the XGBoost DMatrix objects from the arrays prepared in the LightGBM
# cell (X_train, X_test, X_2021_scaled and the matching labels)
# -------------------------------------------------
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_test, label=y_test)
d2021 = xgb.DMatrix(data=X_2021_scaled)

# -------------------------------------------------
# XGBoost parameters (chosen to mirror the LightGBM settings)
# -------------------------------------------------
params_xgb = {
    # Squared-error training objective; MAE is the metric that is evaluated
    # and drives early stopping.
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    # Learning rate.
    "eta": 0.03,
    # Depth 6 allows up to 2**6 = 64 leaves per tree, comparable to num_leaves=64.
    "max_depth": 6,
    # Row and column subsampling (counterparts of bagging_fraction / feature_fraction).
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    # L2 regularisation weight.
    "lambda": 2.0,
    # Histogram-based tree construction for faster training.
    # NOTE(review): older XGBoost releases used "gpu_hist" for GPU training —
    # check the installed version before switching.
    "tree_method": "hist",
    "random_state": 32,
}

# -------------------------------------------------
# Train XGBoost with early stopping
# -------------------------------------------------
# Early stopping is evaluated on the last entry of `evals`, so the validation
# set must stay last in this list.
evals = [(dtrain, "train"), (dvalid, "valid")]

model_xgb = xgb.train(
    params_xgb,
    dtrain,
    num_boost_round=5000,       # upper bound on the number of trees
    evals=evals,
    early_stopping_rounds=200,  # stop after 200 rounds without a better MAE
    verbose_eval=200,           # print the metrics every 200 iterations
)

# -------------------------------------------------
# Predict on the 2021 hold-out and report its MAE
# -------------------------------------------------
# Only use the trees up to and including the best iteration found by early
# stopping (iteration_range has an exclusive upper bound).
best_rounds = model_xgb.best_iteration + 1
y_pred_2021_xgb = model_xgb.predict(d2021, iteration_range=(0, best_rounds))
mae_xgb = mean_absolute_error(y_2021, y_pred_2021_xgb)

print("Sample XGBoost predictions:", y_pred_2021_xgb[:5])
print("XGBoost MAE:", mae_xgb)
