In [None]:
import joblib
import pandas as pd

# Load the four pickled artifacts from ../output in a fixed order
# (presumably the train/validation feature and target splits written by an
# earlier step -- confirm against the producing notebook).
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
X_train, y_train, X_val, y_val = map(
    joblib.load,
    (
        "../output/X_train.pkl",
        "../output/y_train.pkl",
        "../output/X_val.pkl",
        "../output/y_val.pkl",
    ),
)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Drop base_date (if present). Rebinding to a new frame instead of dropping
# in place keeps the objects returned by joblib.load untouched.
X_train = X_train.drop(columns=['base_date'], errors='ignore')
X_val = X_val.drop(columns=['base_date'], errors='ignore')

# Label-encode every object-dtype column; encoders are fitted on the training
# data only.
obj_cols = X_train.select_dtypes(include=['object']).columns

# Fitted encoders keyed by column name, kept so the exact same mapping can be
# reused on other data later in the session.
label_encoders = {}

for col in obj_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # LabelEncoder.transform raises ValueError on labels it did not see during
    # fit. Encode such validation rows as -1 instead of aborting the notebook.
    seen = X_val[col].isin(le.classes_).to_numpy()
    val_codes = np.full(len(X_val), -1, dtype=np.int64)
    if seen.any():
        val_codes[seen] = le.transform(X_val.loc[seen, col])
    X_val[col] = val_codes

    label_encoders[col] = le


In [None]:
# XGBoost: fit on the training data, then report MAE on the validation data
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=42,
)
xgb.fit(X_train, y_train)

xgb_preds = xgb.predict(X_val)
mae_xgb = mean_absolute_error(y_val, xgb_preds)

print(" XGBoost MAE:", round(mae_xgb, 4))

In [None]:
# LightGBM: same hyper-parameters as the XGBoost cell, scored with the same metric
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=42,
)
lgbm.fit(X_train, y_train)

lgbm_preds = lgbm.predict(X_val)
mae_lgbm = mean_absolute_error(y_val, lgbm_preds)

print("LightGBM MAE:", round(mae_lgbm, 4))

In [None]:
# Random Forest & ExtraTrees, trained with identical settings
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Both ensembles share exactly the same constructor arguments.
forest_params = dict(
    n_estimators=40,
    min_samples_leaf=10,
    min_samples_split=10,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

rf = RandomForestRegressor(**forest_params)  # RandomForest
et = ExtraTreesRegressor(**forest_params)    # ExtraTrees

# Fit both models first, then predict, so the verbose log output keeps its order.
rf.fit(X_train, y_train)
et.fit(X_train, y_train)

rf_preds, et_preds = rf.predict(X_val), et.predict(X_val)

# Validation MAE for each model, in (rf, et) order.
mae_rf, mae_et = (
    mean_absolute_error(y_val, model_preds)
    for model_preds in (rf_preds, et_preds)
)

print("RandomForest MAE:", round(mae_rf, 4))
print("ExtraTrees MAE:", round(mae_et, 4))

In [None]:
# Compare results: one line per model, validation MAE to four decimals
print(
    *(
        f"{label}{score:.4f}"
        for label, score in (
            ("XGBoost MAE:     ", mae_xgb),
            ("LightGBM MAE:    ", mae_lgbm),
            ("RandomForest MAE:", mae_rf),
            ("ExtraTrees MAE:  ", mae_et),
        )
    ),
    sep="\n",
)

In [None]:
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager

# Validation MAE per model, in the order the bars are drawn.
mae_scores = {
    "XGBoost": mae_xgb,
    "LightGBM": mae_lgbm,
    "RandomForest": mae_rf,
    "ExtraTrees": mae_et,
}

# Apply the seaborn style first: it resets font-related rcParams, so the font
# family must be assigned afterwards.
sns.set_style("whitegrid")
plt.rcParams['axes.unicode_minus'] = False

# The chart title is Korean, so a Hangul-capable font is required.
# "AppleGothic" ships only with macOS; fall back to common Windows / Linux
# Korean fonts and pick the first one that is actually installed.
korean_font_candidates = ("AppleGothic", "Malgun Gothic", "NanumGothic")
installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
korean_font = next(
    (name for name in korean_font_candidates if name in installed_fonts),
    None,
)
if korean_font is not None:
    plt.rcParams["font.family"] = korean_font
else:
    warnings.warn(
        "No Korean-capable font found "
        f"(tried {', '.join(korean_font_candidates)}); "
        "the plot title may not render correctly."
    )

plt.figure(figsize=(8, 5))
# Materialise the dict views as lists before handing them to matplotlib.
plt.bar(list(mae_scores.keys()), list(mae_scores.values()), color='skyblue')
plt.title("모델별 MAE 비교")
plt.ylabel("Mean Absolute Error")
plt.ylim(0, max(mae_scores.values()) + 1)
plt.xticks(rotation=15)
plt.show()

In [None]:
import pandas as pd
import joblib

# Mean-prediction baseline: every test row gets the mean of y_train
mean_target = y_train.mean()

test_ids = pd.read_csv("../data/test.csv")["id"]
preds_baseline = [mean_target for _ in range(len(test_ids))]

# Assemble the id/target table and write it out as CSV
baseline_df = pd.DataFrame(dict(id=test_ids, target=preds_baseline))
baseline_df.to_csv("../output/final_prediction_baseline.csv", index=False)

print("baseline 예측 결과 저장 완료")


In [None]:
# Bind best_model to the RandomForest regressor (rf) fitted earlier in this notebook.
# NOTE(review): the choice is hard-coded, not derived from the MAE values
# computed above -- confirm that rf is the intended model.
best_model = rf

In [None]:
import joblib
import os

# Save the list of feature columns used for training: the column names of
# X_train, in their current order, as a plain Python list.
feature_columns = X_train.columns.tolist()

# Persist the list to ../output/feature_columns.pkl.
joblib.dump(feature_columns, "../output/feature_columns.pkl")

In [None]:
import joblib

# Persist the fitted RandomForest regressor (rf) to ../output/rf_model.pkl.
joblib.dump(rf, "../output/rf_model.pkl")

In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

rf_model = joblib.load("../output/rf_model.pkl")
test_X = joblib.load("../output/test_X.pkl")
# Column names (and order) the model was trained on, saved earlier in this notebook.
feature_columns = joblib.load("../output/feature_columns.pkl")

# Drop columns that are not needed for prediction
test_X = test_X.drop(columns=["base_date"], errors="ignore")

# Encode categorical columns with the SAME mapping that was used for training.
# pd.factorize numbers categories by order of first appearance in the test
# set, which does not match the LabelEncoder codes (sorted classes) the model
# was fitted on. The pickled X_train on disk is still un-encoded, so the
# training-time encoders can be rebuilt from it.
train_raw = joblib.load("../output/X_train.pkl").drop(
    columns=["base_date"], errors="ignore"
)
for col in train_raw.select_dtypes(include=["object"]).columns:
    if col not in test_X.columns:
        continue  # reported by the missing-column check below
    encoder = LabelEncoder().fit(train_raw[col])
    # Categories never seen in training get -1 instead of raising ValueError.
    seen = test_X[col].isin(encoder.classes_).to_numpy()
    codes = np.full(len(test_X), -1, dtype=np.int64)
    if seen.any():
        codes[seen] = encoder.transform(test_X.loc[seen, col])
    test_X[col] = codes

# Align the test columns (set and order) with the training features.
missing_columns = [c for c in feature_columns if c not in test_X.columns]
if missing_columns:
    raise ValueError(
        f"test_X is missing training feature columns: {missing_columns}"
    )
test_X = test_X[feature_columns]

preds = rf_model.predict(test_X)

# Save the predictions together with their ids
test_ids = pd.read_csv("../data/test.csv")["id"]
if len(test_ids) != len(preds):
    raise ValueError(
        f"Row count mismatch: {len(test_ids)} ids in test.csv "
        f"vs {len(preds)} predictions"
    )
final_df = pd.DataFrame({
    "id": test_ids,
    "target": preds
})

final_df.to_csv("../output/final_prediction_rf.csv", index=False)
print("최종 예측 결과 저장 완료!")