In [None]:
import pandas as pd
import numpy as np

# Directory holding the input CSV files. It is used as an f-string prefix
# (f"{DATA_PATH}file.csv"), so it must keep its trailing slash.
DATA_PATH = "LoLesports_data/"
# Notebook-wide random seed passed to the estimators configured below.
SEED = 42

In [None]:
# Feature tables (train / test) and the matching target tables.
# NOTE(review): later cells filter the feature and target tables separately by
# gameid and then pair them up by position, so both are assumed to list their
# rows in the same order — confirm.
teams_train = pd.read_csv(f"{DATA_PATH}TEST88_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}TEST88_test.csv")
teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")

In [None]:
# Label columns: `target_train` for the training table, `target` for the test table.
target_train = teams_train_target['result']
target = teams_test_target['result']

In [None]:
# Side encoding: Blue -> 0, Red -> 1. Any value outside the mapping becomes NaN,
# so this cell must run exactly once per freshly loaded frame.
side_to_code = {"Blue": 0, "Red": 1}
for _side_frame in (teams_train, teams_test):
    _side_frame["side"] = _side_frame["side"].map(side_to_code)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Snapshot both frames before preprocess() runs, so these copies keep the
# champion, league and team columns un-encoded.
# NOTE(review): presumably intended for a model with native categorical
# support; in the visible cells they are only scaled and stripped of gameid —
# confirm they are still needed.
cat_train_ft = teams_train.copy()
cat_test_ft = teams_test.copy()

def preprocess(teams_train, teams_test):
    """Encode the categorical columns of the train and test team tables.

    Three encodings are applied:

    * the ten champion columns (``ban1``-``ban5``, ``pick1``-``pick5``) are
      label-encoded with a single vocabulary fitted on train and test together;
    * ``league`` is one-hot encoded with an encoder fitted on the training
      rows only, and the original column is dropped;
    * ``teamname`` / ``opp_teamname`` are label-encoded with a single
      vocabulary fitted on both columns of both frames.

    Parameters
    ----------
    teams_train, teams_test : pandas.DataFrame
        Frames containing the columns named above. Both are copied first, so
        the caller's objects are never modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(teams_train, teams_test)`` with the encodings applied. The one-hot
        ``league_*`` columns are appended after the existing columns.

    Notes
    -----
    Missing champion values are excluded from the champion vocabulary, so a
    missing value in a champion column is rejected by ``LabelEncoder.transform``.
    """
    # Work on copies: the column assignments below would otherwise write into
    # the frames owned by the caller.
    teams_train = teams_train.copy()
    teams_test = teams_test.copy()

    # Champion label encoding
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5']

    champions = pd.concat([
        teams_train[champion_columns_teams],
        teams_test[champion_columns_teams],
    ]).stack().unique()
    # Keep missing entries out of the vocabulary regardless of how stack()
    # treated them.
    champions = pd.Series(champions).dropna()

    le = LabelEncoder()
    le.fit(champions)

    for col in champion_columns_teams:
        teams_train[col] = le.transform(teams_train[col])
        teams_test[col] = le.transform(teams_test[col])

    # League one-hot encoding
    # handle_unknown="ignore": a league that appears only in the test frame is
    # encoded as an all-zero row instead of aborting the transform.
    encoder = OneHotEncoder(handle_unknown="ignore")
    league_train = encoder.fit_transform(teams_train[["league"]]).toarray()
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    # The one-hot frame must reuse the source index; with a default RangeIndex
    # the axis=1 concat would misalign rows whenever the input index is not
    # 0..n-1 (e.g. after filtering).
    teams_train = pd.concat(
        [
            teams_train.drop(columns="league"),
            pd.DataFrame(league_train, columns=league_cols, index=teams_train.index),
        ],
        axis=1,
    )

    league_test = encoder.transform(teams_test[["league"]]).toarray()
    teams_test = pd.concat(
        [
            teams_test.drop(columns="league"),
            pd.DataFrame(league_test, columns=league_cols, index=teams_test.index),
        ],
        axis=1,
    )

    # Team-name label encoding, sharing one vocabulary between the team and
    # opponent columns of both frames.
    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            teams_train["teamname"],
            teams_test["teamname"],
            teams_train["opp_teamname"],
            teams_test["opp_teamname"]
        ]
    )
    le_team.fit(all_team_names)

    for frame in (teams_train, teams_test):
        frame["teamname"] = le_team.transform(frame["teamname"])
        frame["opp_teamname"] = le_team.transform(frame["opp_teamname"])

    return teams_train, teams_test

# Rebind both names to the encoded frames returned by preprocess().
teams_train, teams_test = preprocess(teams_train, teams_test)


In [None]:
# Sanity check: list the columns that are still object-typed in each frame.
teams_train.select_dtypes("object").columns, teams_test.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Single scaler instance shared by every call to scale(); each call refits it,
# so afterwards it only remembers the most recent training frame.
scaler = MinMaxScaler()

def scale(train_ft, test_ft):
    """Min-max scale the numeric columns of a train/test pair of frames.

    The module-level ``scaler`` is refitted on the numeric columns of
    ``train_ft`` and the same fit is then applied to ``test_ft``. Both frames
    are modified in place and also returned.

    Parameters
    ----------
    train_ft, test_ft : pandas.DataFrame
        ``test_ft`` must contain every numeric column found in ``train_ft``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_ft, test_ft)``, the same objects that were passed in.
    """
    # The column list is decided by the training frame alone.
    num_columns = train_ft.select_dtypes(include="number").columns

    fitted_train = scaler.fit_transform(train_ft[num_columns])
    train_ft[num_columns] = fitted_train

    transformed_test = scaler.transform(test_ft[num_columns])
    test_ft[num_columns] = transformed_test

    return train_ft, test_ft

# Scale the encoded frames, then the un-encoded copies. Every numeric column is
# scaled, which includes the label-encoded and one-hot columns of the encoded
# frames. After the second call the shared `scaler` holds the fit for
# cat_train_ft, not for teams_train.
teams_train, teams_test = scale(teams_train, teams_test)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)


In [None]:
# Time-ordered hold-out: games whose patch is at or above the 80th percentile
# form the validation set, everything earlier is used for training.
patch_values = teams_train["patch"]
cutoff_patch = patch_values.quantile(0.8)

train_games = teams_train.loc[patch_values < cutoff_patch, "gameid"].unique()
valid_games = teams_train.loc[patch_values >= cutoff_patch, "gameid"].unique()

# Features: select rows by game membership, then drop the identifier column.
train_x = teams_train.loc[teams_train["gameid"].isin(train_games)].drop(
    columns=["gameid"], errors="ignore"
)
valid_x = teams_train.loc[teams_train["gameid"].isin(valid_games)].drop(
    columns=["gameid"], errors="ignore"
)

# Labels: the same membership filter applied to the target table.
# NOTE(review): features and labels are paired by position, which assumes
# teams_train and teams_train_target list their rows in the same order — confirm.
train_y = teams_train_target.loc[teams_train_target["gameid"].isin(train_games), "result"]
valid_y = teams_train_target.loc[teams_train_target["gameid"].isin(valid_games), "result"]


In [None]:
# gameid is only an identifier; remove it from every feature frame. With
# errors="ignore" the cell can be re-run after the column is already gone.
teams_train, teams_test, cat_train_ft, cat_test_ft = (
    feature_frame.drop(columns=["gameid"], errors="ignore")
    for feature_frame in (teams_train, teams_test, cat_train_ft, cat_test_ft)
)

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier



In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
def objective_lgbm(trial):
    """Optuna objective: mean 3-fold CV accuracy of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = LGBMClassifier(**params, random_state=SEED, n_jobs=-1)
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()

In [None]:
def objective_histgb(trial):
    """Optuna objective: mean 3-fold CV accuracy of a HistGradientBoostingClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "max_iter": trial.suggest_int("max_iter", 50, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 100),
        "l2_regularization": trial.suggest_float("l2_regularization", 0.0, 1.0),
        "max_bins": trial.suggest_int("max_bins", 128, 255),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = HistGradientBoostingClassifier(**params, random_state=SEED)
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()

In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 3-fold CV accuracy of a CatBoostClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.0, 10.0),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = CatBoostClassifier(**params, random_seed=SEED, verbose=0)
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()

In [None]:
def objective_xgboost(trial):
    """Optuna objective: mean 3-fold CV accuracy of an XGBClassifier (hist tree method).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_leaves": trial.suggest_int("max_leaves", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = XGBClassifier(**params, random_state=SEED, n_jobs=-1, tree_method="hist")
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()

In [None]:
def objective_adaboost(trial):
    """Optuna objective: mean 3-fold CV accuracy of an AdaBoostClassifier.

    ``AdaBoostClassifier`` is imported in a later cell, so that import must
    have run before this objective is called.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 2.0, log=True),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = AdaBoostClassifier(**params, random_state=SEED)
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()


In [None]:
def objective_randomforest(trial):
    """Optuna objective: mean 3-fold CV accuracy of a RandomForestClassifier.

    ``RandomForestClassifier`` is imported in a later cell, so that import must
    have run before this objective is called.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Mean accuracy over three cross-validation folds of the notebook-level
        ``train_x`` / ``train_y``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }
    # Use the notebook-wide SEED instead of a second hard-coded 42.
    model = RandomForestClassifier(**params, random_state=SEED, n_jobs=-1)
    scores = cross_val_score(model, train_x, train_y, cv=3, scoring="accuracy")
    return scores.mean()


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier

In [None]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
import optuna

# AdaBoost tuning (disabled, kept for reference)
# study_adaboost = optuna.create_study(direction="maximize")
# study_adaboost.optimize(objective_adaboost, n_trials=50)

# print("Best AdaBoost Params:", study_adaboost.best_params)
# print("Best AdaBoost Accuracy:", study_adaboost.best_value)

# RandomForest tuning
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# from one run to the next; an unseeded study proposes different trials each time.
study_randomforest = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_randomforest.optimize(objective_randomforest, n_trials=50)

print("Best RandomForest Params:", study_randomforest.best_params)
print("Best RandomForest Accuracy:", study_randomforest.best_value)


[I 2025-01-18 07:17:51,428] A new study created in memory with name: no-name-c808412d-b76f-4ab6-b9c4-a17824e8ae1e
[I 2025-01-18 07:22:14,266] Trial 0 finished with value: 0.7357323822772072 and parameters: {'n_estimators': 436, 'max_depth': 18, 'max_features': None, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7357323822772072.
[I 2025-01-18 07:22:49,000] Trial 1 finished with value: 0.7427847002298714 and parameters: {'n_estimators': 88, 'max_depth': 10, 'max_features': None, 'min_samples_split': 7, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.7427847002298714.
[I 2025-01-18 07:22:59,736] Trial 2 finished with value: 0.6691855841990509 and parameters: {'n_estimators': 155, 'max_depth': 26, 'max_features': 'log2', 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.7427847002298714.
[I 2025-01-18 07:25:43,275] Trial 3 finished with value: 0.7384253805704364 and parameters: {'n_estimators': 387, 'max_depth': 40, 'max_f

Best RandomForest Params: {'n_estimators': 320, 'max_depth': 13, 'max_features': None, 'min_samples_split': 14, 'min_samples_leaf': 8}
Best RandomForest Accuracy: 0.7470158147611015


In [None]:
print("Optimizing LightGBM...")
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
study_lgbm = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_lgbm.optimize(objective_lgbm, n_trials=50)


[I 2025-01-09 01:50:36,609] A new study created in memory with name: no-name-7c9da009-9ec3-4a0e-8599-7772ef9037d3


Optimizing LightGBM...


[I 2025-01-09 01:50:58,180] Trial 0 finished with value: 0.7531689209853889 and parameters: {'n_estimators': 392, 'num_leaves': 80, 'learning_rate': 0.13837772989846545, 'colsample_bytree': 0.9717564278984343, 'min_child_samples': 32, 'reg_alpha': 0.27750822469568825, 'reg_lambda': 0.3189908521569159}. Best is trial 0 with value: 0.7531689209853889.
[I 2025-01-09 01:51:07,330] Trial 1 finished with value: 0.7450912086502699 and parameters: {'n_estimators': 372, 'num_leaves': 148, 'learning_rate': 0.1966312170874757, 'colsample_bytree': 0.584259247278917, 'min_child_samples': 30, 'reg_alpha': 0.6598648557764274, 'reg_lambda': 0.8573857866682882}. Best is trial 0 with value: 0.7531689209853889.
[I 2025-01-09 01:51:13,205] Trial 2 finished with value: 0.753810242598238 and parameters: {'n_estimators': 237, 'num_leaves': 41, 'learning_rate': 0.131840593776364, 'colsample_bytree': 0.5532051022670699, 'min_child_samples': 22, 'reg_alpha': 0.6450476990167239, 'reg_lambda': 0.36790933320291686

In [None]:
print("Optimizing HistGradientBoosting...")
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
study_histgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_histgb.optimize(objective_histgb, n_trials=50)

[I 2025-01-09 02:10:52,104] A new study created in memory with name: no-name-4084fe3e-f7c8-44e3-b435-c2fad71d8b3a


Optimizing HistGradientBoosting...


[I 2025-01-09 02:11:05,562] Trial 0 finished with value: 0.7391920955791674 and parameters: {'max_iter': 133, 'learning_rate': 0.212241939129179, 'min_samples_leaf': 17, 'max_leaf_nodes': 24, 'l2_regularization': 0.8837366807525636, 'max_bins': 157}. Best is trial 0 with value: 0.7391920955791674.
[I 2025-01-09 02:11:11,751] Trial 1 finished with value: 0.7473979143852172 and parameters: {'max_iter': 86, 'learning_rate': 0.10853593849588092, 'min_samples_leaf': 50, 'max_leaf_nodes': 65, 'l2_regularization': 0.8671208907457687, 'max_bins': 236}. Best is trial 1 with value: 0.7473979143852172.
[I 2025-01-09 02:11:17,650] Trial 2 finished with value: 0.7588101932696008 and parameters: {'max_iter': 195, 'learning_rate': 0.03921597350554792, 'min_samples_leaf': 20, 'max_leaf_nodes': 14, 'l2_regularization': 0.361410772386467, 'max_bins': 254}. Best is trial 2 with value: 0.7588101932696008.
[I 2025-01-09 02:11:23,031] Trial 3 finished with value: 0.7566306074328392 and parameters: {'max_ite

In [None]:
print("Optimizing CatBoost...")
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
study_catboost = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_catboost.optimize(objective_catboost, n_trials=50)

[I 2025-01-09 02:19:15,349] A new study created in memory with name: no-name-c6277638-8c1c-44fc-b347-7ea26da8e599


Optimizing CatBoost...


[I 2025-01-09 02:19:31,468] Trial 0 finished with value: 0.7259860301299316 and parameters: {'n_estimators': 309, 'learning_rate': 0.14465819422723367, 'depth': 3, 'l2_leaf_reg': 4.081338318183294}. Best is trial 0 with value: 0.7259860301299316.
[I 2025-01-09 02:19:52,067] Trial 1 finished with value: 0.7130372135239392 and parameters: {'n_estimators': 323, 'learning_rate': 0.010233207823405319, 'depth': 5, 'l2_leaf_reg': 7.71713767211153}. Best is trial 0 with value: 0.7259860301299316.
[I 2025-01-09 02:19:58,054] Trial 2 finished with value: 0.7190630025354919 and parameters: {'n_estimators': 253, 'learning_rate': 0.03590662763957786, 'depth': 4, 'l2_leaf_reg': 6.439955125965467}. Best is trial 0 with value: 0.7259860301299316.
[I 2025-01-09 02:20:13,079] Trial 3 finished with value: 0.7225255768984127 and parameters: {'n_estimators': 255, 'learning_rate': 0.014368282109344155, 'depth': 6, 'l2_leaf_reg': 2.238167992823714}. Best is trial 0 with value: 0.7259860301299316.
[I 2025-01-

In [None]:
# Best hyperparameters of each finished study, one dict per line
# (LightGBM, HistGradientBoosting, CatBoost).
# study_xgb is left out: no XGBoost study is run in the visible cells.
for finished_study in (study_lgbm, study_histgb, study_catboost):
    print(finished_study.best_params)

{'n_estimators': 325, 'num_leaves': 167, 'learning_rate': 0.02085779353829406, 'colsample_bytree': 0.6830341564383431, 'min_child_samples': 11, 'reg_alpha': 0.005287540991957829, 'reg_lambda': 0.18334280172739603}
{'max_iter': 80, 'learning_rate': 0.035686841504521775, 'min_samples_leaf': 13, 'max_leaf_nodes': 57, 'l2_regularization': 0.6834596018529604, 'max_bins': 239}
{'n_estimators': 465, 'learning_rate': 0.08696667773364014, 'depth': 9, 'l2_leaf_reg': 4.517122682203731}


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier

# Models configured with the hyperparameters found by the Optuna studies.
lgbm_model = LGBMClassifier(
    n_jobs=-1, n_estimators=325, num_leaves=167, min_child_samples=11,
    learning_rate=0.02085779353829406, colsample_bytree=0.6830341564383431,
    reg_alpha=0.005287540991957829, reg_lambda=0.18334280172739603,
    verbose=-1
)

xgb_model = XGBClassifier(
    n_jobs=-1, n_estimators=102, max_leaves=24, min_child_weight=0.5378756473362285,
    learning_rate=0.08385205741762022, subsample=1.0, colsample_bylevel=0.9882258506417871,
    colsample_bytree=0.9871150418133403, reg_alpha=0.003672573686861848,
    reg_lambda=0.6055800622609863, grow_policy="lossguide", tree_method="hist",
    verbosity=0
)

hgbc_model = HistGradientBoostingClassifier(
    max_iter=80, learning_rate=0.035686841504521775, min_samples_leaf=13,
    max_leaf_nodes=57, l2_regularization=0.6834596018529604, max_bins=239,
    random_state=SEED, verbose=0
)

cat_model = CatBoostClassifier(
    n_estimators=465, learning_rate=0.08696667773364014, depth=9,
    l2_leaf_reg=4.517122682203731, thread_count=-1, verbose=False, random_seed=SEED
)

# AdaBoost and RandomForest now take the shared SEED like the models above,
# instead of a separately hard-coded 42.
ada_model = AdaBoostClassifier(
    n_estimators=70,
    learning_rate=1.9652279287062724,
    random_state=SEED
)

rf_model = RandomForestClassifier(
    n_estimators=320,
    max_depth=13,
    max_features=None,
    min_samples_split=14,
    min_samples_leaf=8,
    random_state=SEED,
    n_jobs=-1
)


models = [
    ("LGBM", lgbm_model),
    ("XGBoost", xgb_model),
    ("HistGB", hgbc_model),
    ("CatBoost", cat_model),
    ("AdaBoost", ada_model),
    ("RandomForest", rf_model),
]

# Fit each model on the training split and score it on the validation split.
# The results are named valid_* because they describe valid_x, not the test set.
for model_name, model in models:
    print(f"\n=== {model_name} ===")
    model.fit(train_x, train_y)
    valid_pred = model.predict(valid_x)
    valid_prob = model.predict_proba(valid_x)[:, 1]
    acc = accuracy_score(valid_y, valid_pred)
    auc = roc_auc_score(valid_y, valid_prob)

    print(f"Accuracy: {acc:.4f}")
    print(f"AUC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(valid_y, valid_pred))



=== LGBM ===
Accuracy: 0.7711
AUC: 0.8780

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1060
           1       0.77      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114


=== XGBoost ===
Accuracy: 0.7744
AUC: 0.8767

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1060
           1       0.76      0.80      0.78      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.78      0.77      0.77      2114


=== HistGB ===
Accuracy: 0.7739
AUC: 0.8789

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1060
           1       0.76      0.

In [None]:
# Shapes of the feature frames and of the target tables.
teams_train.shape, teams_test.shape, teams_train_target.shape, teams_test_target.shape

((9913, 90), (2324, 90), (9913, 3), (2324, 3))

In [None]:
# Shapes of the train/validation split; feature and label row counts must match pairwise.
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape

((7799, 90), (7799,), (2114, 90), (2114,))

In [None]:
# Number of test labels; should equal the row count of teams_test.
target.shape

(2324,)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier

# Score the Optuna-tuned models on the held-out test set.
# This cell used to re-declare all six estimators with exactly the same
# arguments as the cell that builds `models`; the duplicate is removed so the
# hyperparameters live in one place. Each estimator is refitted here, so the
# cell does not depend on an earlier fit having happened.
for model_name, model in models:
    print(f"\n=== {model_name} ===")

    model.fit(train_x, train_y)
    test_pred = model.predict(teams_test)
    test_prob = model.predict_proba(teams_test)

    acc = accuracy_score(target, test_pred)
    # Column 1 holds the probability of the positive class.
    auc = roc_auc_score(target, test_prob[:, 1])

    print(f"Accuracy: {acc:.4f}")
    print(f"AUC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(target, test_pred))




=== LGBM ===
Accuracy: 0.7560
AUC: 0.8504

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1160
           1       0.76      0.74      0.75      1164

    accuracy                           0.76      2324
   macro avg       0.76      0.76      0.76      2324
weighted avg       0.76      0.76      0.76      2324


=== XGBoost ===
Accuracy: 0.7401
AUC: 0.8300

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1160
           1       0.74      0.74      0.74      1164

    accuracy                           0.74      2324
   macro avg       0.74      0.74      0.74      2324
weighted avg       0.74      0.74      0.74      2324


=== HistGB ===
Accuracy: 0.7603
AUC: 0.8457

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.74      0.76      1160
           1       0.75      0.

In [None]:
import pickle

# Persist every fitted model as "<name>_model.pkl" in the working directory
# and print a one-word confirmation per file.
for saved_name, fitted_model in models:
    with open(f"{saved_name}_model.pkl", 'wb') as pickle_file:
        pickle.dump(fitted_model, pickle_file)
    print("완")


완
완
완
완
완
완


## 앙상블 (Ensemble)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [None]:
# Export the stacked model's win probability for every validation row.
# Run this only after `stacking_clf` has been fitted; it is first assigned in
# a later cell of this notebook.
#
# The earlier version of this cell could not run: `gameid`, `teamname` and
# `submit` were never defined, the filtered frame was stored as `fillter` but
# the builtin `filter` was passed to predict_proba, and valid_x no longer has
# a "gameid" column. The row identifiers are therefore taken from the target
# table, filtered by the same validation games.
valid_keys = teams_train_target.loc[
    teams_train_target["gameid"].isin(valid_games), ["gameid", "teamname"]
]

# predict_proba returns one column per class; column 1 is the win probability.
sub_pred = stacking_clf.predict_proba(valid_x)[:, 1]
submit = valid_keys.assign(result=sub_pred)
submit.to_csv("valid_예측.csv", index=False)


In [None]:
# FIXME: this scratch cell cannot run as written. `gameid`, `teamname` and
# `submit` are not defined in any visible cell, and `stacking_clf` is first
# assigned in a later cell. The working validation export is the later cell
# that writes 'output/valid_예측.csv'; consider deleting this one.
sub_pred = stacking_clf.predict_proba(valid_x[gameid][teamname])
submit["result"] = sub_pred
submit.to_csv("valid_예측.csv", index=False)

In [None]:
# Four-model stack (XGBoost and AdaBoost are left out) with a logistic
# regression meta-learner trained on 5-fold out-of-fold predictions.
estimators = [
    ("LGBM", lgbm_model),
    # ("XGBoost", xgb_model),
    ("HistGB", hgbc_model),
    ("CatBoost", cat_model),
    # ("AdaBoost", ada_model),
    ("RandomForest", rf_model),
]

final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)
stacking_clf.fit(train_x, train_y)

# Report on the held-out test set.
stacking_test_labels = stacking_clf.predict(teams_test)
print(classification_report(target, stacking_test_labels))




              precision    recall  f1-score   support

           0       0.76      0.75      0.75      1160
           1       0.76      0.76      0.76      1164

    accuracy                           0.76      2324
   macro avg       0.76      0.76      0.76      2324
weighted avg       0.76      0.76      0.76      2324



In [None]:
import os

# Create the folder the prediction CSVs are written to; no error if it already exists.
os.makedirs("output", exist_ok=True)


In [None]:
import joblib

# Save the current stacking model to the working directory.
# NOTE(review): the file name starts with "5_", but in top-to-bottom order
# `stacking_clf` is still the four-estimator stack at this point (the
# five-estimator stack is built in the next cell) — confirm which one was meant.
joblib.dump(stacking_clf, "5_stacking_model_0120.pkl")
print("완")


완


In [None]:
# Five-model stack (everything except XGBoost) with a logistic regression
# meta-learner trained on 5-fold out-of-fold predictions.
estimators = [
    ("LGBM", lgbm_model),
    # ("XGBoost", xgb_model),
    ("HistGB", hgbc_model),
    # Was " CatBoost" with a stray leading space, which made the estimator
    # reachable only under that misspelt key (named_estimators_, set_params).
    ("CatBoost", cat_model),
    ("AdaBoost", ada_model),
    ("RandomForest", rf_model),
]

final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)

stacking_clf.fit(train_x, train_y)

# Report on the validation split.
print(classification_report(valid_y, stacking_clf.predict(valid_x)))



              precision    recall  f1-score   support

           0       0.78      0.77      0.77      1060
           1       0.77      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



## 검증셋 (Validation set)

In [None]:
# Target-table rows of the validation games; supplies gameid/teamname for the export below.
valid_df = teams_train_target[teams_train_target["gameid"].isin(valid_games)]

In [None]:
# Refit the stack on the training split and score the validation split.
stacking_clf.fit(train_x, train_y)

# Despite its name this holds validation-set win probabilities; the name is
# kept because the export cell that follows reads it.
stacking_test_proba = stacking_clf.predict_proba(valid_x)[:, 1]

# Hard labels at the 0.5 threshold are used for accuracy only; ROC AUC is
# computed from the probabilities themselves.
valid_labels = (stacking_test_proba >= 0.5).astype(int)
acc = accuracy_score(valid_y, valid_labels)
roc_auc = roc_auc_score(valid_y, stacking_test_proba)
print(f"Accuracy: {acc:.4f}, ROC AUC: {roc_auc:.4f}")



Accuracy: 0.7734, ROC AUC: 0.8800


In [None]:
# One row per validation (gameid, teamname) pair with its predicted win
# probability; the probabilities are attached by position.
pred_valid_proba = valid_df[["gameid", "teamname"]].assign(win_pred=stacking_test_proba)
pred_valid_proba.to_csv('output/valid_예측.csv', index=False)

## 테스트 데이터 (Test data)

In [None]:
# Final model: refit the stack on the whole training table (training and
# validation games together), using the same feature columns as train_x.
feature_columns = train_x.columns
train_ft = teams_train[feature_columns]
test_ft = teams_test[feature_columns]

stacking_clf.fit(train_ft, teams_train_target["result"])

# Win probability per test row, then hard labels at the 0.5 threshold.
stacking_test_proba = stacking_clf.predict_proba(test_ft)[:, 1]
final_test_pred = (stacking_test_proba >= 0.5).astype(int)

print(classification_report(teams_test_target["result"], final_test_pred))



              precision    recall  f1-score   support

           0       0.77      0.76      0.76      1160
           1       0.76      0.77      0.77      1164

    accuracy                           0.77      2324
   macro avg       0.77      0.77      0.77      2324
weighted avg       0.77      0.77      0.77      2324



In [None]:
# Accuracy is computed from the thresholded labels, ROC AUC from the predicted
# probabilities. Passing the 0/1 labels to roc_auc_score (as this cell did
# before) collapses the ROC curve to a single operating point, so the reported
# "AUC" was nearly identical to the accuracy.
test_acc = accuracy_score(teams_test_target["result"], final_test_pred)
test_roc_auc = roc_auc_score(teams_test_target["result"], stacking_test_proba)
print(f"Accuracy: {test_acc:.4f}, ROC AUC: {test_roc_auc:.4f}")

Accuracy: 0.7651, ROC AUC: 0.7650


In [None]:
# One row per test (gameid, teamname) pair with its predicted win probability;
# the probabilities are attached by position.
pred_test_proba = teams_test_target[["gameid", "teamname"]].assign(win_pred=stacking_test_proba)
pred_test_proba.to_csv('output/test_예측.csv', index=False)