In [95]:
import pandas as pd
import numpy as np
import json

DATA_PATH = "LoLesports_data/"
SEED = 42

teams_train = pd.read_csv(f"{DATA_PATH}teams_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}teams_test.csv")

teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")

teams_train.shape, teams_test.shape, teams_train_target.shape, teams_test_target.shape

((9913, 111), (2324, 111), (9913, 3), (2324, 3))

# 컬럼 추가

## 상대 팀 추가

In [96]:
temp_opp_teams = teams_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)
temp_opp_teams = teams_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

## 날짜 추가

In [97]:
teams_train["date"] = pd.to_datetime(teams_train["date"])
teams_test["date"] = pd.to_datetime(teams_test["date"])

teams_train["year"] = teams_train["date"].dt.year
teams_train["month"] = teams_train["date"].dt.month
teams_train["day"] = teams_train["date"].dt.day
teams_train["hour"] = teams_train["date"].dt.hour
teams_train["minute"] = teams_train["date"].dt.minute

teams_test["year"] = teams_test["date"].dt.year
teams_test["month"] = teams_test["date"].dt.month
teams_test["day"] = teams_test["date"].dt.day
teams_test["hour"] = teams_test["date"].dt.hour
teams_test["minute"] = teams_test["date"].dt.minute

## 데이터 타입 변경

In [98]:
cols = ["league", "split", "teamname", "opp_teamname", "ban1", "ban2", "ban3", "ban4", "ban5", "pick1", "pick2", "pick3", "pick4", "pick5"]

teams_train[cols] = teams_train[cols].astype("category")
teams_test[cols] = teams_test[cols].astype("category")

# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [99]:
pre_game_features = [
    "gameid",
    "patch",
    "side",
    "league",
    "teamname",
    "opp_teamname",
    "ban1",
    "ban2",
    "ban3",
    "ban4",
    "ban5",
    "pick1",
    "pick2",
    "pick3",
    "pick4",
    "pick5",
    "year",
    "month",
    "day",
    "hour",
    "minute",
]

train_ft = teams_train[pre_game_features]
test_ft = teams_test[pre_game_features]
train_target = teams_train_target["result"]
test_target = teams_test_target["result"]

train_ft.shape, test_ft.shape

((9913, 21), (2324, 21))

- 성능점수 확인

In [100]:
base_train_ft = train_ft.copy()
base_test_ft = test_ft.copy()
base_train_target = train_target.copy()
base_test_target = test_target.copy()

base_pre_game_features = pre_game_features.copy()

In [101]:
base_train_ft["side"] = base_train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
base_test_ft["side"] = base_test_ft["side"].map({"Blue": 0, "Red": 1})

cat_base_train_ft = base_train_ft.copy()
cat_base_test_ft = base_test_ft.copy()
cat_features = cat_base_train_ft.select_dtypes("category").columns

base_train_ft.shape, base_test_ft.shape, cat_base_train_ft.shape, cat_base_test_ft.shape

((9913, 21), (2324, 21), (9913, 21), (2324, 21))

In [102]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def preprocess(train_ft, test_ft, pre_game_features):
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5'] # 챔피언 레이블인코딩

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().unique()

    champions_df = pd.DataFrame({'champion': champions})
    champions_df = champions_df.dropna().reset_index(drop=True)

    le = LabelEncoder()
    champions_df['champion_encoded'] = le.fit_transform(champions_df['champion'])

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])
        
    encoder = OneHotEncoder() # 리그 원핫인코딩
    league_encoded = encoder.fit_transform(train_ft[["league"]]).toarray()
    pre_game_features.extend(encoder.get_feature_names_out())
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    train_ft = pd.concat(
        [train_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    train_ft.drop("league", axis=1, inplace=True)

    league_encoded = encoder.transform(test_ft[["league"]]).toarray()
    test_ft = pd.concat(
        [test_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    test_ft.drop("league", axis=1, inplace=True)

    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    train_ft["teamname"] = le_team.transform(train_ft["teamname"])
    train_ft["opp_teamname"] = le_team.transform(train_ft["opp_teamname"])

    test_ft["teamname"] = le_team.transform(test_ft["teamname"])
    test_ft["opp_teamname"] = le_team.transform(test_ft["opp_teamname"])
    
    return train_ft, test_ft, pre_game_features

base_train_ft, base_test_ft, base_pre_game_features = preprocess(base_train_ft, base_test_ft, base_pre_game_features)

base_train_ft.shape, base_test_ft.shape

((9913, 28), (2324, 28))

In [103]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    train_ft[train_ft.select_dtypes("number").columns] = scaler.fit_transform(
        train_ft[train_ft.select_dtypes("number").columns]
    )
    test_ft[test_ft.select_dtypes("number").columns] = scaler.transform(
        test_ft[test_ft.select_dtypes("number").columns]
    )
    return train_ft, test_ft


base_train_ft, base_test_ft = scale(base_train_ft, base_test_ft)
cat_base_train_ft, cat_base_test_ft = scale(cat_base_train_ft, cat_base_test_ft)

base_train_ft.shape, base_test_ft.shape, cat_base_train_ft.shape, cat_base_test_ft.shape

((9913, 28), (2324, 28), (9913, 21), (2324, 21))

In [104]:
cutoff_patch = base_train_ft["patch"].quantile(0.8)
train_games = base_train_ft[base_train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = base_train_ft[base_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

base_train_x = base_train_ft[base_train_ft["gameid"].isin(train_games)]
base_valid_x = base_train_ft[base_train_ft["gameid"].isin(valid_games)]

base_train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
base_valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

base_train_x = base_train_x.drop(columns=["gameid"])
base_valid_x = base_valid_x.drop(columns=["gameid"])

base_train_x.shape, base_valid_x.shape

((7799, 27), (2114, 27))

In [105]:
cat_base_train_games = cat_base_train_ft[cat_base_train_ft["patch"] < cutoff_patch]["gameid"].unique()
cat_base_valid_games = cat_base_train_ft[cat_base_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

cat_base_train_x = cat_base_train_ft[cat_base_train_ft["gameid"].isin(cat_base_train_games)]
cat_base_valid_x = cat_base_train_ft[cat_base_train_ft["gameid"].isin(cat_base_valid_games)]

cat_base_train_y = teams_train_target[teams_train_target["gameid"].isin(cat_base_train_games)]["result"]
cat_base_valid_y = teams_train_target[teams_train_target["gameid"].isin(cat_base_valid_games)]["result"]

cat_base_train_x = cat_base_train_x.drop(columns=["gameid"])
cat_base_valid_x = cat_base_valid_x.drop(columns=["gameid"])

cat_base_train_x.shape, cat_base_valid_x.shape

((7799, 20), (2114, 20))

In [106]:
base_test_ft = base_test_ft.drop(columns=["gameid"])
cat_base_test_ft = cat_base_test_ft.drop(columns=["gameid"])

In [107]:
# base_train_x.to_csv("vis/data/base_train_x.csv", index=False)
# base_valid_x.to_csv("vis/data/base_valid_x.csv", index=False)
# base_test_ft.to_csv("vis/data/base_test_ft.csv", index=False)
# cat_base_train_x.to_csv("vis/data/cat_base_train_x.csv", index=False)
# cat_base_valid_x.to_csv("vis/data/cat_base_valid_x.csv", index=False)
# cat_base_test_ft.to_csv("vis/data/cat_base_test_ft.csv", index=False)
# base_train_y.to_csv("vis/data/base_train_y.csv", index=False)
# base_valid_y.to_csv("vis/data/base_valid_y.csv", index=False)
# base_test_target.to_csv("vis/data/base_test_target.csv", index=False)

- 검증 데이터 성능점수 확인

In [108]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [109]:
cat_features = cat_base_train_x.select_dtypes("category").columns.tolist()

models = [
    LogisticRegression(random_state=SEED),
    LGBMClassifier(random_state=SEED, n_jobs=-1, verbose=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED, algorithm="SAMME"),
    SVC(random_state=SEED, probability=True),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0, cat_features=cat_features),
]

for model in models:
    if model.__class__.__name__ == "CatBoostClassifier":
        valid_acc = cross_val_score(model, cat_base_train_x, cat_base_train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
        valid_roc_auc = cross_val_score(model, cat_base_train_x, cat_base_train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
        print(f"{model.__class__.__name__} - Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")
    else:
        valid_acc = cross_val_score(model, base_train_x, base_train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
        valid_roc_auc = cross_val_score(model, base_train_x, base_train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
        print(f"{model.__class__.__name__} - Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

LogisticRegression - Accuracy: 0.5058, ROC AUC: 0.5200
LGBMClassifier - Accuracy: 0.5841, ROC AUC: 0.6167
RandomForestClassifier - Accuracy: 0.5509, ROC AUC: 0.5664
HistGradientBoostingClassifier - Accuracy: 0.5832, ROC AUC: 0.6133
AdaBoostClassifier - Accuracy: 0.5567, ROC AUC: 0.5816
SVC - Accuracy: 0.5093, ROC AUC: 0.5185
XGBClassifier - Accuracy: 0.5723, ROC AUC: 0.6008
CatBoostClassifier - Accuracy: 0.6097, ROC AUC: 0.6567


- 테스트 데이터 성능점수 확인

In [110]:
for model in models:
    if model.__class__.__name__ == "CatBoostClassifier":
        model.fit(pd.concat([cat_base_train_x, cat_base_valid_x]), pd.concat([cat_base_train_y, cat_base_valid_y]))
        test_pred = model.predict(cat_base_test_ft)
        test_pred_proba = model.predict_proba(cat_base_test_ft)[:, 1]
        print(f"{model.__class__.__name__} - Accuracy: {accuracy_score(teams_test_target['result'], test_pred):.4f}, ROC AUC: {roc_auc_score(teams_test_target['result'], test_pred_proba):.4f}")
    else:
        model.fit(pd.concat([base_train_x, base_valid_x]), pd.concat([base_train_y, base_valid_y]))
        test_pred = model.predict(base_test_ft)
        test_pred_proba = model.predict_proba(base_test_ft)[:, 1]
        print(f"{model.__class__.__name__} - Accuracy: {accuracy_score(teams_test_target['result'], test_pred):.4f}, ROC AUC: {roc_auc_score(teams_test_target['result'], test_pred_proba):.4f}")


LogisticRegression - Accuracy: 0.5297, ROC AUC: 0.5339
LGBMClassifier - Accuracy: 0.6196, ROC AUC: 0.6657
RandomForestClassifier - Accuracy: 0.5757, ROC AUC: 0.6124
HistGradientBoostingClassifier - Accuracy: 0.6256, ROC AUC: 0.6740
AdaBoostClassifier - Accuracy: 0.5830, ROC AUC: 0.6269
SVC - Accuracy: 0.5284, ROC AUC: 0.5328
XGBClassifier - Accuracy: 0.5968, ROC AUC: 0.6365
CatBoostClassifier - Accuracy: 0.6183, ROC AUC: 0.6769


In [111]:
# import wandb

# cat_features = cat_base_train_x.select_dtypes("category").columns.tolist()


# def train_and_evaluate(
#     train_x,
#     valid_x,
#     test_x,
#     cat_train_x,
#     cat_valid_x,
#     cat_test_x,
#     train_y,
#     valid_y,
#     test_y,
#     cat_features,
#     use_additional_data=False,
# ):
#     models = {
#         "lr": LogisticRegression(random_state=SEED),
#         "lgbm": LGBMClassifier(random_state=SEED, n_jobs=-1),
#         "rf": RandomForestClassifier(random_state=SEED, n_jobs=-1),
#         "hgbc": HistGradientBoostingClassifier(random_state=SEED),
#         "ada": AdaBoostClassifier(random_state=SEED),
#         "svc": SVC(random_state=SEED, probability=True),
#         "xgb": XGBClassifier(random_state=SEED, n_jobs=-1),
#         "cat": CatBoostClassifier(
#             random_state=SEED, verbose=0, cat_features=cat_features
#         ),
#     }

#     for name, model in models.items():
#         run_name_suffix = "additional" if use_additional_data else "base"
        
#         if wandb.run is not None:
#             wandb.finish()
            
#         wandb.init(
#             project="temp-lol-match-prediction2",
#             name=f"{name}_{run_name_suffix}",
#             config={"seed": SEED},
#         )

#         if name == "cat":
#             model.fit(cat_train_x, train_y)
#             valid_accuracy = accuracy_score(valid_y, model.predict(cat_valid_x))
#             valid_roc_auc = roc_auc_score(
#                 valid_y, model.predict_proba(cat_valid_x)[:, 1]
#             )
#             test_accuracy = accuracy_score(test_y, model.predict(cat_test_x))
#             test_roc_auc = roc_auc_score(test_y, model.predict_proba(cat_test_x)[:, 1])
#         else:
#             model.fit(train_x, train_y)
#             valid_accuracy = accuracy_score(valid_y, model.predict(valid_x))
#             valid_roc_auc = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])
#             test_accuracy = accuracy_score(test_y, model.predict(test_x))
#             test_roc_auc = roc_auc_score(test_y, model.predict_proba(test_x)[:, 1])

#         wandb.log(
#             {
#                 "valid_accuracy": valid_accuracy,
#                 "valid_roc_auc": valid_roc_auc,
#                 "test_accuracy": test_accuracy,
#                 "test_roc_auc": test_roc_auc,
#                 "model": name,
#                 "feature_type": "base" if not use_additional_data else "additional",
#             }
#         )
#         wandb.finish()


# train_and_evaluate(
#     base_train_x,
#     base_valid_x,
#     base_test_ft,
#     cat_base_train_x,
#     cat_base_valid_x,
#     cat_base_test_ft,
#     base_train_y,
#     base_valid_y,
#     base_test_target,
#     cat_features,
#     use_additional_data=False,
# )

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [112]:
stats_columns = [
    "result",
    "gamelength",
    "kills",
    "deaths",
    "assists",
    "firstblood",
    "team kpm",
    "ckpm",
    "firstdragon",
    "firstherald",
    "void_grubs",
    "firstbaron",
    "firsttower",
    "towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "inhibitors",
    "damagetochampions",
    "dpm",
    "damagetakenperminute",
    "damagemitigatedperminute",
    "wardsplaced",
    "wpm",
    "wardskilled",
    "wcpm",
    "controlwardsbought",
    "visionscore",
    "vspm",
]

In [113]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(["teamname", "year", "month", "day", "hour", "minute"]).reset_index(drop=True)
temp_test = teams_test.sort_values(["teamname", "year", "month", "day", "hour", "minute"]).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby("teamname", observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f"recent10_{col}": recent10_train})

    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(
        ["teamname", "year", "month", "day", "hour", "minute"]
    )
    recent10_combined = combined_data.groupby("teamname", observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f"recent10_{col}": recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f"recent10_{col}"].values
    test_ft = test_ft.assign(**{f"recent10_{col}": recent10_test})

    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[["teamname", "year", "month", "day", "hour", "minute", f"recent10_{col}"]],
        left_on=["opp_teamname", "year", "month", "day", "hour", "minute"],
        right_on=["teamname", "year", "month", "day", "hour", "minute"],
        suffixes=("", "_opp"),
    )
    train_ft = train_ft.assign(
        **{f"opp_recent10_{col}": merged_train[f"recent10_{col}_opp"]}
    )

    merged_test = test_ft.merge(
        combined_data[["teamname", "year", "month", "day", "hour", "minute", f"recent10_{col}"]],
        left_on=["opp_teamname", "year", "month", "day", "hour", "minute"],
        right_on=["teamname", "year", "month", "day", "hour", "minute"],
        suffixes=("", "_opp"),
    )
    test_ft = test_ft.assign(
        **{f"opp_recent10_{col}": merged_test[f"recent10_{col}_opp"]}
    )

    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == "result" else 0
    train_ft = train_ft.assign(
        **{
            f"recent10_{col}": train_ft[f"recent10_{col}"].fillna(default_value),
            f"opp_recent10_{col}": train_ft[f"opp_recent10_{col}"].fillna(
                default_value
            ),
        }
    )
    test_ft = test_ft.assign(
        **{
            f"recent10_{col}": test_ft[f"recent10_{col}"].fillna(default_value),
            f"opp_recent10_{col}": test_ft[f"opp_recent10_{col}"].fillna(default_value),
        }
    )

    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f"recent10_{col}", f"opp_recent10_{col}"])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 상대 전적

In [114]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day', 'hour', 'minute'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 80), (2324, 80))

### 각 팀의 리그별 승률

In [115]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day', 'hour', 'minute'])

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 81), (2324, 81))

# 인코딩

In [116]:
train_ft["side"] = train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
test_ft["side"] = test_ft["side"].map({"Blue": 0, "Red": 1})

In [117]:
featured_data = pd.concat([train_ft, test_ft], ignore_index=True)
featured_data.to_csv("LoLesports_data/featured_data.csv", index=False)

In [118]:
cat_train_ft = train_ft.copy()
cat_test_ft = test_ft.copy()
cat_pre_game_features = pre_game_features.copy()

In [119]:
train_ft, test_ft, pre_game_features = preprocess(train_ft, test_ft, pre_game_features)

In [120]:
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [121]:
train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

train_ft.shape, test_ft.shape, cat_train_ft.shape, cat_test_ft.shape

((9913, 88), (2324, 88), (9913, 81), (2324, 81))

In [122]:
cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = train_ft[train_ft["gameid"].isin(train_games)]
valid_x = train_ft[train_ft["gameid"].isin(valid_games)]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x = train_x.drop(columns=["gameid"])
valid_x = valid_x.drop(columns=["gameid"])

train_x.shape, valid_x.shape

((7799, 87), (2114, 87))

In [124]:
valid_df = teams_train_target[teams_train_target["gameid"].isin(valid_games)]

In [125]:
cat_train_games = cat_train_ft[cat_train_ft["patch"] < cutoff_patch]["gameid"].unique()
cat_valid_games = cat_train_ft[cat_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

cat_train_x = cat_train_ft[cat_train_ft["gameid"].isin(cat_train_games)]
cat_valid_x = cat_train_ft[cat_train_ft["gameid"].isin(cat_valid_games)]

cat_train_y = teams_train_target[teams_train_target["gameid"].isin(cat_train_games)]["result"]
cat_valid_y = teams_train_target[teams_train_target["gameid"].isin(cat_valid_games)]["result"]

cat_train_x = cat_train_x.drop(columns=["gameid"])
cat_valid_x = cat_valid_x.drop(columns=["gameid"])

cat_train_x.shape, cat_valid_x.shape

((7799, 80), (2114, 80))

In [126]:
test_ft = test_ft.drop(columns=["gameid"])
cat_test_ft = cat_test_ft.drop(columns=["gameid"])

In [127]:
for model in models:
    if model.__class__.__name__ == "CatBoostClassifier":
        valid_acc = cross_val_score(model, cat_valid_x, cat_valid_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
        valid_roc_auc = cross_val_score(model, cat_valid_x, cat_valid_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
        print(f"{model.__class__.__name__} - Valid Accuracy: {np.mean(valid_acc):.4f}, Valid ROC AUC: {np.mean(valid_roc_auc):.4f}")
    else:
        valid_acc = cross_val_score(model, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
        valid_roc_auc = cross_val_score(model, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
        print(f"{model.__class__.__name__} - Valid Accuracy: {np.mean(valid_acc):.4f}, Valid ROC AUC: {np.mean(valid_roc_auc):.4f}")

LogisticRegression - Valid Accuracy: 0.6667, Valid ROC AUC: 0.7349
LGBMClassifier - Valid Accuracy: 0.7238, Valid ROC AUC: 0.8074
RandomForestClassifier - Valid Accuracy: 0.6593, Valid ROC AUC: 0.7304
HistGradientBoostingClassifier - Valid Accuracy: 0.7132, Valid ROC AUC: 0.8014
AdaBoostClassifier - Valid Accuracy: 0.6654, Valid ROC AUC: 0.7353
SVC - Valid Accuracy: 0.6650, Valid ROC AUC: 0.7356
XGBClassifier - Valid Accuracy: 0.7149, Valid ROC AUC: 0.8019
CatBoostClassifier - Valid Accuracy: 0.7159, Valid ROC AUC: 0.8206


In [128]:
for model in models:
    if model.__class__.__name__ == "CatBoostClassifier":
        model.fit(pd.concat([cat_train_x, cat_valid_x]), pd.concat([cat_train_y, cat_valid_y]))
        test_pred = model.predict(cat_test_ft)
        test_pred_proba = model.predict_proba(cat_test_ft)[:, 1]
        print(f"{model.__class__.__name__} - Accuracy: {accuracy_score(teams_test_target['result'], test_pred):.4f}, ROC AUC: {roc_auc_score(teams_test_target['result'], test_pred_proba):.4f}")
    else:
        model.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
        test_pred = model.predict(test_ft)
        test_pred_proba = model.predict_proba(test_ft)[:, 1]
        print(f"{model.__class__.__name__} - Accuracy: {accuracy_score(teams_test_target['result'], test_pred):.4f}, ROC AUC: {roc_auc_score(teams_test_target['result'], test_pred_proba):.4f}")

LogisticRegression - Accuracy: 0.6609, ROC AUC: 0.7195


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LGBMClassifier - Accuracy: 0.7560, ROC AUC: 0.8391
RandomForestClassifier - Accuracy: 0.6515, ROC AUC: 0.7053
HistGradientBoostingClassifier - Accuracy: 0.7496, ROC AUC: 0.8394
AdaBoostClassifier - Accuracy: 0.6743, ROC AUC: 0.7397
SVC - Accuracy: 0.6566, ROC AUC: 0.7221
XGBClassifier - Accuracy: 0.7173, ROC AUC: 0.7958
CatBoostClassifier - Accuracy: 0.7207, ROC AUC: 0.8052


In [129]:
# train_x.to_csv("vis/data/train_x.csv", index=False)
# valid_x.to_csv("vis/data/valid_x.csv", index=False)
# test_ft.to_csv("vis/data/test_ft.csv", index=False)
# cat_train_x.to_csv("vis/data/cat_train_x.csv", index=False)
# cat_valid_x.to_csv("vis/data/cat_valid_x.csv", index=False)
# cat_test_ft.to_csv("vis/data/cat_test_ft.csv", index=False)
# train_y.to_csv("vis/data/train_y.csv", index=False)
# valid_y.to_csv("vis/data/valid_y.csv", index=False)
# test_target.to_csv("vis/data/test_target.csv", index=False)

In [130]:
# train_and_evaluate(
#     train_x,
#     valid_x,
#     test_ft,
#     cat_train_x,
#     cat_valid_x,
#     cat_test_ft,
#     train_y,
#     valid_y,
#     test_target,
#     cat_features,
#     use_additional_data=True,
# )

# 모델 학습 및 검증

- 하이퍼파라미터 튜닝 클래스

In [131]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    def __init__(self, model, params, train, target, cat_features=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        self.study = optuna.create_study(direction="maximize")

    def objective(self, trial):
        params = {}
        
        for param_name, param_range in self.params.items():
            if param_range["type"] == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
        if self.model == CatBoostClassifier:
            model = self.model(**params, cat_features=self.cat_features)
        else:
            model = self.model(**params)

        model.fit(self.train, self.target)
            
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        return self.study.best_params

    def best_score(self):
        return self.study.best_value

### 모든 컬럼 형식이 number인 데이터셋

- LogisticRegression

In [132]:
# params = {
#     "C": {"type": "float", "min": 0.01, "max": 10},
#     "penalty": {"type": "categorical", "values": ["l1", "l2"]},
#     "solver": {"type": "categorical", "values": ["liblinear", "saga"]},
#     "max_iter": {"type": "int", "min": 100, "max": 2000},
# }

# lr_vt_tuner = HyperparameterTuner(LogisticRegression, params, train_x, train_y)
# lr_vt_tuner.optimize(100)
# lr_vt_tuner.best_params(), lr_vt_tuner.best_score()

In [133]:
params = {
    "C": 0.38414964961856957,
    "penalty": "l1",
    "solver": "liblinear",
    "max_iter": 1903
}

lr_tuned = LogisticRegression(**params)
lr_tuned.fit(train_x, train_y)
print(classification_report(valid_y, lr_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68      1060
           1       0.67      0.73      0.70      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



In [134]:
valid_acc = cross_val_score(lr_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(lr_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.6717, ROC AUC: 0.7385


In [135]:
lr_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], lr_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], lr_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.6756, ROC AUC: 0.7258


- LightGBM

In [136]:
# params = {
#     "n_estimators": {"type": "int", "min": 100, "max": 300},
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1}, 
#     "max_depth": {"type": "int", "min": 8, "max": 12},
#     "num_leaves": {"type": "int", "min": 100, "max": 150},
#     "min_child_samples": {"type": "int", "min": 5, "max": 15},
#     "subsample": {"type": "float", "min": 0.5, "max": 0.7},
#     "colsample_bytree": {"type": "float", "min": 0.4, "max": 0.6},
#     "reg_alpha": {"type": "float", "min": 0.001, "max": 0.1},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 6.0},
#     "verbose": {"type": "int", "min": -1, "max": -1}
# }

# lgbm_tuner = HyperparameterTuner(LGBMClassifier, params, train_x, train_y)
# lgbm_tuner.optimize(100)
# lgbm_tuner.best_params(), lgbm_tuner.best_score()

In [137]:
params = {
    "n_estimators": 285,
    "learning_rate": 0.023007542937157222,
    "max_depth": 9,
    "num_leaves": 101,
    "min_child_samples": 9,
    "subsample": 0.6754653255644233,
    "colsample_bytree": 0.5153479009794544,
    "reg_alpha": 0.07512515626736012,
    "reg_lambda": 3.3370499751525755,
    "verbose": -1,
}

lgbm_tuned = LGBMClassifier(**params)
lgbm_tuned.fit(train_x, train_y)
print(classification_report(valid_y, lgbm_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1060
           1       0.76      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



In [139]:
valid_acc = cross_val_score(lgbm_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(lgbm_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7242, ROC AUC: 0.8062


In [140]:
lgbm_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], lgbm_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], lgbm_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.7522, ROC AUC: 0.8370


- RandomForestClassifier

In [141]:
# params = {
#     "n_estimators": {"type": "int", "min": 800, "max": 1100},
#     "max_depth": {"type": "int", "min": 15, "max": 21},
#     "min_samples_split": {"type": "int", "min": 15, "max": 23},
#     "min_samples_leaf": {"type": "int", "min": 7, "max": 11},
#     "max_features": {"type": "float", "min": 0.7, "max": 0.85},
#     "bootstrap": {"type": "categorical", "values": [False]},
#     "class_weight": {"type": "categorical", "values": ["balanced"]}
# }

# rf_tuner = HyperparameterTuner(RandomForestClassifier, params, train_x, train_y)
# rf_tuner.optimize(20)
# rf_tuner.best_params(), rf_tuner.best_score()

In [142]:
params = {
    "n_estimators": 954,
    "max_depth": 18,
    "min_samples_split": 19,
    "min_samples_leaf": 9,
    "max_features": 0.7814902230628112,
    "bootstrap": False,
    "class_weight": "balanced",
}

rf_tuned = RandomForestClassifier(**params)
rf_tuned.fit(train_x, train_y)
print(classification_report(valid_y, rf_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.77      0.76      0.77      1060
           1       0.76      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



In [143]:
valid_acc = cross_val_score(rf_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(rf_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7076, ROC AUC: 0.7986


In [144]:
rf_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], rf_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], rf_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.7466, ROC AUC: 0.8405


- HistGradientBoostingClassifier

In [145]:
# params = {
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1},
#     "max_depth": {"type": "int", "min": 8, "max": 15},
#     "max_iter": {"type": "int", "min": 100, "max": 300},
#     "min_samples_leaf": {"type": "int", "min": 5, "max": 15},
#     "l2_regularization": {"type": "float", "min": 0.5, "max": 3.0},
#     "max_leaf_nodes": {"type": "int", "min": 40, "max": 90}
# }

# hgbc_tuner = HyperparameterTuner(HistGradientBoostingClassifier, params, train_x, train_y)
# hgbc_tuner.optimize(100)
# hgbc_tuner.best_params(), hgbc_tuner.best_score()

In [146]:
params = {
    "learning_rate": 0.022119818280047138,
    "max_depth": 12,
    "max_iter": 276,
    "min_samples_leaf": 5,
    "l2_regularization": 0.9584267642328501,
    "max_leaf_nodes": 44,
}


hgbc_tuned = HistGradientBoostingClassifier(**params)
hgbc_tuned.fit(train_x, train_y)
print(classification_report(valid_y, hgbc_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [147]:
valid_acc = cross_val_score(hgbc_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(hgbc_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7336, ROC AUC: 0.8195


In [148]:
hgbc_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], hgbc_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], hgbc_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.7586, ROC AUC: 0.8497


- AdaBoostClassifier

In [149]:
# params = {
#     "n_estimators": {"type": "int", "min": 200, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.1, "max": 0.25},
#     "algorithm": {"type": "categorical", "values": ["SAMME.R"]}
# }

# ada_tuner = HyperparameterTuner(AdaBoostClassifier, params, train_x, train_y)
# ada_tuner.optimize(50)
# ada_tuner.best_params(), ada_tuner.best_score()

In [150]:
params = {
    "n_estimators": 358,
    "learning_rate": 0.13883883597100793,
    "algorithm": "SAMME.R",
}


ada_tuned = AdaBoostClassifier(**params)
ada_tuned.fit(train_x, train_y)
print(classification_report(valid_y, ada_tuned.predict(valid_x)))



              precision    recall  f1-score   support

           0       0.72      0.75      0.74      1060
           1       0.74      0.71      0.73      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114



In [151]:
valid_acc = cross_val_score(ada_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(ada_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.6674, ROC AUC: 0.7350


In [152]:
ada_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], ada_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], ada_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")



Accuracy: 0.6906, ROC AUC: 0.7462


- SVC

In [153]:
# params = {
#     "C": {"type": "float", "min": 0.1, "max": 0.5},
#     "kernel": {"type": "categorical", "values": ["linear"]},
#     "degree": {"type": "int", "min": 3, "max": 5},
#     "gamma": {"type": "float", "min": 0.4, "max": 0.9},
#     "coef0": {"type": "float", "min": 1.5, "max": 4.0},
#     "class_weight": {"type": "categorical", "values": [None]}
# }

# svc_tuner = HyperparameterTuner(SVC, params, train_x, train_y)
# svc_tuner.optimize(100)
# svc_tuner.best_params(), svc_tuner.best_score()

In [154]:
params = {
    "C": 0.21950805677161292,
    "kernel": "linear",
    "degree": 4,
    "gamma": 0.671045772731431,
    "coef0": 2.7929809033044726,
    "class_weight": None,
}

svc_tuned = SVC(**params, probability=True)
svc_tuned.fit(train_x, train_y)
print(classification_report(valid_y, svc_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.72      0.63      0.67      1060
           1       0.67      0.75      0.71      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



In [155]:
valid_acc = cross_val_score(svc_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(svc_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.6659, ROC AUC: 0.7356


In [156]:
svc_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], svc_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], svc_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.6571, ROC AUC: 0.7199


- XGBClassifier

In [157]:
# params = {
#     "n_estimators": {"type": "int", "min": 300, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.005, "max": 0.02},
#     "max_depth": {"type": "int", "min": 4, "max": 6},
#     "min_child_weight": {"type": "int", "min": 2, "max": 4},
#     "gamma": {"type": "float", "min": 0.1, "max": 0.4},
#     "subsample": {"type": "float", "min": 0.8, "max": 1.0},
#     "colsample_bytree": {"type": "float", "min": 0.9, "max": 1.0},
#     "reg_alpha": {"type": "float", "min": 0.05, "max": 0.2},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 4.5}
# }

# sgb_tuner = HyperparameterTuner(XGBClassifier, params, train_x, train_y)
# sgb_tuner.optimize(100)
# sgb_tuner.best_params(), sgb_tuner.best_score()

In [158]:
params = {
    "n_estimators": 337,
    "learning_rate": 0.015272630148352066,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.24988522273215766,
    "subsample": 0.9639840429354903,
    "colsample_bytree": 0.985608479043216,
    "reg_alpha": 0.1856156681311941,
    "reg_lambda": 3.4637470458659014,
}


xgb_tuned = XGBClassifier(**params)
xgb_tuned.fit(train_x, train_y)
print(classification_report(valid_y, xgb_tuned.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78      1060
           1       0.78      0.80      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114



In [159]:
valid_acc = cross_val_score(xgb_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(xgb_tuned, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7226, ROC AUC: 0.8065


In [160]:
xgb_tuned.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], xgb_tuned.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], xgb_tuned.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.7371, ROC AUC: 0.8190


### category 형식 컬럼이 포함된 데이터셋

In [161]:
cat_features = cat_train_x.select_dtypes("category").columns.tolist()
cat_features

['league',
 'teamname',
 'opp_teamname',
 'ban1',
 'ban2',
 'ban3',
 'ban4',
 'ban5',
 'pick1',
 'pick2',
 'pick3',
 'pick4',
 'pick5']

In [162]:
# params = {
#     "iterations": {"type": "int", "min": 300, "max": 600},
#     "learning_rate": {"type": "float", "min": 0.15, "max": 0.3},
#     "depth": {"type": "int", "min": 8, "max": 12},
#     "l2_leaf_reg": {"type": "float", "min": 6.0, "max": 10.0},
#     "min_child_samples": {"type": "int", "min": 8, "max": 16},
#     "max_bin": {"type": "int", "min": 300, "max": 400},
#     "verbose": {"type": "int", "min": 100, "max": 100}
# }

# cat_tuner = HyperparameterTuner(CatBoostClassifier, params, cat_train_x, cat_train_y, cat_features)
# cat_tuner.optimize(20)
# cat_tuner.best_params(), cat_tuner.best_score()

In [163]:
params = {
    "iterations": 413,
    "learning_rate": 0.24110432469185597,
    "depth": 10,
    "l2_leaf_reg": 8.905869555950142,
    "min_child_samples": 12,
    "max_bin": 342,
    "verbose": 100,
}

cat_tuned = CatBoostClassifier(**params, cat_features=cat_features)
cat_tuned.fit(cat_train_x, cat_train_y)
print(classification_report(cat_valid_y, cat_tuned.predict(cat_valid_x)))

0:	learn: 0.6079555	total: 226ms	remaining: 1m 33s
100:	learn: 0.1385176	total: 26.5s	remaining: 1m 21s
200:	learn: 0.0588177	total: 52.8s	remaining: 55.7s
300:	learn: 0.0327123	total: 1m 18s	remaining: 29.4s
400:	learn: 0.0213888	total: 1m 46s	remaining: 3.19s
412:	learn: 0.0205659	total: 1m 49s	remaining: 0us
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1060
           1       0.79      0.77      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [164]:
valid_acc = cross_val_score(cat_tuned, cat_train_x, cat_train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(cat_tuned, cat_train_x, cat_train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7070, ROC AUC: 0.7889


In [165]:
cat_tuned.fit(pd.concat([cat_train_x, cat_valid_x]), pd.concat([cat_train_y, cat_valid_y]))
test_acc = accuracy_score(teams_test_target["result"], cat_tuned.predict(cat_test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], cat_tuned.predict_proba(cat_test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

0:	learn: 0.6275820	total: 256ms	remaining: 1m 45s
100:	learn: 0.1731344	total: 27.5s	remaining: 1m 25s
200:	learn: 0.0710989	total: 56.1s	remaining: 59.2s
300:	learn: 0.0412594	total: 1m 24s	remaining: 31.3s
400:	learn: 0.0271566	total: 1m 51s	remaining: 3.35s
412:	learn: 0.0259547	total: 1m 55s	remaining: 0us
Accuracy: 0.7207, ROC AUC: 0.8029


In [166]:
# tuned_models = {
#     'lr': lr_final,
#     'lgbm': lgbm_final,
#     'rf': rf_final,
#     'hgbc': hgbc_final,
#     'ada': ada_final,
#     'svc': svc_final,
#     'xgb': xgb_final,
#     'cat': cat_final
# }

# with init_wandb_run("feature_comparison") as run:
#     run.config.update({
#         "features": {
#             "tuned": {
#                 "count": train_ft.shape[1],
#                 "list": train_ft.columns.tolist()
#             }
#         },
#         "models": list(tuned_models.keys())
#     })
    
#     log_model_performance_by_features(
#         "tuned",
#         tuned_models,
#         train_x,
#         valid_x,
#         test_ft,
#         train_y,
#         valid_y,
#         teams_test_target["result"],
#         cat_train_x,
#         cat_valid_x,
#         cat_test_ft
#     )

# 앙상블

- StackingClassifier

In [167]:
from sklearn.ensemble import StackingClassifier

estimators = [
    # ("lr", lr_tuned),
    ("lgbm", lgbm_tuned),
    ("rf", rf_tuned),
    ("hgbc", hgbc_tuned),
    # ("ada", ada_tuned),
    # ("svc", svc_tuned),
    ("xgb", xgb_tuned),
]

final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(estimators, final_estimator)
stacking_clf.fit(train_x, train_y)
print(classification_report(valid_y, stacking_clf.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [168]:
valid_acc = cross_val_score(stacking_clf, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
valid_roc_auc = cross_val_score(stacking_clf, train_x, train_y, cv=TimeSeriesSplit(5), scoring="roc_auc", n_jobs=-1)
print(f"Accuracy: {np.mean(valid_acc):.4f}, ROC AUC: {np.mean(valid_roc_auc):.4f}")

Accuracy: 0.7355, ROC AUC: 0.8198


In [169]:
stacking_clf.fit(pd.concat([train_x, valid_x]), pd.concat([train_y, valid_y]))
test_acc = accuracy_score(teams_test_target["result"], stacking_clf.predict(test_ft))
test_roc_auc = roc_auc_score(teams_test_target["result"], stacking_clf.predict_proba(test_ft)[:, 1])
print(f"Accuracy: {np.mean(test_acc):.4f}, ROC AUC: {np.mean(test_roc_auc):.4f}")

Accuracy: 0.7539, ROC AUC: 0.8505


- StackingClassifier + CatBoostClassifier

- 검증셋

In [170]:
stacking_clf.fit(train_x, train_y)
cat_tuned.fit(cat_train_x, cat_train_y)

stacking_test_proba = stacking_clf.predict_proba(valid_x)[:, 1]
cat_test_proba = cat_tuned.predict_proba(cat_valid_x)[:, 1]

combined_proba = np.mean([stacking_test_proba, cat_test_proba], axis=0)
acc = accuracy_score(valid_y, (combined_proba >= 0.5).astype(int))
roc_auc = roc_auc_score(valid_y, combined_proba)
print(f"Accuracy: {acc:.4f}, ROC AUC: {roc_auc:.4f}")

0:	learn: 0.6079555	total: 207ms	remaining: 1m 25s
100:	learn: 0.1385176	total: 23.4s	remaining: 1m 12s
200:	learn: 0.0588177	total: 46.9s	remaining: 49.5s
300:	learn: 0.0327123	total: 1m 10s	remaining: 26.2s
400:	learn: 0.0213888	total: 1m 34s	remaining: 2.82s
412:	learn: 0.0205659	total: 1m 36s	remaining: 0us
Accuracy: 0.7961, ROC AUC: 0.8907


In [172]:
pred_valid_proba = pd.DataFrame({'gameid': valid_df['gameid'], "teamname": valid_df['teamname'], 'win_pred': combined_proba})
pred_valid_proba.to_csv('output/ensemble_pred_valid.csv', index=False)

- 테스트셋

In [174]:
train_ft = train_ft[train_x.columns]
test_ft = test_ft[train_x.columns]
cat_train_ft = cat_train_ft[cat_train_x.columns]
cat_test_ft = cat_test_ft[cat_train_x.columns]

stacking_clf.fit(train_ft, teams_train_target["result"])
cat_tuned.fit(cat_train_ft, teams_train_target["result"])

stacking_test_proba = stacking_clf.predict_proba(test_ft)[:, 1]
cat_test_proba = cat_tuned.predict_proba(cat_test_ft)[:, 1]

final_test_proba = np.mean([stacking_test_proba, cat_test_proba], axis=0)
final_test_pred = (final_test_proba >= 0.5).astype(int)

print(classification_report(teams_test_target["result"], final_test_pred))

0:	learn: 0.6275820	total: 312ms	remaining: 2m 8s
100:	learn: 0.1691727	total: 36s	remaining: 1m 51s
200:	learn: 0.0750971	total: 1m 10s	remaining: 1m 14s
300:	learn: 0.0393156	total: 1m 45s	remaining: 39.2s
400:	learn: 0.0253351	total: 2m 20s	remaining: 4.2s
412:	learn: 0.0243252	total: 2m 24s	remaining: 0us
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1160
           1       0.76      0.74      0.75      1164

    accuracy                           0.76      2324
   macro avg       0.76      0.76      0.76      2324
weighted avg       0.76      0.76      0.76      2324



In [176]:
test_acc = accuracy_score(teams_test_target["result"], final_test_pred)
test_roc_auc = roc_auc_score(teams_test_target["result"], final_test_proba)
print(f"Accuracy: {test_acc:.4f}, ROC AUC: {test_roc_auc:.4f}")

Accuracy: 0.7556, ROC AUC: 0.8422


In [178]:
pred_test_proba = pd.DataFrame({'gameid': teams_test_target['gameid'], "teamname": teams_test_target['teamname'], 'win_pred': final_test_proba})
pred_test_proba.to_csv('output/ensemble_pred_test.csv', index=False)

# 최종 예측 모델 생성

In [184]:
import joblib

In [185]:
train_data = pd.concat([train_ft, test_ft], ignore_index=True)
cat_train_data = pd.concat([cat_train_ft, cat_test_ft], ignore_index=True)
target_data = pd.concat([teams_train_target, teams_test_target], ignore_index=True)

stacking_clf.fit(train_data, target_data["result"])
cat_tuned.fit(cat_train_data, target_data["result"])

joblib.dump(stacking_clf, "output/stacking_0107.pkl")
cat_tuned.save_model("output/cat_0107.cbm")

0:	learn: 0.6154152	total: 291ms	remaining: 1m 59s
100:	learn: 0.1750981	total: 35.4s	remaining: 1m 49s
200:	learn: 0.0849220	total: 1m 9s	remaining: 1m 13s
300:	learn: 0.0483373	total: 1m 43s	remaining: 38.5s
400:	learn: 0.0301632	total: 2m 16s	remaining: 4.09s
412:	learn: 0.0288545	total: 2m 20s	remaining: 0us


In [186]:
import json

with open("output/cat_features.json", "w") as f:
    json.dump(cat_features, f)