In [87]:
from pathlib import Path

import numpy as np
import pandas as pd

DATA_PATH = "LoLesports_data/"
SEED = 42

# Feature tables for the train and test splits.
teams_train, teams_test = (
    pd.read_csv(f"{DATA_PATH}teams_train.csv"),
    pd.read_csv(f"{DATA_PATH}teams_test.csv"),
)

# Target tables for the same splits (their "gameid" / "result" columns are used below).
teams_train_target, teams_test_target = (
    pd.read_csv(f"{DATA_PATH}teams_train_target.csv"),
    pd.read_csv(f"{DATA_PATH}teams_test_target.csv"),
)

# Show all four shapes as a quick sanity check.
tuple(
    frame.shape
    for frame in (teams_train, teams_test, teams_train_target, teams_test_target)
)

((9913, 111), (2324, 111), (9913, 3), (2324, 3))

# 컬럼 추가

## 상대 팀 추가

In [88]:
# Reverse the team names inside each game so every row receives the name found
# on the other row of that game (assumes two rows per gameid — TODO confirm).
temp_opp_teams = (
    teams_train.groupby("gameid")["teamname"]
    .transform(lambda names: names.iloc[::-1].values)
    .to_frame("opp_teamname")
)
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)

temp_opp_teams = (
    teams_test.groupby("gameid")["teamname"]
    .transform(lambda names: names.iloc[::-1].values)
    .to_frame("opp_teamname")
)
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

## 날짜 추가

In [89]:
# Parse "date" once per frame, then expose its calendar parts as plain columns.
for _frame in (teams_train, teams_test):
    _frame["date"] = pd.to_datetime(_frame["date"])
    _frame["year"] = _frame["date"].dt.year
    _frame["month"] = _frame["date"].dt.month
    _frame["day"] = _frame["date"].dt.day

## 데이터 타입 변경

In [90]:
# Columns treated as categorical (league/split, both team names, bans and picks).
cols = [
    "league", "split", "teamname", "opp_teamname",
    "ban1", "ban2", "ban3", "ban4", "ban5",
    "pick1", "pick2", "pick3", "pick4", "pick5",
]

teams_train = teams_train.astype({name: "category" for name in cols})
teams_test = teams_test.astype({name: "category" for name in cols})

# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [91]:
# Columns taken straight from the raw tables as the starting feature set.
# Later cells append engineered feature names to this same list.
pre_game_features = [
    "gameid", "patch", "side", "league",
    "teamname", "opp_teamname",
    "ban1", "ban2", "ban3", "ban4", "ban5",
    "pick1", "pick2", "pick3", "pick4", "pick5",
    "year", "month", "day",
]

train_ft = teams_train.loc[:, pre_game_features]
test_ft = teams_test.loc[:, pre_game_features]

train_ft.shape, test_ft.shape

((9913, 19), (2324, 19))

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [92]:
# Columns that are averaged over each team's previous games in the next cell.
# The order is significant: engineered feature names are generated in this order.
stats_columns = [
    # outcome and pace
    "result", "gamelength",
    # combat
    "kills", "deaths", "assists", "firstblood", "team kpm", "ckpm",
    # objectives
    "firstdragon", "firstherald", "void_grubs", "firstbaron",
    # structures
    "firsttower", "towers", "firstmidtower", "firsttothreetowers",
    "turretplates", "inhibitors",
    # damage
    "damagetochampions", "dpm", "damagetakenperminute", "damagemitigatedperminute",
    # vision
    "wardsplaced", "wpm", "wardskilled", "wcpm",
    "controlwardsbought", "visionscore", "vspm",
]

In [93]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)
temp_test = teams_test.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f'recent10_{col}': recent10_train})
    
    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(['teamname', 'year', 'month', 'day'])
    recent10_combined = combined_data.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f'recent10_{col}': recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f'recent10_{col}'].values
    test_ft = test_ft.assign(**{f'recent10_{col}': recent10_test})
    
    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    train_ft = train_ft.assign(**{f'opp_recent10_{col}': merged_train[f'recent10_{col}_opp']})
    
    merged_test = test_ft.merge(
        combined_data[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    test_ft = test_ft.assign(**{f'opp_recent10_{col}': merged_test[f'recent10_{col}_opp']})
    
    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == 'result' else 0
    train_ft = train_ft.assign(**{
        f'recent10_{col}': train_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': train_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    test_ft = test_ft.assign(**{
        f'recent10_{col}': test_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': test_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    
    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f'recent10_{col}', f'opp_recent10_{col}'])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 77), (2324, 77))

### 상대 전적

In [94]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 78), (2324, 78))

### 각 팀의 리그별 승률

In [95]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values('date')

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 각 패치 버전 사이드별 승률

In [52]:
# # 패치 버전 사이드별 승률 기록을 저장할 딕셔너리
# patch_side_records = {}
# patch_side_winrates = []

# # 날짜순으로 정렬
# combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
# combined_data = combined_data.sort_values('date')

# # 패치/사이드별 승률 계산
# for idx, match in combined_data.iterrows():
#     patch = match['patch']
#     side = match['side']
#     key = (patch, side)
    
#     # 현재 시점까지의 패치/사이드별 승률 계산
#     if key not in patch_side_records:
#         patch_side_records[key] = {'wins': 0, 'total': 0}
#         patch_side_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
#     else:
#         record = patch_side_records[key]
#         patch_side_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
#     # 현재 경기 결과 반영
#     result = match['result']
#     patch_side_records[key]['total'] += 1
#     if result == 1:
#         patch_side_records[key]['wins'] += 1

# # 계산된 승률을 훈련/테스트 데이터에 할당
# train_ft['patch_side_winrate'] = patch_side_winrates[:len(teams_train)]
# test_ft['patch_side_winrate'] = patch_side_winrates[len(teams_train):]

# # 특성 리스트에 patch_side_winrate 추가
# pre_game_features.append('patch_side_winrate')

# # 입력 데이터 업데이트
# train_ft = train_ft[pre_game_features]
# test_ft = test_ft[pre_game_features]

# train_ft.shape, test_ft.shape

### 픽 챔피언 지표

In [53]:
# df = teams_train.copy()
# df = df.sort_values(["teamname", "date", "gameid"])  # 시계열 정렬

# for slot in ["pick1", "pick2", "pick3", "pick4", "pick5"]:
#     # 1) 챔피언 컬럼 만들기
#     df_pick = df[["gameid", "teamname", "date", "result", slot]].copy()
#     df_pick.rename(columns={slot: "champion"}, inplace=True)

#     # 2) 챔피언을 category로 바꾸면 메모리 절약에 도움
#     df_pick["champion"] = df_pick["champion"].astype("category")

#     # 3) groupby + cumsum + shift(1)로 "직전까지" 누적
#     df_pick["pick_ind"] = 1
#     df_pick["win_ind"] = (df_pick["result"] == 1).astype(int)

#     df_pick["cum_pick_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["pick_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["win_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_rate"] = (
#         df_pick["cum_win_count"] / df_pick["cum_pick_count"]
#     ).fillna(0)

#     # 4) 필요한 컬럼만 남겨서, 컬럼 이름으로 바꾸기
#     df_pick = df_pick[
#         ["gameid", "teamname", "date", "champion", "cum_pick_count", "cum_win_rate"]
#     ].copy()

#     df_pick.rename(
#         columns={
#             "champion": f"{slot}_champion",  # 구분용
#             "cum_pick_count": f"{slot}_cum_pick_count",
#             "cum_win_rate": f"{slot}_cum_win_rate",
#         },
#         inplace=True,
#     )
    
#     # 5) 원본 df와 merge
#     df = pd.merge(
#         df,
#         df_pick[
#             [
#                 "gameid",
#                 "teamname",
#                 "date",
#                 f"{slot}_champion",
#                 f"{slot}_cum_pick_count",
#                 f"{slot}_cum_win_rate",
#             ]
#         ],
#         left_on=["gameid", "teamname", "date", f"{slot}"],  # 조인 키
#         right_on=["gameid", "teamname", "date", f"{slot}_champion"],
#         how="left",
#     )

# 인코딩

In [54]:
# Encode the side as a number: Blue -> 0, Red -> 1 (any other value becomes NaN).
train_ft = train_ft.assign(side=train_ft["side"].map({"Blue": 0, "Red": 1}))
test_ft = test_ft.assign(side=test_ft["side"].map({"Blue": 0, "Red": 1}))

In [55]:
# Snapshot the frames before preprocess() runs, so these copies keep their
# category-dtype columns (used later as CatBoost cat_features).
cat_train_ft, cat_test_ft = train_ft.copy(), test_ft.copy()

In [56]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def preprocess(train_ft, test_ft):
    """Encode the categorical columns of the train/test feature frames as numbers.

    * ban1..ban5 and pick1..pick5 are label-encoded with one champion
      vocabulary built from both frames.
    * league is one-hot encoded (fitted on the train frame); the original
      column is dropped and `league_<name>` columns are appended.
    * teamname and opp_teamname are label-encoded with one shared vocabulary
      built from both columns of both frames.

    The inputs are not modified; encoded copies are returned.

    Args:
        train_ft: training feature frame containing the columns listed above.
        test_ft: test feature frame with the same columns.

    Returns:
        (train_ft, test_ft) encoded copies.
    """
    # Work on copies so the caller's frames are left untouched.
    train_ft = train_ft.copy()
    test_ft = test_ft.copy()

    # --- champions: label encoding -------------------------------------------
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5']

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().dropna().unique()

    le = LabelEncoder()
    le.fit(champions)

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])

    # --- league: one-hot encoding ----------------------------------------------
    # handle_unknown="ignore" encodes a league seen only in the test frame as
    # all zeros instead of raising.
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(train_ft[["league"]])
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]

    def _one_hot_league(frame):
        """Replace `league` with its one-hot columns, appended at the end."""
        encoded = pd.DataFrame(
            encoder.transform(frame[["league"]]).toarray(),
            columns=league_cols,
            index=frame.index,  # share the index so concat cannot misalign rows
        )
        return pd.concat([frame.drop(columns="league"), encoded], axis=1)

    train_ft = _one_hot_league(train_ft)
    test_ft = _one_hot_league(test_ft)

    # --- team names: label encoding ---------------------------------------------
    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    for frame in (train_ft, test_ft):
        frame["teamname"] = le_team.transform(frame["teamname"])
        frame["opp_teamname"] = le_team.transform(frame["opp_teamname"])

    return train_ft, test_ft

# Encode the numeric-only dataset; cat_train_ft / cat_test_ft were copied earlier and are not passed here.
train_ft, test_ft = preprocess(train_ft, test_ft)

In [57]:
# List the columns that are still object dtype after encoding.
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [58]:
from sklearn.preprocessing import MinMaxScaler

# Single module-level scaler used by scale(); every call refits it on the
# train frame that is passed in.
scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    """Min-max scale the numeric columns of both frames in place.

    The shared `scaler` is fitted on the numeric columns of `train_ft` and then
    applied to the numeric columns of `test_ft`.  Both frames are modified and
    returned.
    """
    train_numeric = train_ft.select_dtypes("number").columns
    train_ft[train_numeric] = scaler.fit_transform(train_ft[train_numeric])

    test_numeric = test_ft.select_dtypes("number").columns
    test_ft[test_numeric] = scaler.transform(test_ft[test_numeric])

    return train_ft, test_ft


# Scale the numeric-only dataset, then the category-keeping dataset.
# The second call refits the shared scaler, so after this cell `scaler` holds
# the fit for cat_train_ft, not for train_ft.
train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

train_ft.shape, test_ft.shape, cat_train_ft.shape, cat_test_ft.shape

((9913, 86), (2324, 86), (9913, 79), (2324, 79))

# 모델 학습 및 검증

- 하이퍼파라미터 튜닝 클래스

In [59]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    """Optuna hyperparameter search scored by time-series cross-validation.

    Args:
        model: estimator class (not an instance); instantiated as `model(**params)`.
        params: search space, mapping each parameter name to a dict with
            `"type"` equal to `"int"` or `"float"` (plus `"min"` / `"max"`),
            or `"categorical"` (plus `"values"`).
        train: feature matrix passed to `cross_val_score`.
        target: labels passed to `cross_val_score`.
        cat_features: forwarded as `cat_features=` when `model` is
            `CatBoostClassifier`; ignored for every other model.
        seed: optional seed for the Optuna sampler so a search can be repeated.
            `None` (default) keeps Optuna's default, unseeded sampler.
    """

    def __init__(self, model, params, train, target, cat_features=None, seed=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
        self.study = optuna.create_study(direction="maximize", sampler=sampler)

    def objective(self, trial):
        """Sample one parameter set and return its mean CV accuracy.

        Raises:
            ValueError: if a search-space entry has an unsupported `"type"`.
        """
        params = {}

        for param_name, param_range in self.params.items():
            param_type = param_range["type"]
            if param_type == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_type == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_type == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
            else:
                # Fail loudly instead of silently searching without this parameter.
                raise ValueError(
                    f"Unsupported search-space type {param_type!r} for parameter {param_name!r}"
                )

        # Matched by class name so this class does not require CatBoostClassifier
        # to be imported before it is defined or used with other models.
        if getattr(self.model, "__name__", None) == "CatBoostClassifier":
            model = self.model(**params, cat_features=self.cat_features)
        else:
            model = self.model(**params)

        # cross_val_score clones the estimator and fits it on every fold, so a
        # separate fit on the full data beforehand would only cost time.
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        """Run `n_trials` Optuna trials of `objective`."""
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        """Return the parameter set of the best trial so far."""
        return self.study.best_params

    def best_score(self):
        """Return the objective value (mean CV accuracy) of the best trial so far."""
        return self.study.best_value

### 모든 컬럼 형식이 number인 데이터셋

In [60]:
# "league" no longer exists as a column after preprocess(); guard the removal so
# re-running this cell does not raise ValueError.
# NOTE(review): the one-hot `league_*` columns created by preprocess() are not in
# pre_game_features, so they are not part of train_x / valid_x — confirm intended.
if "league" in pre_game_features:
    pre_game_features.remove("league")

# Hold out the games at or above the 80th percentile of `patch` for validation.
cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

# gameid is only the split key, not a model input.
model_features = [name for name in pre_game_features if name != "gameid"]

# .loc returns new frames, so no in-place drop on a slice is needed.
train_x = train_ft.loc[train_ft["gameid"].isin(train_games), model_features]
valid_x = train_ft.loc[train_ft["gameid"].isin(valid_games), model_features]

train_y = teams_train_target.loc[teams_train_target["gameid"].isin(train_games), "result"]
valid_y = teams_train_target.loc[teams_train_target["gameid"].isin(valid_games), "result"]

train_x.shape, valid_x.shape

((7799, 77), (2114, 77))

In [61]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [62]:
# Default-parameter baselines, all seeded from SEED.
models = [
    LogisticRegression(random_state=SEED),
    LGBMClassifier(random_state=SEED, n_jobs=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0),
]

# Report mean and standard deviation of accuracy over 5 time-ordered folds.
for model in models:
    scores = cross_val_score(
        estimator=model,
        X=train_x,
        y=train_y,
        cv=TimeSeriesSplit(n_splits=5),
        scoring="accuracy",
        n_jobs=-1,
    )
    print(f"{model.__class__.__name__} : {np.mean(scores)}, {np.std(scores)}")

LogisticRegression : 0.6672825250192456, 0.024409093443106985
LGBMClassifier : 0.7128560431100845, 0.045051417100336576
RandomForestClassifier : 0.6574287913779832, 0.03637606863793496
HistGradientBoostingClassifier : 0.7148575827559661, 0.05088430643347407
AdaBoostClassifier : 0.6578906851424172, 0.03447087991368207
SVC : 0.6568129330254042, 0.024746664599418874
XGBClassifier : 0.7086989992301771, 0.03751368595997046
CatBoostClassifier : 0.7026943802925327, 0.03815527050932491


- LogisticRegression

In [63]:
# params = {
#     "C": {"type": "float", "min": 0.01, "max": 10},
#     "penalty": {"type": "categorical", "values": ["l1", "l2"]},
#     "solver": {"type": "categorical", "values": ["liblinear", "saga"]},
#     "max_iter": {"type": "int", "min": 100, "max": 2000},
# }

# lr_vt_tuner = HyperparameterTuner(LogisticRegression, params, train_x, train_y)
# lr_vt_tuner.optimize(100)
# lr_vt_tuner.best_params(), lr_vt_tuner.best_score()

In [64]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "C": 0.38414964961856957,
    "penalty": "l1",
    "solver": "liblinear",
    "max_iter": 1903
}

# Seeded like the baseline models so the fit is repeatable.
lr_final = LogisticRegression(**params, random_state=SEED)
lr_final.fit(train_x, train_y)
print(classification_report(valid_y, lr_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.71      0.64      0.67      1060
           1       0.67      0.73      0.70      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.68      2114
weighted avg       0.69      0.69      0.68      2114



- LightGBM

In [65]:
# params = {
#     "n_estimators": {"type": "int", "min": 100, "max": 300},
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1}, 
#     "max_depth": {"type": "int", "min": 8, "max": 12},
#     "num_leaves": {"type": "int", "min": 100, "max": 150},
#     "min_child_samples": {"type": "int", "min": 5, "max": 15},
#     "subsample": {"type": "float", "min": 0.5, "max": 0.7},
#     "colsample_bytree": {"type": "float", "min": 0.4, "max": 0.6},
#     "reg_alpha": {"type": "float", "min": 0.001, "max": 0.1},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 6.0},
#     "verbose": {"type": "int", "min": -1, "max": -1}
# }

# lgbm_tuner = HyperparameterTuner(LGBMClassifier, params, train_x, train_y)
# lgbm_tuner.optimize(100)
# lgbm_tuner.best_params(), lgbm_tuner.best_score()

In [66]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "n_estimators": 285,
    "learning_rate": 0.023007542937157222,
    "max_depth": 9,
    "num_leaves": 101,
    "min_child_samples": 9,
    "subsample": 0.6754653255644233,
    "colsample_bytree": 0.5153479009794544,
    "reg_alpha": 0.07512515626736012,
    "reg_lambda": 3.3370499751525755,
    "verbose": -1,
}

# Seeded like the baseline models so the column subsampling is tied to SEED.
lgbm_final = LGBMClassifier(**params, random_state=SEED)
lgbm_final.fit(train_x, train_y)
print(classification_report(valid_y, lgbm_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



- RandomForestClassifier

In [67]:
# params = {
#     "n_estimators": {"type": "int", "min": 800, "max": 1100},
#     "max_depth": {"type": "int", "min": 15, "max": 21},
#     "min_samples_split": {"type": "int", "min": 15, "max": 23},
#     "min_samples_leaf": {"type": "int", "min": 7, "max": 11},
#     "max_features": {"type": "float", "min": 0.7, "max": 0.85},
#     "bootstrap": {"type": "categorical", "values": [False]},
#     "class_weight": {"type": "categorical", "values": ["balanced"]}
# }

# rf_tuner = HyperparameterTuner(RandomForestClassifier, params, train_x, train_y)
# rf_tuner.optimize(20)
# rf_tuner.best_params(), rf_tuner.best_score()

In [68]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "n_estimators": 954,
    "max_depth": 18,
    "min_samples_split": 19,
    "min_samples_leaf": 9,
    "max_features": 0.7814902230628112,
    "bootstrap": False,
    "class_weight": "balanced",
}

# max_features < 1 makes each tree sample features at random; seed it so the
# reported metrics can be reproduced.
rf_final = RandomForestClassifier(**params, random_state=SEED)
rf_final.fit(train_x, train_y)
print(classification_report(valid_y, rf_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1060
           1       0.76      0.81      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



- HistGradientBoostingClassifier

In [69]:
# params = {
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1},
#     "max_depth": {"type": "int", "min": 8, "max": 15},
#     "max_iter": {"type": "int", "min": 100, "max": 300},
#     "min_samples_leaf": {"type": "int", "min": 5, "max": 15},
#     "l2_regularization": {"type": "float", "min": 0.5, "max": 3.0},
#     "max_leaf_nodes": {"type": "int", "min": 40, "max": 90}
# }

# hgbc_tuner = HyperparameterTuner(HistGradientBoostingClassifier, params, train_x, train_y)
# hgbc_tuner.optimize(100)
# hgbc_tuner.best_params(), hgbc_tuner.best_score()

In [70]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "learning_rate": 0.022119818280047138,
    "max_depth": 12,
    "max_iter": 276,
    "min_samples_leaf": 5,
    "l2_regularization": 0.9584267642328501,
    "max_leaf_nodes": 44,
}

# Seeded like the baseline models for consistency.
hgbc_final = HistGradientBoostingClassifier(**params, random_state=SEED)
hgbc_final.fit(train_x, train_y)
print(classification_report(valid_y, hgbc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.78      0.79      1060
           1       0.78      0.79      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114



- AdaBoostClassifier

In [71]:
# params = {
#     "n_estimators": {"type": "int", "min": 200, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.1, "max": 0.25},
#     "algorithm": {"type": "categorical", "values": ["SAMME.R"]}
# }

# ada_tuner = HyperparameterTuner(AdaBoostClassifier, params, train_x, train_y)
# ada_tuner.optimize(50)
# ada_tuner.best_params(), ada_tuner.best_score()

In [72]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "n_estimators": 358,
    "learning_rate": 0.13883883597100793,
    "algorithm": "SAMME.R",
}

# Seeded like the baseline models so the fit is repeatable.
ada_final = AdaBoostClassifier(**params, random_state=SEED)
ada_final.fit(train_x, train_y)
print(classification_report(valid_y, ada_final.predict(valid_x)))



              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1060
           1       0.73      0.73      0.73      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114



- SVC

In [73]:
# params = {
#     "C": {"type": "float", "min": 0.1, "max": 0.5},
#     "kernel": {"type": "categorical", "values": ["linear"]},
#     "degree": {"type": "int", "min": 3, "max": 5},
#     "gamma": {"type": "float", "min": 0.4, "max": 0.9},
#     "coef0": {"type": "float", "min": 1.5, "max": 4.0},
#     "class_weight": {"type": "categorical", "values": [None]}
# }

# svc_tuner = HyperparameterTuner(SVC, params, train_x, train_y)
# svc_tuner.optimize(100)
# svc_tuner.best_params(), svc_tuner.best_score()

In [74]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
# NOTE(review): with kernel="linear", the degree / gamma / coef0 entries are
# likely unused by SVC — confirm before tuning them further.
params = {
    "C": 0.21950805677161292,
    "kernel": "linear",
    "degree": 4,
    "gamma": 0.671045772731431,
    "coef0": 2.7929809033044726,
    "class_weight": None,
}

# Seeded like the baseline models for consistency.
svc_final = SVC(**params, random_state=SEED)
svc_final.fit(train_x, train_y)
print(classification_report(valid_y, svc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.70      0.63      0.67      1060
           1       0.66      0.73      0.69      1054

    accuracy                           0.68      2114
   macro avg       0.68      0.68      0.68      2114
weighted avg       0.68      0.68      0.68      2114



- XGBClassifier

In [75]:
# params = {
#     "n_estimators": {"type": "int", "min": 300, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.005, "max": 0.02},
#     "max_depth": {"type": "int", "min": 4, "max": 6},
#     "min_child_weight": {"type": "int", "min": 2, "max": 4},
#     "gamma": {"type": "float", "min": 0.1, "max": 0.4},
#     "subsample": {"type": "float", "min": 0.8, "max": 1.0},
#     "colsample_bytree": {"type": "float", "min": 0.9, "max": 1.0},
#     "reg_alpha": {"type": "float", "min": 0.05, "max": 0.2},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 4.5}
# }

# sgb_tuner = HyperparameterTuner(XGBClassifier, params, train_x, train_y)
# sgb_tuner.optimize(100)
# sgb_tuner.best_params(), sgb_tuner.best_score()

In [76]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "n_estimators": 337,
    "learning_rate": 0.015272630148352066,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.24988522273215766,
    "subsample": 0.9639840429354903,
    "colsample_bytree": 0.985608479043216,
    "reg_alpha": 0.1856156681311941,
    "reg_lambda": 3.4637470458659014,
}

# Seeded like the baseline models so row/column subsampling is tied to SEED.
xgb_final = XGBClassifier(**params, random_state=SEED)
xgb_final.fit(train_x, train_y)
print(classification_report(valid_y, xgb_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1060
           1       0.77      0.80      0.79      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



### category 형식 컬럼이 포함된 데이터셋

In [77]:
# Same patch-based hold-out as for the numeric dataset, applied to the frames
# that keep their category columns.  Uses pre_game_features as it stands when
# this cell runs.
cutoff_patch = cat_train_ft["patch"].quantile(0.8)
train_games = cat_train_ft.loc[cat_train_ft["patch"] < cutoff_patch, "gameid"].unique()
valid_games = cat_train_ft.loc[cat_train_ft["patch"] >= cutoff_patch, "gameid"].unique()

in_train = cat_train_ft["gameid"].isin(train_games)
in_valid = cat_train_ft["gameid"].isin(valid_games)
cat_train_x = cat_train_ft[in_train][pre_game_features]
cat_valid_x = cat_train_ft[in_valid][pre_game_features]

cat_train_y = teams_train_target.loc[teams_train_target["gameid"].isin(train_games), "result"]
cat_valid_y = teams_train_target.loc[teams_train_target["gameid"].isin(valid_games), "result"]

# gameid is only the split key, not a model input.
cat_train_x.drop(columns=["gameid"], inplace=True)
cat_valid_x.drop(columns=["gameid"], inplace=True)

In [78]:
# Names of the category-dtype columns; passed to CatBoost as cat_features below.
cat_features = cat_train_x.select_dtypes("category").columns.tolist()

In [79]:
# params = {
#     "iterations": {"type": "int", "min": 300, "max": 600},
#     "learning_rate": {"type": "float", "min": 0.15, "max": 0.3},
#     "depth": {"type": "int", "min": 8, "max": 12},
#     "l2_leaf_reg": {"type": "float", "min": 6.0, "max": 10.0},
#     "min_child_samples": {"type": "int", "min": 8, "max": 16},
#     "max_bin": {"type": "int", "min": 300, "max": 400},
#     "verbose": {"type": "int", "min": 100, "max": 100}
# }

# cat_tuner = HyperparameterTuner(CatBoostClassifier, params, cat_train_x, cat_train_y, cat_features)
# cat_tuner.optimize(20)
# cat_tuner.best_params(), cat_tuner.best_score()

In [80]:
# Fixed hyperparameters (presumably the result of the commented-out search above).
params = {
    "iterations": 413,
    "learning_rate": 0.24110432469185597,
    "depth": 10,
    "l2_leaf_reg": 8.905869555950142,
    "min_child_samples": 12,
    "max_bin": 342,
    "verbose": 100,
}

# Seeded the same way as the CatBoost baseline (random_state=SEED).
cat_final = CatBoostClassifier(**params, cat_features=cat_features, random_state=SEED)
cat_final.fit(cat_train_x, cat_train_y)
print(classification_report(cat_valid_y, cat_final.predict(cat_valid_x)))

0:	learn: 0.6229969	total: 455ms	remaining: 3m 7s
100:	learn: 0.1265438	total: 26.3s	remaining: 1m 21s
200:	learn: 0.0531274	total: 52.6s	remaining: 55.5s
300:	learn: 0.0290406	total: 1m 18s	remaining: 29s
400:	learn: 0.0193965	total: 1m 42s	remaining: 3.06s
412:	learn: 0.0185556	total: 1m 45s	remaining: 0us
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1060
           1       0.78      0.75      0.76      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



# 앙상블

In [81]:
from sklearn.ensemble import StackingClassifier

# Base learners for the stack; lr / ada / svc are deliberately left out.
estimators = [
    ("lgbm", lgbm_final),
    ("rf", rf_final),
    ("hgbc", hgbc_final),
    ("xgb", xgb_final),
]

# A logistic regression combines the base learners' outputs.
final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)
stacking_clf.fit(train_x, train_y)

stacking_valid_pred = stacking_clf.predict(valid_x)
print(classification_report(valid_y, stacking_valid_pred))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      1060
           1       0.79      0.79      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114



In [82]:
# Class probabilities from both models on the validation rows.
stacking_proba = stacking_clf.predict_proba(valid_x)
cat_proba = cat_final.predict_proba(cat_valid_x)

# Equal-weight blend, then threshold the class-1 probability at 0.5.
final_proba = 0.5 * stacking_proba + 0.5 * cat_proba
win_proba = final_proba[:, 1]
final_pred = (win_proba >= 0.5).astype(int)

print(classification_report(valid_y, final_pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      1060
           1       0.79      0.78      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114



# 테스트 데이터 예측

In [83]:
# Restrict every frame to the columns the models were validated on.
numeric_columns = train_x.columns
categorical_columns = cat_train_x.columns
train_ft, test_ft = train_ft[numeric_columns], test_ft[numeric_columns]
cat_train_ft, cat_test_ft = cat_train_ft[categorical_columns], cat_test_ft[categorical_columns]

# Refit both models on the full training period.
full_train_y = teams_train_target["result"]
stacking_clf.fit(train_ft, full_train_y)
cat_final.fit(cat_train_ft, full_train_y)

# Equal-weight blend of the two probability outputs, thresholded at 0.5.
stacking_test_proba = stacking_clf.predict_proba(test_ft)
cat_test_proba = cat_final.predict_proba(cat_test_ft)
final_test_proba = 0.5 * stacking_test_proba + 0.5 * cat_test_proba
final_test_pred = (final_test_proba[:, 1] >= 0.5).astype(int)

print(classification_report(teams_test_target["result"], final_test_pred))

0:	learn: 0.6125006	total: 252ms	remaining: 1m 43s
100:	learn: 0.1604703	total: 27.9s	remaining: 1m 26s
200:	learn: 0.0659976	total: 55.7s	remaining: 58.7s
300:	learn: 0.0356571	total: 1m 24s	remaining: 31.3s
400:	learn: 0.0229581	total: 1m 53s	remaining: 3.39s
412:	learn: 0.0221277	total: 1m 57s	remaining: 0us
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      1160
           1       0.76      0.76      0.76      1164

    accuracy                           0.76      2324
   macro avg       0.76      0.76      0.76      2324
weighted avg       0.76      0.76      0.76      2324



# 최종 예측 모델 생성

In [84]:
import joblib

In [85]:
# Final models: refit on train + test rows together, then persist both.
train_data = pd.concat([train_ft, test_ft], ignore_index=True)
cat_train_data = pd.concat([cat_train_ft, cat_test_ft], ignore_index=True)
target_data = pd.concat([teams_train_target, teams_test_target], ignore_index=True)

stacking_clf.fit(train_data, target_data["result"])
cat_final.fit(cat_train_data, target_data["result"])

# joblib.dump does not create missing directories; make sure the target exists.
Path("output").mkdir(parents=True, exist_ok=True)

joblib.dump(stacking_clf, "output/model1.pkl")
joblib.dump(cat_final, "output/model2.pkl")

0:	learn: 0.6224647	total: 358ms	remaining: 2m 27s
100:	learn: 0.1767020	total: 37.8s	remaining: 1m 56s
200:	learn: 0.0788264	total: 1m 15s	remaining: 1m 19s
300:	learn: 0.0446790	total: 1m 52s	remaining: 41.9s
400:	learn: 0.0286631	total: 2m 29s	remaining: 4.48s
412:	learn: 0.0271961	total: 2m 34s	remaining: 0us


['output/model2.pkl']