In [1]:
import pandas as pd
import numpy as np

DATA_PATH = "LoLesports_data/"
SEED = 42

teams_train = pd.read_csv(f"{DATA_PATH}teams_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}teams_test.csv")

teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")

teams_train.shape, teams_test.shape, teams_train_target.shape, teams_test_target.shape

((9913, 111), (2324, 111), (9913, 3), (2324, 3))

# 컬럼 추가

## 상대 팀 추가

In [2]:
temp_opp_teams = teams_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)
temp_opp_teams = teams_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

## 날짜 추가

In [3]:
teams_train["date"] = pd.to_datetime(teams_train["date"])
teams_test["date"] = pd.to_datetime(teams_test["date"])

teams_train["year"] = teams_train["date"].dt.year
teams_train["month"] = teams_train["date"].dt.month
teams_train["day"] = teams_train["date"].dt.day
teams_train["hour"] = teams_train["date"].dt.hour
teams_train["minute"] = teams_train["date"].dt.minute

teams_test["year"] = teams_test["date"].dt.year
teams_test["month"] = teams_test["date"].dt.month
teams_test["day"] = teams_test["date"].dt.day
teams_test["hour"] = teams_test["date"].dt.hour
teams_test["minute"] = teams_test["date"].dt.minute

## 데이터 타입 변경

In [4]:
cols = ["league", "split", "teamname", "opp_teamname", "ban1", "ban2", "ban3", "ban4", "ban5", "pick1", "pick2", "pick3", "pick4", "pick5"]

teams_train[cols] = teams_train[cols].astype("category")
teams_test[cols] = teams_test[cols].astype("category")

# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [5]:
pre_game_features = [
    "gameid",
    "patch",
    "side",
    "league",
    "teamname",
    "opp_teamname",
    "ban1",
    "ban2",
    "ban3",
    "ban4",
    "ban5",
    "pick1",
    "pick2",
    "pick3",
    "pick4",
    "pick5",
    "year",
    "month",
    "day",
    "hour",
    "minute",
]

train_ft = teams_train[pre_game_features]
test_ft = teams_test[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 21), (2324, 21))

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [6]:
stats_columns = [
    "result",
    "gamelength",
    "kills",
    "deaths",
    "assists",
    "firstblood",
    "team kpm",
    "ckpm",
    "firstdragon",
    "firstherald",
    "void_grubs",
    "firstbaron",
    "firsttower",
    "towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "inhibitors",
    "damagetochampions",
    "dpm",
    "damagetakenperminute",
    "damagemitigatedperminute",
    "wardsplaced",
    "wpm",
    "wardskilled",
    "wcpm",
    "controlwardsbought",
    "visionscore",
    "vspm",
]

In [7]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(["teamname", "year", "month", "day", "hour", "minute"]).reset_index(drop=True)
temp_test = teams_test.sort_values(["teamname", "year", "month", "day", "hour", "minute"]).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby("teamname", observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f"recent10_{col}": recent10_train})

    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(
        ["teamname", "year", "month", "day", "hour", "minute"]
    )
    recent10_combined = combined_data.groupby("teamname", observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f"recent10_{col}": recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f"recent10_{col}"].values
    test_ft = test_ft.assign(**{f"recent10_{col}": recent10_test})

    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[["teamname", "year", "month", "day", "hour", "minute", f"recent10_{col}"]],
        left_on=["opp_teamname", "year", "month", "day", "hour", "minute"],
        right_on=["teamname", "year", "month", "day", "hour", "minute"],
        suffixes=("", "_opp"),
    )
    train_ft = train_ft.assign(
        **{f"opp_recent10_{col}": merged_train[f"recent10_{col}_opp"]}
    )

    merged_test = test_ft.merge(
        combined_data[["teamname", "year", "month", "day", "hour", "minute", f"recent10_{col}"]],
        left_on=["opp_teamname", "year", "month", "day", "hour", "minute"],
        right_on=["teamname", "year", "month", "day", "hour", "minute"],
        suffixes=("", "_opp"),
    )
    test_ft = test_ft.assign(
        **{f"opp_recent10_{col}": merged_test[f"recent10_{col}_opp"]}
    )

    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == "result" else 0
    train_ft = train_ft.assign(
        **{
            f"recent10_{col}": train_ft[f"recent10_{col}"].fillna(default_value),
            f"opp_recent10_{col}": train_ft[f"opp_recent10_{col}"].fillna(
                default_value
            ),
        }
    )
    test_ft = test_ft.assign(
        **{
            f"recent10_{col}": test_ft[f"recent10_{col}"].fillna(default_value),
            f"opp_recent10_{col}": test_ft[f"opp_recent10_{col}"].fillna(default_value),
        }
    )

    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f"recent10_{col}", f"opp_recent10_{col}"])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 상대 전적

In [8]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day', 'hour', 'minute'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 80), (2324, 80))

### 각 팀의 리그별 승률

In [9]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day', 'hour', 'minute'])

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 81), (2324, 81))

# 인코딩

In [13]:
train_ft["side"] = train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
test_ft["side"] = test_ft["side"].map({"Blue": 0, "Red": 1})

In [14]:
train_ft.to_csv("LoLesports_data/featured_train.csv", index=False)
test_ft.to_csv("LoLesports_data/featured_test.csv", index=False)

In [15]:
cat_train_ft = train_ft.copy()
cat_test_ft = test_ft.copy()
cat_pre_game_features = pre_game_features.copy()

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def preprocess(train_ft, test_ft):
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5'] # 챔피언 레이블인코딩

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().unique()

    champions_df = pd.DataFrame({'champion': champions})
    champions_df = champions_df.dropna().reset_index(drop=True)

    le = LabelEncoder()
    champions_df['champion_encoded'] = le.fit_transform(champions_df['champion'])

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])
        
    encoder = OneHotEncoder() # 리그 원핫인코딩
    league_encoded = encoder.fit_transform(train_ft[["league"]]).toarray()
    pre_game_features.extend(encoder.get_feature_names_out())
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    train_ft = pd.concat(
        [train_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    train_ft.drop("league", axis=1, inplace=True)

    league_encoded = encoder.transform(test_ft[["league"]]).toarray()
    test_ft = pd.concat(
        [test_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    test_ft.drop("league", axis=1, inplace=True)

    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    train_ft["teamname"] = le_team.transform(train_ft["teamname"])
    train_ft["opp_teamname"] = le_team.transform(train_ft["opp_teamname"])

    test_ft["teamname"] = le_team.transform(test_ft["teamname"])
    test_ft["opp_teamname"] = le_team.transform(test_ft["opp_teamname"])
    
    return train_ft, test_ft

train_ft, test_ft = preprocess(train_ft, test_ft)

In [17]:
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    train_ft[train_ft.select_dtypes("number").columns] = scaler.fit_transform(
        train_ft[train_ft.select_dtypes("number").columns]
    )
    test_ft[test_ft.select_dtypes("number").columns] = scaler.transform(
        test_ft[test_ft.select_dtypes("number").columns]
    )
    return train_ft, test_ft


train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

train_ft.shape, test_ft.shape, cat_train_ft.shape, cat_test_ft.shape

((9913, 88), (2324, 88), (9913, 81), (2324, 81))

# 모델 학습 및 검증

- 하이퍼파라미터 튜닝 클래스

In [19]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    def __init__(self, model, params, train, target, cat_features=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        self.study = optuna.create_study(direction="maximize")

    def objective(self, trial):
        params = {}
        
        for param_name, param_range in self.params.items():
            if param_range["type"] == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
        if self.model == CatBoostClassifier:
            model = self.model(**params, cat_features=self.cat_features)
        else:
            model = self.model(**params)

        model.fit(self.train, self.target)
            
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        return self.study.best_params

    def best_score(self):
        return self.study.best_value

### 모든 컬럼 형식이 number인 데이터셋

In [20]:
pre_game_features.remove("league")

cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = train_ft[train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = train_ft[train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

train_x.shape, valid_x.shape

((7799, 87), (2114, 87))

In [21]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [22]:
models = [
    LogisticRegression(random_state=SEED),
    LGBMClassifier(random_state=SEED, n_jobs=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0),
]

for model in models:
    scores = cross_val_score(model, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
    print(f"{model.__class__.__name__} : {np.mean(scores)}, {np.std(scores)}")

LogisticRegression : 0.6666666666666667, 0.02602866877167562
LGBMClassifier : 0.7237875288683603, 0.03853238745277667
RandomForestClassifier : 0.6592763664357199, 0.03672821942421046
HistGradientBoostingClassifier : 0.7131639722863741, 0.04398000548968287
AdaBoostClassifier : 0.6662047729022325, 0.03532338246790577
SVC : 0.6649730561970746, 0.026342796615792145
XGBClassifier : 0.7148575827559661, 0.035730401119286326
CatBoostClassifier : 0.7122401847575057, 0.036312149083314385


- LogisticRegression

In [288]:
# params = {
#     "C": {"type": "float", "min": 0.01, "max": 10},
#     "penalty": {"type": "categorical", "values": ["l1", "l2"]},
#     "solver": {"type": "categorical", "values": ["liblinear", "saga"]},
#     "max_iter": {"type": "int", "min": 100, "max": 2000},
# }

# lr_vt_tuner = HyperparameterTuner(LogisticRegression, params, train_x, train_y)
# lr_vt_tuner.optimize(100)
# lr_vt_tuner.best_params(), lr_vt_tuner.best_score()

In [23]:
params = {
    "C": 0.38414964961856957,
    "penalty": "l1",
    "solver": "liblinear",
    "max_iter": 1903
}

lr_final = LogisticRegression(**params)
lr_final.fit(train_x, train_y)
print(classification_report(valid_y, lr_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68      1060
           1       0.67      0.73      0.70      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



In [24]:
roc_auc_score(valid_y, lr_final.predict_proba(valid_x)[:, 1])

0.7789740789803444

- LightGBM

In [151]:
# params = {
#     "n_estimators": {"type": "int", "min": 100, "max": 300},
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1}, 
#     "max_depth": {"type": "int", "min": 8, "max": 12},
#     "num_leaves": {"type": "int", "min": 100, "max": 150},
#     "min_child_samples": {"type": "int", "min": 5, "max": 15},
#     "subsample": {"type": "float", "min": 0.5, "max": 0.7},
#     "colsample_bytree": {"type": "float", "min": 0.4, "max": 0.6},
#     "reg_alpha": {"type": "float", "min": 0.001, "max": 0.1},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 6.0},
#     "verbose": {"type": "int", "min": -1, "max": -1}
# }

# lgbm_tuner = HyperparameterTuner(LGBMClassifier, params, train_x, train_y)
# lgbm_tuner.optimize(100)
# lgbm_tuner.best_params(), lgbm_tuner.best_score()

In [25]:
params = {
    "n_estimators": 285,
    "learning_rate": 0.023007542937157222,
    "max_depth": 9,
    "num_leaves": 101,
    "min_child_samples": 9,
    "subsample": 0.6754653255644233,
    "colsample_bytree": 0.5153479009794544,
    "reg_alpha": 0.07512515626736012,
    "reg_lambda": 3.3370499751525755,
    "verbose": -1,
}

lgbm_final = LGBMClassifier(**params)
lgbm_final.fit(train_x, train_y)
print(classification_report(valid_y, lgbm_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1060
           1       0.76      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



In [26]:
roc_auc_score(valid_y, lgbm_final.predict_proba(valid_x)[:, 1])

0.8747520675951451

- RandomForestClassifier

In [153]:
# params = {
#     "n_estimators": {"type": "int", "min": 800, "max": 1100},
#     "max_depth": {"type": "int", "min": 15, "max": 21},
#     "min_samples_split": {"type": "int", "min": 15, "max": 23},
#     "min_samples_leaf": {"type": "int", "min": 7, "max": 11},
#     "max_features": {"type": "float", "min": 0.7, "max": 0.85},
#     "bootstrap": {"type": "categorical", "values": [False]},
#     "class_weight": {"type": "categorical", "values": ["balanced"]}
# }

# rf_tuner = HyperparameterTuner(RandomForestClassifier, params, train_x, train_y)
# rf_tuner.optimize(20)
# rf_tuner.best_params(), rf_tuner.best_score()

In [27]:
params = {
    "n_estimators": 954,
    "max_depth": 18,
    "min_samples_split": 19,
    "min_samples_leaf": 9,
    "max_features": 0.7814902230628112,
    "bootstrap": False,
    "class_weight": "balanced",
}

rf_final = RandomForestClassifier(**params)
rf_final.fit(train_x, train_y)
print(classification_report(valid_y, rf_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1060
           1       0.76      0.78      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



In [28]:
roc_auc_score(valid_y, rf_final.predict_proba(valid_x)[:, 1])

0.8731373742436719

- HistGradientBoostingClassifier

In [155]:
# params = {
#     "learning_rate": {"type": "float", "min": 0.01, "max": 0.1},
#     "max_depth": {"type": "int", "min": 8, "max": 15},
#     "max_iter": {"type": "int", "min": 100, "max": 300},
#     "min_samples_leaf": {"type": "int", "min": 5, "max": 15},
#     "l2_regularization": {"type": "float", "min": 0.5, "max": 3.0},
#     "max_leaf_nodes": {"type": "int", "min": 40, "max": 90}
# }

# hgbc_tuner = HyperparameterTuner(HistGradientBoostingClassifier, params, train_x, train_y)
# hgbc_tuner.optimize(100)
# hgbc_tuner.best_params(), hgbc_tuner.best_score()

In [29]:
params = {
    "learning_rate": 0.022119818280047138,
    "max_depth": 12,
    "max_iter": 276,
    "min_samples_leaf": 5,
    "l2_regularization": 0.9584267642328501,
    "max_leaf_nodes": 44,
}


hgbc_final = HistGradientBoostingClassifier(**params)
hgbc_final.fit(train_x, train_y)
print(classification_report(valid_y, hgbc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [30]:
roc_auc_score(valid_y, hgbc_final.predict_proba(valid_x)[:, 1])

0.8852672657620565

- AdaBoostClassifier

In [157]:
# params = {
#     "n_estimators": {"type": "int", "min": 200, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.1, "max": 0.25},
#     "algorithm": {"type": "categorical", "values": ["SAMME.R"]}
# }

# ada_tuner = HyperparameterTuner(AdaBoostClassifier, params, train_x, train_y)
# ada_tuner.optimize(50)
# ada_tuner.best_params(), ada_tuner.best_score()

In [31]:
params = {
    "n_estimators": 358,
    "learning_rate": 0.13883883597100793,
    "algorithm": "SAMME.R",
}


ada_final = AdaBoostClassifier(**params)
ada_final.fit(train_x, train_y)
print(classification_report(valid_y, ada_final.predict(valid_x)))



              precision    recall  f1-score   support

           0       0.72      0.75      0.74      1060
           1       0.74      0.71      0.73      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114



In [32]:
roc_auc_score(valid_y, ada_final.predict_proba(valid_x)[:, 1])

0.8069045146969318

- SVC

In [159]:
# params = {
#     "C": {"type": "float", "min": 0.1, "max": 0.5},
#     "kernel": {"type": "categorical", "values": ["linear"]},
#     "degree": {"type": "int", "min": 3, "max": 5},
#     "gamma": {"type": "float", "min": 0.4, "max": 0.9},
#     "coef0": {"type": "float", "min": 1.5, "max": 4.0},
#     "class_weight": {"type": "categorical", "values": [None]}
# }

# svc_tuner = HyperparameterTuner(SVC, params, train_x, train_y)
# svc_tuner.optimize(100)
# svc_tuner.best_params(), svc_tuner.best_score()

In [35]:
params = {
    "C": 0.21950805677161292,
    "kernel": "linear",
    "degree": 4,
    "gamma": 0.671045772731431,
    "coef0": 2.7929809033044726,
    "class_weight": None,
}

svc_final = SVC(**params, probability=True)
svc_final.fit(train_x, train_y)
print(classification_report(valid_y, svc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.72      0.63      0.67      1060
           1       0.67      0.75      0.71      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



In [36]:
roc_auc_score(valid_y, svc_final.predict_proba(valid_x)[:, 1])

0.7805037413626437

- XGBClassifier

In [161]:
# params = {
#     "n_estimators": {"type": "int", "min": 300, "max": 400},
#     "learning_rate": {"type": "float", "min": 0.005, "max": 0.02},
#     "max_depth": {"type": "int", "min": 4, "max": 6},
#     "min_child_weight": {"type": "int", "min": 2, "max": 4},
#     "gamma": {"type": "float", "min": 0.1, "max": 0.4},
#     "subsample": {"type": "float", "min": 0.8, "max": 1.0},
#     "colsample_bytree": {"type": "float", "min": 0.9, "max": 1.0},
#     "reg_alpha": {"type": "float", "min": 0.05, "max": 0.2},
#     "reg_lambda": {"type": "float", "min": 3.0, "max": 4.5}
# }

# sgb_tuner = HyperparameterTuner(XGBClassifier, params, train_x, train_y)
# sgb_tuner.optimize(100)
# sgb_tuner.best_params(), sgb_tuner.best_score()

In [37]:
params = {
    "n_estimators": 337,
    "learning_rate": 0.015272630148352066,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.24988522273215766,
    "subsample": 0.9639840429354903,
    "colsample_bytree": 0.985608479043216,
    "reg_alpha": 0.1856156681311941,
    "reg_lambda": 3.4637470458659014,
}


xgb_final = XGBClassifier(**params)

xgb_final.fit(train_x, train_y)

print(classification_report(valid_y, xgb_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78      1060
           1       0.78      0.80      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114



In [38]:
roc_auc_score(valid_y, xgb_final.predict_proba(valid_x)[:, 1])

0.8745086105044574

### category 형식 컬럼이 포함된 데이터셋

In [39]:
cutoff_patch = cat_train_ft["patch"].quantile(0.8)
train_games = cat_train_ft[cat_train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = cat_train_ft[cat_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

cat_train_x = cat_train_ft[cat_train_ft["gameid"].isin(train_games)][cat_pre_game_features]
cat_valid_x = cat_train_ft[cat_train_ft["gameid"].isin(valid_games)][cat_pre_game_features]

cat_train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
cat_valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

cat_train_x.drop(columns=["gameid"], inplace=True)
cat_valid_x.drop(columns=["gameid"], inplace=True)

In [40]:
cat_features = cat_train_x.select_dtypes("category").columns.tolist()
cat_features

['league',
 'teamname',
 'opp_teamname',
 'ban1',
 'ban2',
 'ban3',
 'ban4',
 'ban5',
 'pick1',
 'pick2',
 'pick3',
 'pick4',
 'pick5']

In [166]:
# params = {
#     "iterations": {"type": "int", "min": 300, "max": 600},
#     "learning_rate": {"type": "float", "min": 0.15, "max": 0.3},
#     "depth": {"type": "int", "min": 8, "max": 12},
#     "l2_leaf_reg": {"type": "float", "min": 6.0, "max": 10.0},
#     "min_child_samples": {"type": "int", "min": 8, "max": 16},
#     "max_bin": {"type": "int", "min": 300, "max": 400},
#     "verbose": {"type": "int", "min": 100, "max": 100}
# }

# cat_tuner = HyperparameterTuner(CatBoostClassifier, params, cat_train_x, cat_train_y, cat_features)
# cat_tuner.optimize(20)
# cat_tuner.best_params(), cat_tuner.best_score()

In [41]:
params = {
    "iterations": 413,
    "learning_rate": 0.24110432469185597,
    "depth": 10,
    "l2_leaf_reg": 8.905869555950142,
    "min_child_samples": 12,
    "max_bin": 342,
    "verbose": 100,
}

cat_final = CatBoostClassifier(**params, cat_features=cat_features)
cat_final.fit(cat_train_x, cat_train_y)
print(classification_report(cat_valid_y, cat_final.predict(cat_valid_x)))

0:	learn: 0.6079555	total: 329ms	remaining: 2m 15s
100:	learn: 0.1385176	total: 22.5s	remaining: 1m 9s
200:	learn: 0.0588177	total: 46.2s	remaining: 48.8s
300:	learn: 0.0327123	total: 1m 8s	remaining: 25.4s
400:	learn: 0.0213888	total: 1m 30s	remaining: 2.69s
412:	learn: 0.0205659	total: 1m 32s	remaining: 0us
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1060
           1       0.79      0.77      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [42]:
roc_auc_score(cat_valid_y, cat_final.predict_proba(cat_valid_x)[:, 1])

0.876698829257814

# 앙상블

In [43]:
from sklearn.ensemble import StackingClassifier

estimators = [
    # ("lr", lr_final),
    ("lgbm", lgbm_final),
    ("rf", rf_final),
    ("hgbc", hgbc_final),
    # ("ada", ada_final),
    # ("svc", svc_final),
    ("xgb", xgb_final),
]

final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(estimators, final_estimator)
stacking_clf.fit(train_x, train_y)
print(classification_report(valid_y, stacking_clf.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [44]:
roc_auc_score(valid_y, stacking_clf.predict_proba(valid_x)[:, 1])

0.8849701049013641

In [46]:
stacking_proba = stacking_clf.predict_proba(valid_x)
cat_proba = cat_final.predict_proba(cat_valid_x)

final_proba = 0.5 * stacking_proba + 0.5 * cat_proba

final_pred = (final_proba[:, 1] >= 0.5).astype(int)
print(classification_report(valid_y, final_pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      1060
           1       0.80      0.79      0.79      1054

    accuracy                           0.80      2114
   macro avg       0.80      0.80      0.80      2114
weighted avg       0.80      0.80      0.80      2114



In [47]:
roc_auc_score(valid_y, final_proba[:, 1])

0.8907647416848663

# 테스트 데이터 예측

In [48]:
train_ft = train_ft[train_x.columns]
test_ft = test_ft[train_x.columns]
cat_train_ft = cat_train_ft[cat_train_x.columns]
cat_test_ft = cat_test_ft[cat_train_x.columns]

stacking_clf.fit(train_ft, teams_train_target["result"])
cat_final.fit(cat_train_ft, teams_train_target["result"])

stacking_test_proba = stacking_clf.predict_proba(test_ft)
cat_test_proba = cat_final.predict_proba(cat_test_ft)

final_test_proba = 0.5 * stacking_test_proba + 0.5 * cat_test_proba
final_test_pred = (final_test_proba[:, 1] >= 0.5).astype(int)

print(classification_report(teams_test_target["result"], final_test_pred))

0:	learn: 0.6275820	total: 235ms	remaining: 1m 36s
100:	learn: 0.1691727	total: 24.2s	remaining: 1m 14s
200:	learn: 0.0750971	total: 49s	remaining: 51.6s
300:	learn: 0.0393156	total: 1m 13s	remaining: 27.4s
400:	learn: 0.0253351	total: 1m 38s	remaining: 2.95s
412:	learn: 0.0243252	total: 1m 41s	remaining: 0us
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1160
           1       0.76      0.74      0.75      1164

    accuracy                           0.76      2324
   macro avg       0.76      0.76      0.76      2324
weighted avg       0.76      0.76      0.76      2324



In [49]:
roc_auc_score(teams_test_target["result"], final_test_proba[:, 1])

0.8421895366749617

# 최종 예측 모델 생성

In [50]:
import joblib

In [51]:
train_data = pd.concat([train_ft, test_ft], ignore_index=True)
cat_train_data = pd.concat([cat_train_ft, cat_test_ft], ignore_index=True)
target_data = pd.concat([teams_train_target, teams_test_target], ignore_index=True)

stacking_clf.fit(train_data, target_data["result"])
cat_final.fit(cat_train_data, target_data["result"])

joblib.dump(stacking_clf, "output/stacking_0107.pkl")
cat_final.save_model("output/cat_0107.cbm")

0:	learn: 0.6154152	total: 320ms	remaining: 2m 11s
100:	learn: 0.1750981	total: 35.6s	remaining: 1m 50s
200:	learn: 0.0849220	total: 1m 9s	remaining: 1m 13s
300:	learn: 0.0483373	total: 1m 44s	remaining: 38.9s
400:	learn: 0.0301632	total: 2m 19s	remaining: 4.18s
412:	learn: 0.0288545	total: 2m 23s	remaining: 0us


In [52]:
import json

with open("output/cat_features.json", "w") as f:
    json.dump(cat_features, f)