In [88]:
import pandas as pd
import numpy as np

DATA_PATH = "LoLesports_data/"
SEED = 42

teams_train = pd.read_csv(f"{DATA_PATH}teams_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}teams_test.csv")
players_train = pd.read_csv(f"{DATA_PATH}players_train.csv")
players_test = pd.read_csv(f"{DATA_PATH}players_test.csv")

teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")
players_train_target = pd.read_csv(f"{DATA_PATH}players_train_target.csv")
players_test_target = pd.read_csv(f"{DATA_PATH}players_test_target.csv")

# 컬럼 추가

## 상대 팀 추가

In [89]:
temp_opp_teams = teams_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)
temp_opp_teams = teams_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

temp_opp_players = players_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_train = pd.concat([players_train, temp_opp_players], axis=1)
temp_opp_players = players_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_test = pd.concat([players_test, temp_opp_players], axis=1)

## 날짜 추가

In [90]:
teams_train["date"] = pd.to_datetime(teams_train["date"])
teams_test["date"] = pd.to_datetime(teams_test["date"])

players_train["date"] = pd.to_datetime(players_train["date"])
players_test["date"] = pd.to_datetime(players_test["date"])

teams_train["year"] = teams_train["date"].dt.year
teams_train["month"] = teams_train["date"].dt.month
teams_train["day"] = teams_train["date"].dt.day

players_train["year"] = players_train["date"].dt.year
players_train["month"] = players_train["date"].dt.month
players_train["day"] = players_train["date"].dt.day

teams_test["year"] = teams_test["date"].dt.year
teams_test["month"] = teams_test["date"].dt.month
teams_test["day"] = teams_test["date"].dt.day

players_test["year"] = players_test["date"].dt.year
players_test["month"] = players_test["date"].dt.month
players_test["day"] = players_test["date"].dt.day

## 데이터 타입 변경

In [91]:
cols = ["league", "split", "teamname", "opp_teamname", "ban1", "ban2", "ban3", "ban4", "ban5", "pick1", "pick2", "pick3", "pick4", "pick5"]

teams_train[cols] = teams_train[cols].astype("category")
teams_test[cols] = teams_test[cols].astype("category")

In [92]:
teams_train.head()

Unnamed: 0,gameid,league,split,playoffs,date,game,patch,side,teamname,ban1,...,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15,opp_teamname,year,month,day
0,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Blue,DRX,Diana,...,4.0,7.0,1.0,1.0,1.0,4.0,BNK FEARX,2022,1,12
1,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Red,BNK FEARX,Renekton,...,1.0,1.0,4.0,4.0,7.0,1.0,DRX,2022,1,12
2,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Blue,DRX,Diana,...,2.0,5.0,4.0,4.0,5.0,2.0,BNK FEARX,2022,1,12
3,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Red,BNK FEARX,Renekton,...,4.0,5.0,2.0,2.0,5.0,4.0,DRX,2022,1,12
4,ESPORTSTMNT01_2690705,LCK,Spring,0,2022-01-12 10:07:00,1,12.01,Blue,T1,Lee Sin,...,3.0,2.0,1.0,1.0,1.0,3.0,Kwangdong Freecs,2022,1,12


# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [93]:
pre_game_features = [
    "gameid",
    "patch",
    "side",
    "league",
    "teamname",
    "opp_teamname",
    "ban1",
    "ban2",
    "ban3",
    "ban4",
    "ban5",
    "pick1",
    "pick2",
    "pick3",
    "pick4",
    "pick5",
    "year",
    "month",
    "day",
]

train_ft = teams_train[pre_game_features]
test_ft = teams_test[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 19), (2324, 19))

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [94]:
stats_columns = [
    "result",
    "gamelength",
    "kills",
    "deaths",
    "assists",
    "firstblood",
    "team kpm",
    "ckpm",
    "firstdragon",
    "firstherald",
    "void_grubs",
    "firstbaron",
    "firsttower",
    "towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "inhibitors",
    "damagetochampions",
    "dpm",
    "damagetakenperminute",
    "damagemitigatedperminute",
    "wardsplaced",
    "wpm",
    "wardskilled",
    "wcpm",
    "controlwardsbought",
    "visionscore",
    "vspm",
]

In [95]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)
temp_test = teams_test.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f'recent10_{col}': recent10_train})
    
    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(['teamname', 'year', 'month', 'day'])
    recent10_combined = combined_data.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f'recent10_{col}': recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f'recent10_{col}'].values
    test_ft = test_ft.assign(**{f'recent10_{col}': recent10_test})
    
    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    train_ft = train_ft.assign(**{f'opp_recent10_{col}': merged_train[f'recent10_{col}_opp']})
    
    merged_test = test_ft.merge(
        combined_data[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    test_ft = test_ft.assign(**{f'opp_recent10_{col}': merged_test[f'recent10_{col}_opp']})
    
    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == 'result' else 0
    train_ft = train_ft.assign(**{
        f'recent10_{col}': train_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': train_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    test_ft = test_ft.assign(**{
        f'recent10_{col}': test_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': test_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    
    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f'recent10_{col}', f'opp_recent10_{col}'])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 77), (2324, 77))

In [96]:
train_ft.head()

Unnamed: 0,gameid,patch,side,league,teamname,opp_teamname,ban1,ban2,ban3,ban4,...,recent10_wardskilled,opp_recent10_wardskilled,recent10_wcpm,opp_recent10_wcpm,recent10_controlwardsbought,opp_recent10_controlwardsbought,recent10_visionscore,opp_recent10_visionscore,recent10_vspm,opp_recent10_vspm
0,ESPORTSTMNT01_2700815,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Twisted Fate,LeBlanc,...,0.0,35.0,0.0,1.3166,0.0,30.0,0.0,207.0,0.0,7.7868
1,ESPORTSTMNT01_2700815,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Leona,Jayce,...,35.0,45.666667,1.3166,1.464567,30.0,40.0,207.0,250.333333,7.7868,8.068033
2,ESPORTSTMNT01_2690695,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Yuumi,Samira,...,42.0,0.0,1.40225,0.0,38.5,0.0,242.5,0.0,8.11405,0.0
3,ESPORTSTMNT01_2690695,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Twisted Fate,Viktor,...,45.666667,42.0,1.464567,1.40225,40.0,38.5,250.333333,242.5,8.068033,8.11405
4,ESPORTSTMNT01_2690705,12.01,Blue,LCK,T1,Kwangdong Freecs,Lee Sin,Ryze,Viktor,LeBlanc,...,49.5,35.0,1.61275,1.3166,40.75,30.0,247.0,207.0,8.04935,7.7868


### 상대 전적

In [97]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 78), (2324, 78))

### 각 팀의 리그별 승률

In [98]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values('date')

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 각 패치 버전 사이드별 승률

In [99]:
# # 패치 버전 사이드별 승률 기록을 저장할 딕셔너리
# patch_side_records = {}
# patch_side_winrates = []

# # 날짜순으로 정렬
# combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
# combined_data = combined_data.sort_values('date')

# # 패치/사이드별 승률 계산
# for idx, match in combined_data.iterrows():
#     patch = match['patch']
#     side = match['side']
#     key = (patch, side)
    
#     # 현재 시점까지의 패치/사이드별 승률 계산
#     if key not in patch_side_records:
#         patch_side_records[key] = {'wins': 0, 'total': 0}
#         patch_side_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
#     else:
#         record = patch_side_records[key]
#         patch_side_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
#     # 현재 경기 결과 반영
#     result = match['result']
#     patch_side_records[key]['total'] += 1
#     if result == 1:
#         patch_side_records[key]['wins'] += 1

# # 계산된 승률을 훈련/테스트 데이터에 할당
# train_ft['patch_side_winrate'] = patch_side_winrates[:len(teams_train)]
# test_ft['patch_side_winrate'] = patch_side_winrates[len(teams_train):]

# # 특성 리스트에 patch_side_winrate 추가
# pre_game_features.append('patch_side_winrate')

# # 입력 데이터 업데이트
# train_ft = train_ft[pre_game_features]
# test_ft = test_ft[pre_game_features]

# train_ft.shape, test_ft.shape

### 픽 챔피언 지표

In [100]:
# df = teams_train.copy()
# df = df.sort_values(["teamname", "date", "gameid"])  # 시계열 정렬

# for slot in ["pick1", "pick2", "pick3", "pick4", "pick5"]:
#     # 1) 챔피언 컬럼 만들기
#     df_pick = df[["gameid", "teamname", "date", "result", slot]].copy()
#     df_pick.rename(columns={slot: "champion"}, inplace=True)

#     # 2) 챔피언을 category로 바꾸면 메모리 절약에 도움
#     df_pick["champion"] = df_pick["champion"].astype("category")

#     # 3) groupby + cumsum + shift(1)로 "직전까지" 누적
#     df_pick["pick_ind"] = 1
#     df_pick["win_ind"] = (df_pick["result"] == 1).astype(int)

#     df_pick["cum_pick_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["pick_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["win_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_rate"] = (
#         df_pick["cum_win_count"] / df_pick["cum_pick_count"]
#     ).fillna(0)

#     # 4) 필요한 컬럼만 남겨서, 컬럼 이름으로 바꾸기
#     df_pick = df_pick[
#         ["gameid", "teamname", "date", "champion", "cum_pick_count", "cum_win_rate"]
#     ].copy()

#     df_pick.rename(
#         columns={
#             "champion": f"{slot}_champion",  # 구분용
#             "cum_pick_count": f"{slot}_cum_pick_count",
#             "cum_win_rate": f"{slot}_cum_win_rate",
#         },
#         inplace=True,
#     )
    
#     # 5) 원본 df와 merge
#     df = pd.merge(
#         df,
#         df_pick[
#             [
#                 "gameid",
#                 "teamname",
#                 "date",
#                 f"{slot}_champion",
#                 f"{slot}_cum_pick_count",
#                 f"{slot}_cum_win_rate",
#             ]
#         ],
#         left_on=["gameid", "teamname", "date", f"{slot}"],  # 조인 키
#         right_on=["gameid", "teamname", "date", f"{slot}_champion"],
#         how="left",
#     )

# 인코딩

In [101]:
train_ft["side"] = train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
test_ft["side"] = test_ft["side"].map({"Blue": 0, "Red": 1})

In [102]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cat_train_ft = train_ft.copy()
cat_test_ft = test_ft.copy()

def preprocess(train_ft, test_ft):
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5'] # 챔피언 레이블인코딩

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().unique()

    champions_df = pd.DataFrame({'champion': champions})
    champions_df = champions_df.dropna().reset_index(drop=True)

    le = LabelEncoder()
    champions_df['champion_encoded'] = le.fit_transform(champions_df['champion'])

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])
        
    encoder = OneHotEncoder() # 리그 원핫인코딩
    league_encoded = encoder.fit_transform(train_ft[["league"]]).toarray()
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    train_ft = pd.concat(
        [train_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    train_ft.drop("league", axis=1, inplace=True)

    league_encoded = encoder.transform(test_ft[["league"]]).toarray()
    test_ft = pd.concat(
        [test_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    test_ft.drop("league", axis=1, inplace=True)

    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    train_ft["teamname"] = le_team.transform(train_ft["teamname"])
    train_ft["opp_teamname"] = le_team.transform(train_ft["opp_teamname"])

    test_ft["teamname"] = le_team.transform(test_ft["teamname"])
    test_ft["opp_teamname"] = le_team.transform(test_ft["opp_teamname"])
    
    return train_ft, test_ft

train_ft, test_ft = preprocess(train_ft, test_ft)

In [103]:
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [104]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    train_ft[train_ft.select_dtypes("number").columns] = scaler.fit_transform(
        train_ft[train_ft.select_dtypes("number").columns]
    )
    test_ft[test_ft.select_dtypes("number").columns] = scaler.transform(
        test_ft[test_ft.select_dtypes("number").columns]
    )
    return train_ft, test_ft


train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

# 모델 학습 및 검증

- 하이퍼파라미터 튜닝 클래스

In [1]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    def __init__(self, model, params, train, target, cat_features=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        self.study = optuna.create_study(direction="maximize")

    def objective(self, trial):
        params = {}
        
        for param_name, param_range in self.params.items():
            if param_range["type"] == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
        if self.model == CatBoostClassifier:
            model = self.model(**params, cat_features=self.cat_features, logging_level="Silent")
        else:
            model = self.model(**params)

        model.fit(self.train, self.target)
            
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        return self.study.best_params

    def best_score(self):
        return self.study.best_value

### 모든 컬럼 형식이 number인 데이터셋

In [106]:
pre_game_features.remove("league")

cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = train_ft[train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = train_ft[train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

In [108]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


models = [
    LogisticRegression(random_state=SEED),
    LGBMClassifier(random_state=SEED, n_jobs=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0),
]

for model in models:
    scores = cross_val_score(model, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
    print(f"{model.__class__.__name__} : {np.mean(scores)}, {np.std(scores)}")

LogisticRegression : 0.6663587374903772, 0.02506645631585489
LGBMClassifier : 0.7153194765204003, 0.0462211848391806
RandomForestClassifier : 0.6605080831408776, 0.03514307070211957
HistGradientBoostingClassifier : 0.7183987682832949, 0.05259057124928453
AdaBoostClassifier : 0.657428791377983, 0.034324090545970666
SVC : 0.6558891454965358, 0.024397436759804655
XGBClassifier : 0.7085450346420323, 0.03684421170357534
CatBoostClassifier : 0.7040800615858352, 0.03792845052087791


- LogisticRegression

In [109]:
params = {
    "C": {"type": "float", "min": 0.01, "max": 10},
    "penalty": {"type": "categorical", "values": ["l1", "l2"]},
    "solver": {"type": "categorical", "values": ["liblinear", "saga"]},
    "max_iter": {"type": "int", "min": 100, "max": 2000},
}

lr_vt_tuner = HyperparameterTuner(LogisticRegression, params, train_x, train_y)
lr_vt_tuner.optimize(100)
lr_vt_tuner.best_params(), lr_vt_tuner.best_score()

[I 2025-01-03 11:21:08,311] A new study created in memory with name: no-name-7cb0ad7c-73cb-4492-b286-00befb513f1d
[I 2025-01-03 11:21:08,554] Trial 0 finished with value: 0.6675904541955351 and parameters: {'C': 0.1317460656683848, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 789}. Best is trial 0 with value: 0.6675904541955351.
[I 2025-01-03 11:21:11,091] Trial 1 finished with value: 0.6671285604311008 and parameters: {'C': 1.7733593093436546, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 1304}. Best is trial 0 with value: 0.6675904541955351.
[I 2025-01-03 11:21:11,300] Trial 2 finished with value: 0.6648190916089299 and parameters: {'C': 4.2678585868923875, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 1116}. Best is trial 0 with value: 0.6675904541955351.
[I 2025-01-03 11:21:12,320] Trial 3 finished with value: 0.6638953040800615 and parameters: {'C': 4.579361530650344, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 1583}. Best is trial 0 with value: 0.6675904541955351

({'C': 0.20958163208579603,
  'penalty': 'l1',
  'solver': 'saga',
  'max_iter': 742},
 0.6705157813702849)

In [113]:
params = {"C": 0.20958163208579603, "penalty": "l1", "solver": "saga", "max_iter": 742}

lr_final = LogisticRegression(**params)
lr_final.fit(train_x, train_y)
print(classification_report(valid_y, lr_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67      1060
           1       0.67      0.75      0.70      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



- LightGBM

In [112]:
params = {
    "n_estimators": {"type": "int", "min": 100, "max": 1000},
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.3},
    "max_depth": {"type": "int", "min": 3, "max": 12},
    "num_leaves": {"type": "int", "min": 20, "max": 200},
    "min_child_samples": {"type": "int", "min": 5, "max": 50},
    "subsample": {"type": "float", "min": 0.5, "max": 1.0},
    "colsample_bytree": {"type": "float", "min": 0.5, "max": 1.0},
    "reg_alpha": {"type": "float", "min": 0.0, "max": 10.0},
    "reg_lambda": {"type": "float", "min": 0.0, "max": 10.0}
}

lgbm_tuner = HyperparameterTuner(LGBMClassifier, params, train_x, train_y)
lgbm_tuner.optimize(100)
lgbm_tuner.best_params(), lgbm_tuner.best_score()

[I 2025-01-03 11:22:46,130] A new study created in memory with name: no-name-a3df3240-885b-4e51-972f-9b4f631c8147
[I 2025-01-03 11:22:47,404] Trial 0 finished with value: 0.6905311778290992 and parameters: {'n_estimators': 799, 'learning_rate': 0.15506944178105755, 'max_depth': 9, 'num_leaves': 30, 'min_child_samples': 26, 'subsample': 0.9550371912145728, 'colsample_bytree': 0.9042697813908023, 'reg_alpha': 8.848804766079798, 'reg_lambda': 9.089250389417359}. Best is trial 0 with value: 0.6905311778290992.
[I 2025-01-03 11:22:48,747] Trial 1 finished with value: 0.7002309468822171 and parameters: {'n_estimators': 194, 'learning_rate': 0.18111781315726672, 'max_depth': 9, 'num_leaves': 136, 'min_child_samples': 34, 'subsample': 0.6601232053947443, 'colsample_bytree': 0.6212458541141999, 'reg_alpha': 0.2455777706845208, 'reg_lambda': 1.1189677590954128}. Best is trial 1 with value: 0.7002309468822171.
[I 2025-01-03 11:22:49,681] Trial 2 finished with value: 0.6896073903002309 and paramet

({'n_estimators': 127,
  'learning_rate': 0.027150124184969225,
  'max_depth': 10,
  'num_leaves': 133,
  'min_child_samples': 9,
  'subsample': 0.6209523075264632,
  'colsample_bytree': 0.5041544253141395,
  'reg_alpha': 0.005963305982194279,
  'reg_lambda': 4.654116767877415},
 0.7179368745188606)

In [115]:
params = {
    "n_estimators": 127,
    "learning_rate": 0.027150124184969225,
    "max_depth": 10,
    "num_leaves": 133,
    "min_child_samples": 9,
    "subsample": 0.6209523075264632,
    "colsample_bytree": 0.5041544253141395,
    "reg_alpha": 0.005963305982194279,
    "reg_lambda": 4.654116767877415,
}

lgbm_final = LGBMClassifier(**params)
lgbm_final.fit(train_x, train_y)
print(classification_report(valid_y, lgbm_final.predict(valid_x)))

[LightGBM] [Info] Number of positive: 3903, number of negative: 3896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11886
[LightGBM] [Info] Number of data points in the train set: 7799, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500449 -> initscore=0.001795
[LightGBM] [Info] Start training from score 0.001795
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1060
           1       0.77      0.76      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



- RandomForestClassifier

In [120]:
params = {
    "n_estimators": {"type": "int", "min": 100, "max": 1000},
    "max_depth": {"type": "int", "min": 3, "max": 20},
    "min_samples_split": {"type": "int", "min": 2, "max": 20},
    "min_samples_leaf": {"type": "int", "min": 1, "max": 10},
    "max_features": {"type": "float", "min": 0.1, "max": 1.0},
    "bootstrap": {"type": "categorical", "values": [True, False]},
    "class_weight": {"type": "categorical", "values": ["balanced", None]}

}

rf_tuner = HyperparameterTuner(RandomForestClassifier, params, train_x, train_y)
rf_tuner.optimize(30)
rf_tuner.best_params(), rf_tuner.best_score()

[I 2025-01-03 11:37:42,161] A new study created in memory with name: no-name-aa9fd0db-77b5-4822-8f64-255f5eda44f2
[I 2025-01-03 11:38:02,172] Trial 0 finished with value: 0.6675904541955351 and parameters: {'n_estimators': 961, 'max_depth': 3, 'min_samples_split': 15, 'min_samples_leaf': 8, 'max_features': 0.4110208338761414, 'bootstrap': True, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6675904541955351.
[I 2025-01-03 11:38:10,305] Trial 1 finished with value: 0.6731331793687453 and parameters: {'n_estimators': 181, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 0.1559894145607655, 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6731331793687453.
[I 2025-01-03 11:39:06,026] Trial 2 finished with value: 0.6919168591224019 and parameters: {'n_estimators': 715, 'max_depth': 17, 'min_samples_split': 11, 'min_samples_leaf': 5, 'max_features': 0.4607873149324341, 'bootstrap': True, 'class_weight': 'balanced'}. B

({'n_estimators': 954,
  'max_depth': 18,
  'min_samples_split': 19,
  'min_samples_leaf': 9,
  'max_features': 0.7814902230628112,
  'bootstrap': False,
  'class_weight': 'balanced'},
 0.7183987682832949)

In [121]:
params = {
    "n_estimators": 954,
    "max_depth": 18,
    "min_samples_split": 19,
    "min_samples_leaf": 9,
    "max_features": 0.7814902230628112,
    "bootstrap": False,
    "class_weight": "balanced",
}

rf_final = RandomForestClassifier(**params)
rf_final.fit(train_x, train_y)
print(classification_report(valid_y, rf_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1060
           1       0.76      0.80      0.78      1054

    accuracy                           0.77      2114
   macro avg       0.78      0.77      0.77      2114
weighted avg       0.78      0.77      0.77      2114



- HistGradientBoostingClassifier

In [116]:
params = {
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.3},
    "max_depth": {"type": "int", "min": 3, "max": 12},
    "max_iter": {"type": "int", "min": 100, "max": 1000},
    "min_samples_leaf": {"type": "int", "min": 5, "max": 50},
    "l2_regularization": {"type": "float", "min": 0.0, "max": 10.0},
    "max_leaf_nodes": {"type": "int", "min": 20, "max": 200}
}

hgbc_tuner = HyperparameterTuner(HistGradientBoostingClassifier, params, train_x, train_y)
hgbc_tuner.optimize(100)
hgbc_tuner.best_params(), hgbc_tuner.best_score()

[I 2025-01-03 11:32:33,796] A new study created in memory with name: no-name-30df53a0-5eac-42cd-8f51-959120cdb03a
[I 2025-01-03 11:32:36,032] Trial 0 finished with value: 0.6809853733641262 and parameters: {'learning_rate': 0.12368662118982705, 'max_depth': 3, 'max_iter': 220, 'min_samples_leaf': 37, 'l2_regularization': 4.240608087243603, 'max_leaf_nodes': 83}. Best is trial 0 with value: 0.6809853733641262.
[I 2025-01-03 11:32:49,997] Trial 1 finished with value: 0.6840646651270208 and parameters: {'learning_rate': 0.1397116003717381, 'max_depth': 10, 'max_iter': 876, 'min_samples_leaf': 39, 'l2_regularization': 6.2779912473372566, 'max_leaf_nodes': 99}. Best is trial 1 with value: 0.6840646651270208.
[I 2025-01-03 11:32:53,773] Trial 2 finished with value: 0.6911470361816783 and parameters: {'learning_rate': 0.059094809070617996, 'max_depth': 9, 'max_iter': 305, 'min_samples_leaf': 49, 'l2_regularization': 6.044351502305663, 'max_leaf_nodes': 191}. Best is trial 2 with value: 0.6911

({'learning_rate': 0.04632583641399217,
  'max_depth': 12,
  'max_iter': 144,
  'min_samples_leaf': 10,
  'l2_regularization': 1.7504806237449977,
  'max_leaf_nodes': 64},
 0.725635103926097)

In [118]:
params = {
    "learning_rate": 0.04632583641399217,
    "max_depth": 12,
    "max_iter": 144,
    "min_samples_leaf": 10,
    "l2_regularization": 1.7504806237449977,
    "max_leaf_nodes": 64,
}

hgbc_final = HistGradientBoostingClassifier(**params)
hgbc_final.fit(train_x, train_y)
print(classification_report(valid_y, hgbc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



- AdaBoostClassifier

In [123]:
params = {
    "n_estimators": {"type": "int", "min": 50, "max": 500},
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.3},
    "algorithm": {"type": "categorical", "values": ["SAMME", "SAMME.R"]}
}

ada_tuner = HyperparameterTuner(AdaBoostClassifier, params, train_x, train_y)
ada_tuner.optimize(100)
ada_tuner.best_params(), ada_tuner.best_score()

[I 2025-01-03 13:43:03,433] A new study created in memory with name: no-name-59e4f87b-eb48-4d4f-b3b7-581a7b60b773
[I 2025-01-03 13:43:14,236] Trial 0 finished with value: 0.6646651270207852 and parameters: {'n_estimators': 315, 'learning_rate': 0.09521259910377598, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.6646651270207852.
[I 2025-01-03 13:43:23,603] Trial 1 finished with value: 0.6682063125481139 and parameters: {'n_estimators': 294, 'learning_rate': 0.16987854601287597, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.6682063125481139.
[I 2025-01-03 13:43:36,991] Trial 2 finished with value: 0.6618937644341801 and parameters: {'n_estimators': 464, 'learning_rate': 0.2223483620516855, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.6682063125481139.
[I 2025-01-03 13:43:46,775] Trial 3 finished with value: 0.666974595842956 and parameters: {'n_estimators': 342, 'learning_rate': 0.2988368021210699, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.6682063

({'n_estimators': 294,
  'learning_rate': 0.16987854601287597,
  'algorithm': 'SAMME.R'},
 0.6682063125481139)

In [124]:
params = {
    "n_estimators": 294,
    "learning_rate": 0.16987854601287597,
    "algorithm": "SAMME.R",
}

ada_final = AdaBoostClassifier(**params)
ada_final.fit(train_x, train_y)
print(classification_report(valid_y, ada_final.predict(valid_x)))



              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1060
           1       0.73      0.73      0.73      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114



- SVC

In [125]:
params = {
    "C": {"type": "float", "min": 0.01, "max": 10.0},
    "kernel": {"type": "categorical", "values": ["linear", "rbf", "poly", "sigmoid"]},
    "degree": {"type": "int", "min": 2, "max": 5},
    "gamma": {"type": "float", "min": 0.0001, "max": 1.0},
    "coef0": {"type": "float", "min": 0.0, "max": 10.0},
    "class_weight": {"type": "categorical", "values": ["balanced", None]}
}

svc_tuner = HyperparameterTuner(SVC, params, train_x, train_y)
svc_tuner.optimize(100)
svc_tuner.best_params(), svc_tuner.best_score()

[I 2025-01-03 14:01:33,458] A new study created in memory with name: no-name-3a7c467e-46b5-43c0-a28e-376824470956
[I 2025-01-03 14:01:37,405] Trial 0 finished with value: 0.6649730561970747 and parameters: {'C': 6.532589110334154, 'kernel': 'linear', 'degree': 2, 'gamma': 0.08568064806975698, 'coef0': 5.066959629204042, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6649730561970747.
[I 2025-01-03 14:01:40,225] Trial 1 finished with value: 0.6666666666666667 and parameters: {'C': 2.5763117814979686, 'kernel': 'linear', 'degree': 3, 'gamma': 0.5439807994516113, 'coef0': 4.848738274760509, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6666666666666667.
[I 2025-01-03 14:01:42,807] Trial 2 finished with value: 0.6649730561970747 and parameters: {'C': 6.25169603235249, 'kernel': 'linear', 'degree': 4, 'gamma': 0.15863844468827148, 'coef0': 2.9647610950302505, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6666666666666667.
[I 2025-01-03 14:01:44,399] Tri

({'C': 0.21950805677161292,
  'kernel': 'linear',
  'degree': 4,
  'gamma': 0.671045772731431,
  'coef0': 2.7929809033044726,
  'class_weight': None},
 0.6691301000769823)

In [126]:
params = {
    "C": 0.21950805677161292,
    "kernel": "linear",
    "degree": 4,
    "gamma": 0.671045772731431,
    "coef0": 2.7929809033044726,
    "class_weight": None,
}

svc_final = SVC(**params)
svc_final.fit(train_x, train_y)
print(classification_report(valid_y, svc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.70      0.63      0.67      1060
           1       0.66      0.73      0.70      1054

    accuracy                           0.68      2114
   macro avg       0.68      0.68      0.68      2114
weighted avg       0.68      0.68      0.68      2114



- XGBClassifier

In [127]:
params = {
    "n_estimators": {"type": "int", "min": 100, "max": 1000},
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.3},
    "max_depth": {"type": "int", "min": 3, "max": 12},
    "min_child_weight": {"type": "int", "min": 1, "max": 7},
    "gamma": {"type": "float", "min": 0, "max": 1},
    "subsample": {"type": "float", "min": 0.5, "max": 1.0},
    "colsample_bytree": {"type": "float", "min": 0.5, "max": 1.0},
    "reg_alpha": {"type": "float", "min": 0.001, "max": 10},
    "reg_lambda": {"type": "float", "min": 0.001, "max": 10}
}

sgb_tuner = HyperparameterTuner(XGBClassifier, params, train_x, train_y)
sgb_tuner.optimize(100)
sgb_tuner.best_params(), sgb_tuner.best_score()

[I 2025-01-03 14:24:05,297] A new study created in memory with name: no-name-c7343ce9-9d60-432d-bf74-d6b24b289c65
[I 2025-01-03 14:24:09,827] Trial 0 finished with value: 0.6722093918398769 and parameters: {'n_estimators': 864, 'learning_rate': 0.27269470276804514, 'max_depth': 7, 'min_child_weight': 5, 'gamma': 0.18642157516717917, 'subsample': 0.7376494291115261, 'colsample_bytree': 0.5493868828747395, 'reg_alpha': 8.39945750969194, 'reg_lambda': 7.425763781716733}. Best is trial 0 with value: 0.6722093918398769.
[I 2025-01-03 14:24:13,302] Trial 1 finished with value: 0.6706697459584297 and parameters: {'n_estimators': 761, 'learning_rate': 0.1401981779848854, 'max_depth': 3, 'min_child_weight': 7, 'gamma': 0.8460686867542303, 'subsample': 0.7451601048902019, 'colsample_bytree': 0.9113247709735712, 'reg_alpha': 4.226467577293892, 'reg_lambda': 8.972982128196888}. Best is trial 0 with value: 0.6722093918398769.
[I 2025-01-03 14:24:17,218] Trial 2 finished with value: 0.68591224018475

({'n_estimators': 320,
  'learning_rate': 0.010763101596170385,
  'max_depth': 5,
  'min_child_weight': 3,
  'gamma': 0.23681972583222627,
  'subsample': 0.8952122711164143,
  'colsample_bytree': 0.9720894849351379,
  'reg_alpha': 0.08700173219794452,
  'reg_lambda': 3.713620299621038},
 0.7188606620477291)

In [128]:
params = {
    "n_estimators": 320,
    "learning_rate": 0.010763101596170385,
    "max_depth": 5,
    "min_child_weight": 3,
    "gamma": 0.23681972583222627,
    "subsample": 0.8952122711164143,
    "colsample_bytree": 0.9720894849351379,
    "reg_alpha": 0.08700173219794452,
    "reg_lambda": 3.713620299621038,
}

xgb_final = XGBClassifier(**params)
xgb_final.fit(train_x, train_y)
print(classification_report(valid_y, xgb_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.80      0.75      0.77      1060
           1       0.76      0.81      0.79      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



### category 형식 컬럼이 포함된 데이터셋

In [149]:
cutoff_patch = cat_train_ft["patch"].quantile(0.8)
train_games = cat_train_ft[cat_train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = cat_train_ft[cat_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = cat_train_ft[cat_train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = cat_train_ft[cat_train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

In [163]:
cat_features = train_x.select_dtypes("category").columns.tolist()

params = {
    "iterations": {"type": "int", "min": 100, "max": 1000},
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.3},
    "depth": {"type": "int", "min": 4, "max": 10},
    "l2_leaf_reg": {"type": "float", "min": 1, "max": 10},
    "min_child_samples": {"type": "int", "min": 5, "max": 50},
    "max_bin": {"type": "int", "min": 200, "max": 400},
    "verbose": {"type": "int", "min": 100, "max": 100}
}

cat_tuner = HyperparameterTuner(CatBoostClassifier, params, train_x, train_y, cat_features)
cat_tuner.optimize(20)
cat_tuner.best_params(), cat_tuner.best_score()

[I 2025-01-03 15:44:30,116] A new study created in memory with name: no-name-c378ea2a-ca86-4b8a-bfa2-0f857afce8c0


0:	learn: 0.6374294	total: 51.1ms	remaining: 27.1s
100:	learn: 0.3569930	total: 4.8s	remaining: 20.5s
200:	learn: 0.2552956	total: 9.61s	remaining: 15.8s
300:	learn: 0.1876246	total: 14.3s	remaining: 11s
400:	learn: 0.1412416	total: 19.1s	remaining: 6.24s
500:	learn: 0.1024402	total: 23.6s	remaining: 1.46s
531:	learn: 0.0940048	total: 25.1s	remaining: 0us


[I 2025-01-03 15:45:42,209] Trial 0 finished with value: 0.7074672825250192 and parameters: {'iterations': 532, 'learning_rate': 0.19705948309949128, 'depth': 6, 'l2_leaf_reg': 1.9050358710564357, 'min_child_samples': 9, 'max_bin': 368, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6303164	total: 19ms	remaining: 1.91s
100:	learn: 0.4190848	total: 2.21s	remaining: 21.9ms
101:	learn: 0.4185996	total: 2.23s	remaining: 0us


[I 2025-01-03 15:45:49,842] Trial 1 finished with value: 0.6937644341801386 and parameters: {'iterations': 102, 'learning_rate': 0.27022076694035896, 'depth': 4, 'l2_leaf_reg': 7.219723835628724, 'min_child_samples': 39, 'max_bin': 253, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6324082	total: 67.5ms	remaining: 11.4s
100:	learn: 0.3016073	total: 6.4s	remaining: 4.37s
169:	learn: 0.2144795	total: 10.1s	remaining: 0us


[I 2025-01-03 15:46:16,964] Trial 2 finished with value: 0.6914549653579677 and parameters: {'iterations': 170, 'learning_rate': 0.24287069200264594, 'depth': 7, 'l2_leaf_reg': 6.260674205590322, 'min_child_samples': 15, 'max_bin': 283, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6325562	total: 99.3ms	remaining: 1m 37s
100:	learn: 0.1745089	total: 12.1s	remaining: 1m 45s
200:	learn: 0.0653528	total: 24.2s	remaining: 1m 33s
300:	learn: 0.0330846	total: 36.1s	remaining: 1m 21s
400:	learn: 0.0194808	total: 48.1s	remaining: 1m 9s
500:	learn: 0.0121733	total: 1m	remaining: 57.6s
600:	learn: 0.0086606	total: 1m 12s	remaining: 45.5s
700:	learn: 0.0070331	total: 1m 23s	remaining: 33.4s
800:	learn: 0.0061521	total: 1m 35s	remaining: 21.4s
900:	learn: 0.0058394	total: 1m 47s	remaining: 9.44s
979:	learn: 0.0056860	total: 1m 57s	remaining: 0us


[I 2025-01-03 15:53:18,008] Trial 3 finished with value: 0.7059276366435718 and parameters: {'iterations': 980, 'learning_rate': 0.17840866857502644, 'depth': 9, 'l2_leaf_reg': 2.070873832591479, 'min_child_samples': 16, 'max_bin': 310, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6412436	total: 219ms	remaining: 32.3s
100:	learn: 0.1863690	total: 20.7s	remaining: 9.62s
147:	learn: 0.1282674	total: 30.3s	remaining: 0us


[I 2025-01-03 15:55:34,034] Trial 4 finished with value: 0.7005388760585066 and parameters: {'iterations': 148, 'learning_rate': 0.17269418750522209, 'depth': 10, 'l2_leaf_reg': 8.463194862893374, 'min_child_samples': 45, 'max_bin': 307, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6806225	total: 24.1ms	remaining: 8.06s
100:	learn: 0.5193382	total: 2.48s	remaining: 5.77s
200:	learn: 0.5018733	total: 5.09s	remaining: 3.42s
300:	learn: 0.4762388	total: 7.66s	remaining: 890ms
335:	learn: 0.4704335	total: 8.58s	remaining: 0us


[I 2025-01-03 15:55:58,399] Trial 5 finished with value: 0.6905311778290992 and parameters: {'iterations': 336, 'learning_rate': 0.03932799608481116, 'depth': 4, 'l2_leaf_reg': 5.9603260909487314, 'min_child_samples': 12, 'max_bin': 396, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6182551	total: 48ms	remaining: 30.8s
100:	learn: 0.2666718	total: 5.3s	remaining: 28.4s
200:	learn: 0.1474657	total: 10.4s	remaining: 22.9s
300:	learn: 0.0906665	total: 15.8s	remaining: 17.9s
400:	learn: 0.0594602	total: 20.9s	remaining: 12.6s
500:	learn: 0.0429891	total: 26.2s	remaining: 7.39s
600:	learn: 0.0314004	total: 31.3s	remaining: 2.14s
641:	learn: 0.0282352	total: 33.5s	remaining: 0us


[I 2025-01-03 15:57:40,201] Trial 6 finished with value: 0.6982294072363356 and parameters: {'iterations': 642, 'learning_rate': 0.285620927229721, 'depth': 7, 'l2_leaf_reg': 7.174644551796759, 'min_child_samples': 15, 'max_bin': 248, 'verbose': 100}. Best is trial 0 with value: 0.7074672825250192.


0:	learn: 0.6223546	total: 225ms	remaining: 1m 32s
100:	learn: 0.1408273	total: 23.5s	remaining: 1m 12s
200:	learn: 0.0563243	total: 46.9s	remaining: 49.4s
300:	learn: 0.0305627	total: 1m 10s	remaining: 26.1s
400:	learn: 0.0194663	total: 1m 33s	remaining: 2.8s
412:	learn: 0.0186581	total: 1m 36s	remaining: 0us


[I 2025-01-03 16:05:04,036] Trial 7 finished with value: 0.7091608929946112 and parameters: {'iterations': 413, 'learning_rate': 0.24110432469185597, 'depth': 10, 'l2_leaf_reg': 8.905869555950142, 'min_child_samples': 12, 'max_bin': 342, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6290736	total: 38.3ms	remaining: 26.2s
100:	learn: 0.3870281	total: 3.51s	remaining: 20.3s
200:	learn: 0.3099301	total: 6.99s	remaining: 16.9s
300:	learn: 0.2498665	total: 10.5s	remaining: 13.4s
400:	learn: 0.2056581	total: 13.9s	remaining: 9.89s
500:	learn: 0.1693519	total: 17.3s	remaining: 6.39s
600:	learn: 0.1410764	total: 20.8s	remaining: 2.95s
685:	learn: 0.1223399	total: 23.7s	remaining: 0us


[I 2025-01-03 16:06:08,209] Trial 8 finished with value: 0.7045419553502695 and parameters: {'iterations': 686, 'learning_rate': 0.238069474227145, 'depth': 5, 'l2_leaf_reg': 7.085006345765506, 'min_child_samples': 44, 'max_bin': 385, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6157708	total: 87.8ms	remaining: 1m 4s
100:	learn: 0.1329291	total: 10.8s	remaining: 1m 7s
200:	learn: 0.0472713	total: 21.3s	remaining: 56.5s
300:	learn: 0.0233726	total: 32.2s	remaining: 46.2s
400:	learn: 0.0148160	total: 42.8s	remaining: 35.5s
500:	learn: 0.0104299	total: 53.6s	remaining: 24.8s
600:	learn: 0.0089842	total: 1m 5s	remaining: 14.3s
700:	learn: 0.0073068	total: 1m 16s	remaining: 3.48s
732:	learn: 0.0070447	total: 1m 19s	remaining: 0us


[I 2025-01-03 16:10:56,802] Trial 9 finished with value: 0.7000769822940723 and parameters: {'iterations': 733, 'learning_rate': 0.2788678570491084, 'depth': 9, 'l2_leaf_reg': 4.320073285660059, 'min_child_samples': 47, 'max_bin': 293, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6623025	total: 241ms	remaining: 1m 38s
100:	learn: 0.3025695	total: 24.6s	remaining: 1m 15s
200:	learn: 0.1836016	total: 48.7s	remaining: 50.4s
300:	learn: 0.1216023	total: 1m 12s	remaining: 26.1s
400:	learn: 0.0845886	total: 1m 37s	remaining: 1.94s
408:	learn: 0.0822063	total: 1m 39s	remaining: 0us


[I 2025-01-03 16:18:16,298] Trial 10 finished with value: 0.7063895304080062 and parameters: {'iterations': 409, 'learning_rate': 0.09592566512777742, 'depth': 10, 'l2_leaf_reg': 9.661897534694884, 'min_child_samples': 27, 'max_bin': 345, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6357177	total: 40.7ms	remaining: 17s
100:	learn: 0.3477181	total: 4.04s	remaining: 12.8s
200:	learn: 0.2286040	total: 8.19s	remaining: 8.92s
300:	learn: 0.1650494	total: 12.3s	remaining: 4.86s
400:	learn: 0.1196664	total: 16.5s	remaining: 780ms
419:	learn: 0.1138337	total: 17.3s	remaining: 0us


[I 2025-01-03 16:19:05,881] Trial 11 finished with value: 0.6973056197074673 and parameters: {'iterations': 420, 'learning_rate': 0.20355392612036263, 'depth': 6, 'l2_leaf_reg': 1.1860637327442305, 'min_child_samples': 7, 'max_bin': 354, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6500330	total: 72.3ms	remaining: 38.4s
100:	learn: 0.3187424	total: 7.28s	remaining: 31.1s
200:	learn: 0.1913364	total: 14.5s	remaining: 23.8s
300:	learn: 0.1226936	total: 21.7s	remaining: 16.7s
400:	learn: 0.0869535	total: 29s	remaining: 9.47s
500:	learn: 0.0620226	total: 36.3s	remaining: 2.25s
531:	learn: 0.0561604	total: 38.6s	remaining: 0us


[I 2025-01-03 16:21:29,254] Trial 12 finished with value: 0.7063895304080062 and parameters: {'iterations': 532, 'learning_rate': 0.13358150047409056, 'depth': 8, 'l2_leaf_reg': 3.830510951032201, 'min_child_samples': 23, 'max_bin': 354, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6325525	total: 34.9ms	remaining: 10.1s
100:	learn: 0.3571426	total: 4.15s	remaining: 7.76s
200:	learn: 0.2520771	total: 8.36s	remaining: 3.7s
289:	learn: 0.1872483	total: 11.9s	remaining: 0us


[I 2025-01-03 16:22:03,969] Trial 13 finished with value: 0.703156274056967 and parameters: {'iterations': 290, 'learning_rate': 0.21037405858984537, 'depth': 6, 'l2_leaf_reg': 3.7800787762653334, 'min_child_samples': 6, 'max_bin': 333, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6550289	total: 40.4ms	remaining: 23s
100:	learn: 0.4095568	total: 4.14s	remaining: 19.2s
200:	learn: 0.3286316	total: 8.19s	remaining: 15s
300:	learn: 0.2623437	total: 12.4s	remaining: 11.1s
400:	learn: 0.2148885	total: 16.6s	remaining: 6.99s
500:	learn: 0.1761135	total: 20.6s	remaining: 2.84s
569:	learn: 0.1530227	total: 23.5s	remaining: 0us


[I 2025-01-03 16:23:12,700] Trial 14 finished with value: 0.702540415704388 and parameters: {'iterations': 570, 'learning_rate': 0.12612878319915896, 'depth': 6, 'l2_leaf_reg': 2.5939550421339153, 'min_child_samples': 22, 'max_bin': 376, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6306407	total: 58.8ms	remaining: 49.2s
100:	learn: 0.2583610	total: 6.63s	remaining: 48.4s
200:	learn: 0.1418571	total: 13.2s	remaining: 41.7s
300:	learn: 0.0862918	total: 19.8s	remaining: 35.3s
400:	learn: 0.0573152	total: 26.4s	remaining: 28.8s
500:	learn: 0.0417058	total: 33s	remaining: 22.2s
600:	learn: 0.0323531	total: 39.6s	remaining: 15.6s
700:	learn: 0.0255992	total: 46.3s	remaining: 9.05s
800:	learn: 0.0208407	total: 52.9s	remaining: 2.44s
837:	learn: 0.0194196	total: 55.4s	remaining: 0us


[I 2025-01-03 16:26:06,219] Trial 15 finished with value: 0.7053117782909931 and parameters: {'iterations': 838, 'learning_rate': 0.22566323990936496, 'depth': 8, 'l2_leaf_reg': 9.564085176740052, 'min_child_samples': 35, 'max_bin': 210, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6660875	total: 34.3ms	remaining: 17.2s
100:	learn: 0.4773678	total: 3.21s	remaining: 12.8s
200:	learn: 0.4159541	total: 6.38s	remaining: 9.59s
300:	learn: 0.3800139	total: 9.63s	remaining: 6.46s
400:	learn: 0.3447609	total: 12.9s	remaining: 3.27s
500:	learn: 0.3173355	total: 16s	remaining: 64.1ms
502:	learn: 0.3167111	total: 16.1s	remaining: 0us


[I 2025-01-03 16:26:50,465] Trial 16 finished with value: 0.6951501154734412 and parameters: {'iterations': 503, 'learning_rate': 0.08761347426905818, 'depth': 5, 'l2_leaf_reg': 4.801463447072912, 'min_child_samples': 5, 'max_bin': 326, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6403358	total: 60.7ms	remaining: 15.9s
100:	learn: 0.2883980	total: 7.74s	remaining: 12.3s
200:	learn: 0.1703837	total: 15.4s	remaining: 4.67s
261:	learn: 0.1257618	total: 19.9s	remaining: 0us


[I 2025-01-03 16:27:55,085] Trial 17 finished with value: 0.6960739030023095 and parameters: {'iterations': 262, 'learning_rate': 0.18208115089646232, 'depth': 8, 'l2_leaf_reg': 8.149283259220265, 'min_child_samples': 21, 'max_bin': 369, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6131817	total: 98.8ms	remaining: 43s
100:	learn: 0.1055968	total: 12.1s	remaining: 40.1s
200:	learn: 0.0317893	total: 24.1s	remaining: 28.2s
300:	learn: 0.0149272	total: 36.1s	remaining: 16.2s
400:	learn: 0.0085875	total: 48.2s	remaining: 4.21s
435:	learn: 0.0074640	total: 52.4s	remaining: 0us


[I 2025-01-03 16:31:12,153] Trial 18 finished with value: 0.6948421862971517 and parameters: {'iterations': 436, 'learning_rate': 0.29909074049051626, 'depth': 9, 'l2_leaf_reg': 2.694238373580623, 'min_child_samples': 12, 'max_bin': 400, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


0:	learn: 0.6267258	total: 27.4ms	remaining: 16.3s
100:	learn: 0.3609035	total: 3.3s	remaining: 16.2s
200:	learn: 0.2729347	total: 6.44s	remaining: 12.7s
300:	learn: 0.2056813	total: 9.88s	remaining: 9.72s
400:	learn: 0.1610289	total: 13.6s	remaining: 6.63s
500:	learn: 0.1301024	total: 16.9s	remaining: 3.24s
596:	learn: 0.1038996	total: 21s	remaining: 0us


[I 2025-01-03 16:32:06,096] Trial 19 finished with value: 0.7042340261739801 and parameters: {'iterations': 597, 'learning_rate': 0.2529322125660319, 'depth': 5, 'l2_leaf_reg': 1.0297359277154092, 'min_child_samples': 33, 'max_bin': 273, 'verbose': 100}. Best is trial 7 with value: 0.7091608929946112.


({'iterations': 413,
  'learning_rate': 0.24110432469185597,
  'depth': 10,
  'l2_leaf_reg': 8.905869555950142,
  'min_child_samples': 12,
  'max_bin': 342,
  'verbose': 100},
 0.7091608929946112)

In [166]:
params = {
    "iterations": 413,
    "learning_rate": 0.24110432469185597,
    "depth": 10,
    "l2_leaf_reg": 8.905869555950142,
    "min_child_samples": 12,
    "max_bin": 342,
    "verbose": 100,
}

cat_final = CatBoostClassifier(**params, cat_features=cat_features, logging_level="Silent")
cat_final.fit(train_x, train_y)
print(classification_report(valid_y, cat_final.predict(valid_x)))

0:	learn: 0.6223546	total: 232ms	remaining: 1m 35s
100:	learn: 0.1408273	total: 23.8s	remaining: 1m 13s
200:	learn: 0.0563243	total: 47.5s	remaining: 50.1s
300:	learn: 0.0305627	total: 1m 11s	remaining: 26.6s
400:	learn: 0.0194663	total: 1m 36s	remaining: 2.88s
412:	learn: 0.0186581	total: 1m 39s	remaining: 0us
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      1060
           1       0.78      0.77      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114

