In [62]:
import pandas as pd
import numpy as np

DATA_PATH = "LoLesports_data/"
SEED = 42

teams_train = pd.read_csv(f"{DATA_PATH}teams_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}teams_test.csv")
players_train = pd.read_csv(f"{DATA_PATH}players_train.csv")
players_test = pd.read_csv(f"{DATA_PATH}players_test.csv")

teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")
players_train_target = pd.read_csv(f"{DATA_PATH}players_train_target.csv")
players_test_target = pd.read_csv(f"{DATA_PATH}players_test_target.csv")

# 컬럼 추가

## 상대 팀 추가

In [63]:
temp_opp_teams = teams_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)
temp_opp_teams = teams_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

temp_opp_players = players_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_train = pd.concat([players_train, temp_opp_players], axis=1)
temp_opp_players = players_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_test = pd.concat([players_test, temp_opp_players], axis=1)

## 날짜 추가

In [64]:
teams_train["date"] = pd.to_datetime(teams_train["date"])
teams_test["date"] = pd.to_datetime(teams_test["date"])

players_train["date"] = pd.to_datetime(players_train["date"])
players_test["date"] = pd.to_datetime(players_test["date"])

teams_train["year"] = teams_train["date"].dt.year
teams_train["month"] = teams_train["date"].dt.month
teams_train["day"] = teams_train["date"].dt.day

players_train["year"] = players_train["date"].dt.year
players_train["month"] = players_train["date"].dt.month
players_train["day"] = players_train["date"].dt.day

teams_test["year"] = teams_test["date"].dt.year
teams_test["month"] = teams_test["date"].dt.month
teams_test["day"] = teams_test["date"].dt.day

players_test["year"] = players_test["date"].dt.year
players_test["month"] = players_test["date"].dt.month
players_test["day"] = players_test["date"].dt.day

## 데이터 타입 변경

In [65]:
cols = ["league", "split", "teamname", "opp_teamname", "ban1", "ban2", "ban3", "ban4", "ban5", "pick1", "pick2", "pick3", "pick4", "pick5"]

teams_train[cols] = teams_train[cols].astype("category")
teams_test[cols] = teams_test[cols].astype("category")

In [66]:
teams_train.head()

Unnamed: 0,gameid,league,split,playoffs,date,game,patch,side,teamname,ban1,...,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15,opp_teamname,year,month,day
0,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Blue,DRX,Diana,...,4.0,7.0,1.0,1.0,1.0,4.0,BNK FEARX,2022,1,12
1,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Red,BNK FEARX,Renekton,...,1.0,1.0,4.0,4.0,7.0,1.0,DRX,2022,1,12
2,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Blue,DRX,Diana,...,2.0,5.0,4.0,4.0,5.0,2.0,BNK FEARX,2022,1,12
3,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Red,BNK FEARX,Renekton,...,4.0,5.0,2.0,2.0,5.0,4.0,DRX,2022,1,12
4,ESPORTSTMNT01_2690705,LCK,Spring,0,2022-01-12 10:07:00,1,12.01,Blue,T1,Lee Sin,...,3.0,2.0,1.0,1.0,1.0,3.0,Kwangdong Freecs,2022,1,12


# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [67]:
pre_game_features = [
    "gameid",
    "patch",
    "side",
    "league",
    "teamname",
    "opp_teamname",
    "ban1",
    "ban2",
    "ban3",
    "ban4",
    "ban5",
    "pick1",
    "pick2",
    "pick3",
    "pick4",
    "pick5",
    "year",
    "month",
    "day",
]

train_ft = teams_train[pre_game_features]
test_ft = teams_test[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 19), (2324, 19))

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [68]:
stats_columns = [
    "result",
    "gamelength",
    "kills",
    "deaths",
    "assists",
    "firstblood",
    "team kpm",
    "ckpm",
    "firstdragon",
    "firstherald",
    "void_grubs",
    "firstbaron",
    "firsttower",
    "towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "inhibitors",
    "damagetochampions",
    "dpm",
    "damagetakenperminute",
    "damagemitigatedperminute",
    "wardsplaced",
    "wpm",
    "wardskilled",
    "wcpm",
    "controlwardsbought",
    "visionscore",
    "vspm",
]

In [69]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)
temp_test = teams_test.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f'recent10_{col}': recent10_train})
    
    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(['teamname', 'year', 'month', 'day'])
    recent10_combined = combined_data.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f'recent10_{col}': recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f'recent10_{col}'].values
    test_ft = test_ft.assign(**{f'recent10_{col}': recent10_test})
    
    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    train_ft = train_ft.assign(**{f'opp_recent10_{col}': merged_train[f'recent10_{col}_opp']})
    
    merged_test = test_ft.merge(
        combined_data[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    test_ft = test_ft.assign(**{f'opp_recent10_{col}': merged_test[f'recent10_{col}_opp']})
    
    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == 'result' else 0
    train_ft = train_ft.assign(**{
        f'recent10_{col}': train_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': train_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    test_ft = test_ft.assign(**{
        f'recent10_{col}': test_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': test_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    
    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f'recent10_{col}', f'opp_recent10_{col}'])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 77), (2324, 77))

In [70]:
train_ft.head()

Unnamed: 0,gameid,patch,side,league,teamname,opp_teamname,ban1,ban2,ban3,ban4,...,recent10_wardskilled,opp_recent10_wardskilled,recent10_wcpm,opp_recent10_wcpm,recent10_controlwardsbought,opp_recent10_controlwardsbought,recent10_visionscore,opp_recent10_visionscore,recent10_vspm,opp_recent10_vspm
0,ESPORTSTMNT01_2700815,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Twisted Fate,LeBlanc,...,0.0,35.0,0.0,1.3166,0.0,30.0,0.0,207.0,0.0,7.7868
1,ESPORTSTMNT01_2700815,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Leona,Jayce,...,35.0,45.666667,1.3166,1.464567,30.0,40.0,207.0,250.333333,7.7868,8.068033
2,ESPORTSTMNT01_2690695,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Yuumi,Samira,...,42.0,0.0,1.40225,0.0,38.5,0.0,242.5,0.0,8.11405,0.0
3,ESPORTSTMNT01_2690695,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Twisted Fate,Viktor,...,45.666667,42.0,1.464567,1.40225,40.0,38.5,250.333333,242.5,8.068033,8.11405
4,ESPORTSTMNT01_2690705,12.01,Blue,LCK,T1,Kwangdong Freecs,Lee Sin,Ryze,Viktor,LeBlanc,...,49.5,35.0,1.61275,1.3166,40.75,30.0,247.0,207.0,8.04935,7.7868


### 상대 전적

In [71]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 78), (2324, 78))

### 각 팀의 리그별 승률

In [72]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values('date')

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 각 패치 버전 사이드별 승률

In [73]:
# # 패치 버전 사이드별 승률 기록을 저장할 딕셔너리
# patch_side_records = {}
# patch_side_winrates = []

# # 날짜순으로 정렬
# combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
# combined_data = combined_data.sort_values('date')

# # 패치/사이드별 승률 계산
# for idx, match in combined_data.iterrows():
#     patch = match['patch']
#     side = match['side']
#     key = (patch, side)
    
#     # 현재 시점까지의 패치/사이드별 승률 계산
#     if key not in patch_side_records:
#         patch_side_records[key] = {'wins': 0, 'total': 0}
#         patch_side_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
#     else:
#         record = patch_side_records[key]
#         patch_side_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
#     # 현재 경기 결과 반영
#     result = match['result']
#     patch_side_records[key]['total'] += 1
#     if result == 1:
#         patch_side_records[key]['wins'] += 1

# # 계산된 승률을 훈련/테스트 데이터에 할당
# train_ft['patch_side_winrate'] = patch_side_winrates[:len(teams_train)]
# test_ft['patch_side_winrate'] = patch_side_winrates[len(teams_train):]

# # 특성 리스트에 patch_side_winrate 추가
# pre_game_features.append('patch_side_winrate')

# # 입력 데이터 업데이트
# train_ft = train_ft[pre_game_features]
# test_ft = test_ft[pre_game_features]

# train_ft.shape, test_ft.shape

### 픽 챔피언 지표

In [74]:
# df = teams_train.copy()
# df = df.sort_values(["teamname", "date", "gameid"])  # 시계열 정렬

# for slot in ["pick1", "pick2", "pick3", "pick4", "pick5"]:
#     # 1) 챔피언 컬럼 만들기
#     df_pick = df[["gameid", "teamname", "date", "result", slot]].copy()
#     df_pick.rename(columns={slot: "champion"}, inplace=True)

#     # 2) 챔피언을 category로 바꾸면 메모리 절약에 도움
#     df_pick["champion"] = df_pick["champion"].astype("category")

#     # 3) groupby + cumsum + shift(1)로 "직전까지" 누적
#     df_pick["pick_ind"] = 1
#     df_pick["win_ind"] = (df_pick["result"] == 1).astype(int)

#     df_pick["cum_pick_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["pick_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["win_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_rate"] = (
#         df_pick["cum_win_count"] / df_pick["cum_pick_count"]
#     ).fillna(0)

#     # 4) 필요한 컬럼만 남겨서, 컬럼 이름으로 바꾸기
#     df_pick = df_pick[
#         ["gameid", "teamname", "date", "champion", "cum_pick_count", "cum_win_rate"]
#     ].copy()

#     df_pick.rename(
#         columns={
#             "champion": f"{slot}_champion",  # 구분용
#             "cum_pick_count": f"{slot}_cum_pick_count",
#             "cum_win_rate": f"{slot}_cum_win_rate",
#         },
#         inplace=True,
#     )
    
#     # 5) 원본 df와 merge
#     df = pd.merge(
#         df,
#         df_pick[
#             [
#                 "gameid",
#                 "teamname",
#                 "date",
#                 f"{slot}_champion",
#                 f"{slot}_cum_pick_count",
#                 f"{slot}_cum_win_rate",
#             ]
#         ],
#         left_on=["gameid", "teamname", "date", f"{slot}"],  # 조인 키
#         right_on=["gameid", "teamname", "date", f"{slot}_champion"],
#         how="left",
#     )

# 인코딩

In [75]:
train_ft["side"] = train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
test_ft["side"] = test_ft["side"].map({"Blue": 0, "Red": 1})

In [76]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cat_train_ft = train_ft.copy()
cat_test_ft = test_ft.copy()

def preprocess(train_ft, test_ft):
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5'] # 챔피언 레이블인코딩

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().unique()

    champions_df = pd.DataFrame({'champion': champions})
    champions_df = champions_df.dropna().reset_index(drop=True)

    le = LabelEncoder()
    champions_df['champion_encoded'] = le.fit_transform(champions_df['champion'])

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])
        
    encoder = OneHotEncoder() # 리그 원핫인코딩
    league_encoded = encoder.fit_transform(train_ft[["league"]]).toarray()
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    train_ft = pd.concat(
        [train_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    train_ft.drop("league", axis=1, inplace=True)

    league_encoded = encoder.transform(test_ft[["league"]]).toarray()
    test_ft = pd.concat(
        [test_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    test_ft.drop("league", axis=1, inplace=True)

    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    train_ft["teamname"] = le_team.transform(train_ft["teamname"])
    train_ft["opp_teamname"] = le_team.transform(train_ft["opp_teamname"])

    test_ft["teamname"] = le_team.transform(test_ft["teamname"])
    test_ft["opp_teamname"] = le_team.transform(test_ft["opp_teamname"])
    
    return train_ft, test_ft

train_ft, test_ft = preprocess(train_ft, test_ft)

In [77]:
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [78]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    train_ft[train_ft.select_dtypes("number").columns] = scaler.fit_transform(
        train_ft[train_ft.select_dtypes("number").columns]
    )
    test_ft[test_ft.select_dtypes("number").columns] = scaler.transform(
        test_ft[test_ft.select_dtypes("number").columns]
    )
    return train_ft, test_ft


train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

# 모델 학습 및 검증

- 하이퍼파라미터 튜닝 클래스

In [79]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    def __init__(self, model, params, train, target, cat_features=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        self.study = optuna.create_study(direction="maximize")

    def objective(self, trial):
        params = {}
        
        for param_name, param_range in self.params.items():
            if param_range["type"] == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_range["type"] == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
        if self.model == CatBoostClassifier:
            model = self.model(**params, cat_features=self.cat_features)
        else:
            model = self.model(**params)

        model.fit(self.train, self.target)
            
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        return self.study.best_params

    def best_score(self):
        return self.study.best_value

### 모든 컬럼 형식이 number인 데이터셋

In [80]:
pre_game_features.remove("league")

cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = train_ft[train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = train_ft[train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


models = [
    LogisticRegression(random_state=SEED),
    LGBMClassifier(random_state=SEED, n_jobs=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0),
]

for model in models:
    scores = cross_val_score(model, train_x, train_y, cv=TimeSeriesSplit(5), scoring="accuracy", n_jobs=-1)
    print(f"{model.__class__.__name__} : {np.mean(scores)}, {np.std(scores)}")

LogisticRegression : 0.6663587374903772, 0.02506645631585489
LGBMClassifier : 0.7153194765204003, 0.0462211848391806
RandomForestClassifier : 0.6605080831408776, 0.03514307070211957
HistGradientBoostingClassifier : 0.7183987682832949, 0.05259057124928453
AdaBoostClassifier : 0.657428791377983, 0.034324090545970666
SVC : 0.6558891454965358, 0.024397436759804655
XGBClassifier : 0.7085450346420323, 0.03684421170357534
CatBoostClassifier : 0.7040800615858352, 0.03792845052087791


- LogisticRegression

In [21]:
params = {
    "C": {"type": "float", "min": 0.01, "max": 10},
    "penalty": {"type": "categorical", "values": ["l1", "l2"]},
    "solver": {"type": "categorical", "values": ["liblinear", "saga"]},
    "max_iter": {"type": "int", "min": 100, "max": 2000},
}

lr_vt_tuner = HyperparameterTuner(LogisticRegression, params, train_x, train_y)
lr_vt_tuner.optimize(100)
lr_vt_tuner.best_params(), lr_vt_tuner.best_score()

[I 2025-01-06 09:03:44,782] A new study created in memory with name: no-name-e237ba82-80ac-40fb-a5d9-364d36cfa360
[I 2025-01-06 09:03:47,543] Trial 0 finished with value: 0.6652809853733641 and parameters: {'C': 6.643460211427657, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 272}. Best is trial 0 with value: 0.6652809853733641.
[I 2025-01-06 09:04:00,504] Trial 1 finished with value: 0.6648190916089299 and parameters: {'C': 6.140850379802095, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 1117}. Best is trial 0 with value: 0.6652809853733641.
[I 2025-01-06 09:04:03,594] Trial 2 finished with value: 0.6657428791377983 and parameters: {'C': 3.881754436771767, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 590}. Best is trial 2 with value: 0.6657428791377983.
[I 2025-01-06 09:04:06,219] Trial 3 finished with value: 0.6629715165511932 and parameters: {'C': 6.828068755388412, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 786}. Best is trial 2 with value: 0.6657428791377983.
[I 

({'C': 0.38414964961856957,
  'penalty': 'l1',
  'solver': 'liblinear',
  'max_iter': 1903},
 0.6712856043110085)

In [81]:
params = {
    "C": 0.38414964961856957,
    "penalty": "l1",
    "solver": "liblinear",
    "max_iter": 1903
}

lr_final = LogisticRegression(**params)

lr_final.fit(train_x, train_y)

print(classification_report(valid_y, lr_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.71      0.64      0.67      1060
           1       0.67      0.73      0.70      1054

    accuracy                           0.69      2114
   macro avg       0.69      0.69      0.69      2114
weighted avg       0.69      0.69      0.69      2114



- LightGBM

In [26]:
params = {
    "n_estimators": {"type": "int", "min": 100, "max": 300},
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.1}, 
    "max_depth": {"type": "int", "min": 8, "max": 12},
    "num_leaves": {"type": "int", "min": 100, "max": 150},
    "min_child_samples": {"type": "int", "min": 5, "max": 15},
    "subsample": {"type": "float", "min": 0.5, "max": 0.7},
    "colsample_bytree": {"type": "float", "min": 0.4, "max": 0.6},
    "reg_alpha": {"type": "float", "min": 0.001, "max": 0.1},
    "reg_lambda": {"type": "float", "min": 3.0, "max": 6.0},
    "verbose": {"type": "int", "min": -1, "max": -1}
}

lgbm_tuner = HyperparameterTuner(LGBMClassifier, params, train_x, train_y)
lgbm_tuner.optimize(100)
lgbm_tuner.best_params(), lgbm_tuner.best_score()

[I 2025-01-06 09:13:24,946] A new study created in memory with name: no-name-4538e975-e6ef-4329-9c9f-ff954bc3300e
[I 2025-01-06 09:13:27,932] Trial 0 finished with value: 0.71270207852194 and parameters: {'n_estimators': 124, 'learning_rate': 0.06128646373517623, 'max_depth': 10, 'num_leaves': 100, 'min_child_samples': 14, 'subsample': 0.6868482855444905, 'colsample_bytree': 0.4816794745028867, 'reg_alpha': 0.03707125995062092, 'reg_lambda': 5.44872827687802, 'verbose': -1}. Best is trial 0 with value: 0.71270207852194.
[I 2025-01-06 09:13:34,384] Trial 1 finished with value: 0.7111624326404927 and parameters: {'n_estimators': 270, 'learning_rate': 0.0717770467477121, 'max_depth': 10, 'num_leaves': 147, 'min_child_samples': 6, 'subsample': 0.5818768464960494, 'colsample_bytree': 0.5956788537851827, 'reg_alpha': 0.060704551936466726, 'reg_lambda': 5.965449113651904, 'verbose': -1}. Best is trial 0 with value: 0.71270207852194.
[I 2025-01-06 09:13:38,080] Trial 2 finished with value: 0.7

({'n_estimators': 285,
  'learning_rate': 0.023007542937157222,
  'max_depth': 9,
  'num_leaves': 101,
  'min_child_samples': 9,
  'subsample': 0.6754653255644233,
  'colsample_bytree': 0.5153479009794544,
  'reg_alpha': 0.07512515626736012,
  'reg_lambda': 3.3370499751525755,
  'verbose': -1},
 0.7271747498075443)

In [82]:
params = {
    "n_estimators": 285,
    "learning_rate": 0.023007542937157222,
    "max_depth": 9,
    "num_leaves": 101,
    "min_child_samples": 9,
    "subsample": 0.6754653255644233,
    "colsample_bytree": 0.5153479009794544,
    "reg_alpha": 0.07512515626736012,
    "reg_lambda": 3.3370499751525755,
    "verbose": -1,
}


lgbm_final = LGBMClassifier(**params)
lgbm_final.fit(train_x, train_y)
print(classification_report(valid_y, lgbm_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1060
           1       0.78      0.79      0.79      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



- RandomForestClassifier

In [30]:
params = {
    "n_estimators": {"type": "int", "min": 800, "max": 1100},
    "max_depth": {"type": "int", "min": 15, "max": 21},
    "min_samples_split": {"type": "int", "min": 15, "max": 23},
    "min_samples_leaf": {"type": "int", "min": 7, "max": 11},
    "max_features": {"type": "float", "min": 0.7, "max": 0.85},
    "bootstrap": {"type": "categorical", "values": [False]},
    "class_weight": {"type": "categorical", "values": ["balanced"]}
}

rf_tuner = HyperparameterTuner(RandomForestClassifier, params, train_x, train_y)
rf_tuner.optimize(20)
rf_tuner.best_params(), rf_tuner.best_score()

[I 2025-01-06 09:19:14,051] A new study created in memory with name: no-name-7901a959-7c34-431f-a921-ddbe01a95791
[I 2025-01-06 09:25:25,819] Trial 0 finished with value: 0.7145496535796767 and parameters: {'n_estimators': 979, 'max_depth': 16, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 0.7250125705210512, 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7145496535796767.
[I 2025-01-06 09:31:12,397] Trial 1 finished with value: 0.7168591224018476 and parameters: {'n_estimators': 956, 'max_depth': 16, 'min_samples_split': 19, 'min_samples_leaf': 8, 'max_features': 0.7046712723377266, 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7168591224018476.
[I 2025-01-06 09:37:41,073] Trial 2 finished with value: 0.7179368745188606 and parameters: {'n_estimators': 949, 'max_depth': 19, 'min_samples_split': 22, 'min_samples_leaf': 10, 'max_features': 0.8097480916988163, 'bootstrap': False, 'class_weight': 'balanced'

({'n_estimators': 916,
  'max_depth': 21,
  'min_samples_split': 23,
  'min_samples_leaf': 9,
  'max_features': 0.8438292517251353,
  'bootstrap': False,
  'class_weight': 'balanced'},
 0.7185527328714396)

In [83]:
params = {
    "n_estimators": 954,
    "max_depth": 18,
    "min_samples_split": 19,
    "min_samples_leaf": 9,
    "max_features": 0.7814902230628112,
    "bootstrap": False,
    "class_weight": "balanced",
}

rf_final = RandomForestClassifier(**params)
rf_final.fit(train_x, train_y)
print(classification_report(valid_y, rf_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76      1060
           1       0.75      0.80      0.77      1054

    accuracy                           0.77      2114
   macro avg       0.77      0.77      0.77      2114
weighted avg       0.77      0.77      0.77      2114



- HistGradientBoostingClassifier

In [33]:
params = {
    "learning_rate": {"type": "float", "min": 0.01, "max": 0.1},
    "max_depth": {"type": "int", "min": 8, "max": 15},
    "max_iter": {"type": "int", "min": 100, "max": 300},
    "min_samples_leaf": {"type": "int", "min": 5, "max": 15},
    "l2_regularization": {"type": "float", "min": 0.5, "max": 3.0},
    "max_leaf_nodes": {"type": "int", "min": 40, "max": 90}
}

hgbc_tuner = HyperparameterTuner(HistGradientBoostingClassifier, params, train_x, train_y)
hgbc_tuner.optimize(100)
hgbc_tuner.best_params(), hgbc_tuner.best_score()

[I 2025-01-06 11:36:00,248] A new study created in memory with name: no-name-f6e06582-f708-4791-a7d8-d939124450dc
[I 2025-01-06 11:36:05,648] Trial 0 finished with value: 0.7185527328714396 and parameters: {'learning_rate': 0.029579622837584058, 'max_depth': 8, 'max_iter': 140, 'min_samples_leaf': 14, 'l2_regularization': 0.7809723320460857, 'max_leaf_nodes': 90}. Best is trial 0 with value: 0.7185527328714396.
[I 2025-01-06 11:36:16,034] Trial 1 finished with value: 0.7245573518090839 and parameters: {'learning_rate': 0.04415991087472192, 'max_depth': 13, 'max_iter': 234, 'min_samples_leaf': 7, 'l2_regularization': 2.4451110859810057, 'max_leaf_nodes': 69}. Best is trial 1 with value: 0.7245573518090839.
[I 2025-01-06 11:36:20,902] Trial 2 finished with value: 0.7240954580446497 and parameters: {'learning_rate': 0.03691044053932341, 'max_depth': 10, 'max_iter': 118, 'min_samples_leaf': 5, 'l2_regularization': 0.8683846605628738, 'max_leaf_nodes': 68}. Best is trial 1 with value: 0.724

({'learning_rate': 0.022119818280047138,
  'max_depth': 12,
  'max_iter': 276,
  'min_samples_leaf': 5,
  'l2_regularization': 0.9584267642328501,
  'max_leaf_nodes': 44},
 0.7319476520400308)

In [84]:
params = {
    "learning_rate": 0.022119818280047138,
    "max_depth": 12,
    "max_iter": 276,
    "min_samples_leaf": 5,
    "l2_regularization": 0.9584267642328501,
    "max_leaf_nodes": 44,
}


hgbc_final = HistGradientBoostingClassifier(**params)
hgbc_final.fit(train_x, train_y)
print(classification_report(valid_y, hgbc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1060
           1       0.78      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



- AdaBoostClassifier

In [43]:
params = {
    "n_estimators": {"type": "int", "min": 200, "max": 400},
    "learning_rate": {"type": "float", "min": 0.1, "max": 0.25},
    "algorithm": {"type": "categorical", "values": ["SAMME.R"]}
}

ada_tuner = HyperparameterTuner(AdaBoostClassifier, params, train_x, train_y)
ada_tuner.optimize(50)
ada_tuner.best_params(), ada_tuner.best_score()

[I 2025-01-06 11:51:35,903] A new study created in memory with name: no-name-04530b18-a2b6-4a62-bc1a-f671f8e9dac6
[I 2025-01-06 11:51:52,744] Trial 0 finished with value: 0.6637413394919169 and parameters: {'n_estimators': 281, 'learning_rate': 0.23125292711941464, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.6637413394919169.
[I 2025-01-06 11:52:15,998] Trial 1 finished with value: 0.664203233256351 and parameters: {'n_estimators': 387, 'learning_rate': 0.17861974588076562, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.664203233256351.
[I 2025-01-06 11:52:28,463] Trial 2 finished with value: 0.6657428791377983 and parameters: {'n_estimators': 210, 'learning_rate': 0.2047971292472742, 'algorithm': 'SAMME.R'}. Best is trial 2 with value: 0.6657428791377983.
[I 2025-01-06 11:52:52,123] Trial 3 finished with value: 0.6634334103156274 and parameters: {'n_estimators': 394, 'learning_rate': 0.23358916737312685, 'algorithm': 'SAMME.R'}. Best is trial 2 with value: 0.6657428

({'n_estimators': 358,
  'learning_rate': 0.13883883597100793,
  'algorithm': 'SAMME.R'},
 0.6689761354888376)

In [85]:
params = {
    "n_estimators": 358,
    "learning_rate": 0.13883883597100793,
    "algorithm": "SAMME.R",
}


ada_final = AdaBoostClassifier(**params)

ada_final.fit(train_x, train_y)

print(classification_report(valid_y, ada_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1060
           1       0.73      0.73      0.73      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114



- SVC

In [34]:
params = {
    "C": {"type": "float", "min": 0.1, "max": 0.5},
    "kernel": {"type": "categorical", "values": ["linear"]},
    "degree": {"type": "int", "min": 3, "max": 5},
    "gamma": {"type": "float", "min": 0.4, "max": 0.9},
    "coef0": {"type": "float", "min": 1.5, "max": 4.0},
    "class_weight": {"type": "categorical", "values": [None]}
}

svc_tuner = HyperparameterTuner(SVC, params, train_x, train_y)
svc_tuner.optimize(100)
svc_tuner.best_params(), svc_tuner.best_score()

[I 2025-01-06 11:44:04,484] A new study created in memory with name: no-name-a6851268-65e3-46b3-8026-6dec84b6d2f8
[I 2025-01-06 11:44:07,734] Trial 0 finished with value: 0.6671285604311008 and parameters: {'C': 0.499829361009379, 'kernel': 'linear', 'degree': 3, 'gamma': 0.4018076799113799, 'coef0': 2.3136904919586767, 'class_weight': None}. Best is trial 0 with value: 0.6671285604311008.
[I 2025-01-06 11:44:10,974] Trial 1 finished with value: 0.6669745958429563 and parameters: {'C': 0.47284336871581756, 'kernel': 'linear', 'degree': 3, 'gamma': 0.4393975121556339, 'coef0': 1.5284075248939768, 'class_weight': None}. Best is trial 0 with value: 0.6671285604311008.
[I 2025-01-06 11:44:14,156] Trial 2 finished with value: 0.6666666666666666 and parameters: {'C': 0.4376632826177409, 'kernel': 'linear', 'degree': 5, 'gamma': 0.4710733792061468, 'coef0': 3.4146985142197117, 'class_weight': None}. Best is trial 0 with value: 0.6671285604311008.
[I 2025-01-06 11:44:17,067] Trial 3 finished w

({'C': 0.12947471032485133,
  'kernel': 'linear',
  'degree': 3,
  'gamma': 0.8257983403675284,
  'coef0': 1.5244055890464454,
  'class_weight': None},
 0.6706697459584297)

In [86]:
params = {
    "C": 0.21950805677161292,
    "kernel": "linear",
    "degree": 4,
    "gamma": 0.671045772731431,
    "coef0": 2.7929809033044726,
    "class_weight": None,
}

svc_final = SVC(**params)
svc_final.fit(train_x, train_y)
print(classification_report(valid_y, svc_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.70      0.63      0.67      1060
           1       0.66      0.73      0.70      1054

    accuracy                           0.68      2114
   macro avg       0.68      0.68      0.68      2114
weighted avg       0.68      0.68      0.68      2114



- XGBClassifier

In [45]:
params = {
    "n_estimators": {"type": "int", "min": 300, "max": 400},
    "learning_rate": {"type": "float", "min": 0.005, "max": 0.02},
    "max_depth": {"type": "int", "min": 4, "max": 6},
    "min_child_weight": {"type": "int", "min": 2, "max": 4},
    "gamma": {"type": "float", "min": 0.1, "max": 0.4},
    "subsample": {"type": "float", "min": 0.8, "max": 1.0},
    "colsample_bytree": {"type": "float", "min": 0.9, "max": 1.0},
    "reg_alpha": {"type": "float", "min": 0.05, "max": 0.2},
    "reg_lambda": {"type": "float", "min": 3.0, "max": 4.5}
}

sgb_tuner = HyperparameterTuner(XGBClassifier, params, train_x, train_y)
sgb_tuner.optimize(100)
sgb_tuner.best_params(), sgb_tuner.best_score()

[I 2025-01-06 12:08:49,354] A new study created in memory with name: no-name-2deee604-b8d5-4b0a-a410-f0ecd7ce02bd
[I 2025-01-06 12:08:51,833] Trial 0 finished with value: 0.7076212471131639 and parameters: {'n_estimators': 324, 'learning_rate': 0.010826269332899338, 'max_depth': 4, 'min_child_weight': 4, 'gamma': 0.3871661046616971, 'subsample': 0.9718697674578612, 'colsample_bytree': 0.9733765919564145, 'reg_alpha': 0.08537238436857866, 'reg_lambda': 3.971684506833455}. Best is trial 0 with value: 0.7076212471131639.
[I 2025-01-06 12:08:54,181] Trial 1 finished with value: 0.7117782909930715 and parameters: {'n_estimators': 323, 'learning_rate': 0.01657846609337131, 'max_depth': 4, 'min_child_weight': 3, 'gamma': 0.22606362871510355, 'subsample': 0.9342534147669997, 'colsample_bytree': 0.9590436538877334, 'reg_alpha': 0.13672586757923827, 'reg_lambda': 3.4710133684189834}. Best is trial 1 with value: 0.7117782909930715.
[I 2025-01-06 12:08:57,380] Trial 2 finished with value: 0.719014

({'n_estimators': 337,
  'learning_rate': 0.015272630148352066,
  'max_depth': 5,
  'min_child_weight': 2,
  'gamma': 0.24988522273215766,
  'subsample': 0.9639840429354903,
  'colsample_bytree': 0.985608479043216,
  'reg_alpha': 0.1856156681311941,
  'reg_lambda': 3.4637470458659014},
 0.7219399538106235)

In [87]:
params = {
    "n_estimators": 337,
    "learning_rate": 0.015272630148352066,
    "max_depth": 5,
    "min_child_weight": 2,
    "gamma": 0.24988522273215766,
    "subsample": 0.9639840429354903,
    "colsample_bytree": 0.985608479043216,
    "reg_alpha": 0.1856156681311941,
    "reg_lambda": 3.4637470458659014,
}


xgb_final = XGBClassifier(**params)

xgb_final.fit(train_x, train_y)

print(classification_report(valid_y, xgb_final.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78      1060
           1       0.77      0.80      0.79      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



### category 형식 컬럼이 포함된 데이터셋

In [88]:
cutoff_patch = cat_train_ft["patch"].quantile(0.8)
train_games = cat_train_ft[cat_train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = cat_train_ft[cat_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

cat_train_x = cat_train_ft[cat_train_ft["gameid"].isin(train_games)][pre_game_features]
cat_valid_x = cat_train_ft[cat_train_ft["gameid"].isin(valid_games)][pre_game_features]

cat_train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
cat_valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

cat_train_x.drop(columns=["gameid"], inplace=True)
cat_valid_x.drop(columns=["gameid"], inplace=True)

In [53]:
cat_features = train_x.select_dtypes("category").columns.tolist()

params = {
    "iterations": {"type": "int", "min": 300, "max": 600},
    "learning_rate": {"type": "float", "min": 0.15, "max": 0.3},
    "depth": {"type": "int", "min": 8, "max": 12},
    "l2_leaf_reg": {"type": "float", "min": 6.0, "max": 10.0},
    "min_child_samples": {"type": "int", "min": 8, "max": 16},
    "max_bin": {"type": "int", "min": 300, "max": 400},
    "verbose": {"type": "int", "min": 100, "max": 100}
}

cat_tuner = HyperparameterTuner(CatBoostClassifier, params, cat_train_x, cat_train_y, cat_features)
cat_tuner.optimize(20)
cat_tuner.best_params(), cat_tuner.best_score()

[I 2025-01-06 12:28:59,908] A new study created in memory with name: no-name-9eb9fa3a-fdaa-4d84-a3e7-ef1324362fe0


0:	learn: 0.6074296	total: 535ms	remaining: 3m 55s
100:	learn: 0.0669953	total: 50.6s	remaining: 2m 49s
200:	learn: 0.0245080	total: 1m 40s	remaining: 1m 59s
300:	learn: 0.0137688	total: 2m 31s	remaining: 1m 9s
400:	learn: 0.0093784	total: 3m 21s	remaining: 19.6s
439:	learn: 0.0085326	total: 3m 41s	remaining: 0us


[I 2025-01-06 12:47:40,564] Trial 0 finished with value: 0.7057736720554273 and parameters: {'iterations': 440, 'learning_rate': 0.2995173570725813, 'depth': 11, 'l2_leaf_reg': 8.29765347797401, 'min_child_samples': 9, 'max_bin': 399, 'verbose': 100}. Best is trial 0 with value: 0.7057736720554273.


0:	learn: 0.6251412	total: 62.1ms	remaining: 32s
100:	learn: 0.2489966	total: 6.64s	remaining: 27.3s
200:	learn: 0.1356704	total: 13.1s	remaining: 20.6s
300:	learn: 0.0816938	total: 19.7s	remaining: 14.1s
400:	learn: 0.0563121	total: 26.5s	remaining: 7.59s
500:	learn: 0.0387528	total: 33.3s	remaining: 997ms
515:	learn: 0.0367757	total: 34.3s	remaining: 0us


[I 2025-01-06 12:49:35,028] Trial 1 finished with value: 0.7006928406466513 and parameters: {'iterations': 516, 'learning_rate': 0.24269263413622716, 'depth': 8, 'l2_leaf_reg': 9.667157134801382, 'min_child_samples': 13, 'max_bin': 311, 'verbose': 100}. Best is trial 0 with value: 0.7057736720554273.


0:	learn: 0.6375491	total: 855ms	remaining: 7m 2s
100:	learn: 0.0988372	total: 1m 25s	remaining: 5m 31s
200:	learn: 0.0409335	total: 2m 47s	remaining: 4m 4s
300:	learn: 0.0231435	total: 4m 8s	remaining: 2m 40s
400:	learn: 0.0157337	total: 5m 31s	remaining: 1m 17s
494:	learn: 0.0123490	total: 6m 47s	remaining: 0us


[I 2025-01-06 13:25:59,574] Trial 2 finished with value: 0.7050038491147037 and parameters: {'iterations': 495, 'learning_rate': 0.17631833072293768, 'depth': 12, 'l2_leaf_reg': 7.800219785937266, 'min_child_samples': 8, 'max_bin': 335, 'verbose': 100}. Best is trial 0 with value: 0.7057736720554273.


0:	learn: 0.6445191	total: 112ms	remaining: 55.1s
100:	learn: 0.2630373	total: 9.8s	remaining: 38.1s
200:	learn: 0.1394091	total: 19.8s	remaining: 28.9s
300:	learn: 0.0789794	total: 29.8s	remaining: 19.1s
400:	learn: 0.0520605	total: 40.6s	remaining: 9.41s
493:	learn: 0.0374867	total: 50s	remaining: 0us


[I 2025-01-06 13:29:05,384] Trial 3 finished with value: 0.7071593533487298 and parameters: {'iterations': 494, 'learning_rate': 0.15114252780839607, 'depth': 9, 'l2_leaf_reg': 6.342611166296036, 'min_child_samples': 9, 'max_bin': 387, 'verbose': 100}. Best is trial 3 with value: 0.7071593533487298.


0:	learn: 0.6195973	total: 55.5ms	remaining: 18.4s
100:	learn: 0.2205912	total: 5.92s	remaining: 13.6s
200:	learn: 0.1083739	total: 11.9s	remaining: 7.83s
300:	learn: 0.0594196	total: 18s	remaining: 1.91s
332:	learn: 0.0501121	total: 19.9s	remaining: 0us


[I 2025-01-06 13:30:11,337] Trial 4 finished with value: 0.6956120092378753 and parameters: {'iterations': 333, 'learning_rate': 0.26317157730217705, 'depth': 8, 'l2_leaf_reg': 6.869464220820704, 'min_child_samples': 11, 'max_bin': 305, 'verbose': 100}. Best is trial 3 with value: 0.7071593533487298.


0:	learn: 0.6450497	total: 57.7ms	remaining: 22.2s
100:	learn: 0.3084938	total: 6.28s	remaining: 17.7s
200:	learn: 0.1987116	total: 12.6s	remaining: 11.6s
300:	learn: 0.1328884	total: 19.1s	remaining: 5.39s
385:	learn: 0.0990253	total: 24.5s	remaining: 0us


[I 2025-01-06 13:31:34,151] Trial 5 finished with value: 0.6976135488837567 and parameters: {'iterations': 386, 'learning_rate': 0.16629592110384378, 'depth': 8, 'l2_leaf_reg': 9.417213311906277, 'min_child_samples': 10, 'max_bin': 379, 'verbose': 100}. Best is trial 3 with value: 0.7071593533487298.


0:	learn: 0.6078840	total: 192ms	remaining: 58.3s
100:	learn: 0.0910850	total: 18.8s	remaining: 37.8s
200:	learn: 0.0360261	total: 37.4s	remaining: 19.1s
300:	learn: 0.0201726	total: 56s	remaining: 558ms
303:	learn: 0.0197543	total: 56.5s	remaining: 0us


[I 2025-01-06 13:35:41,503] Trial 6 finished with value: 0.6933025404157045 and parameters: {'iterations': 304, 'learning_rate': 0.29731438090606754, 'depth': 10, 'l2_leaf_reg': 6.661586768917676, 'min_child_samples': 13, 'max_bin': 313, 'verbose': 100}. Best is trial 3 with value: 0.7071593533487298.


0:	learn: 0.6437308	total: 183ms	remaining: 1m 33s
100:	learn: 0.1921732	total: 18.4s	remaining: 1m 15s
200:	learn: 0.0879986	total: 36.7s	remaining: 57.2s
300:	learn: 0.0481771	total: 55.5s	remaining: 39.2s
400:	learn: 0.0304668	total: 1m 14s	remaining: 21s
500:	learn: 0.0216971	total: 1m 32s	remaining: 2.4s
513:	learn: 0.0207202	total: 1m 34s	remaining: 0us


[I 2025-01-06 13:42:32,709] Trial 7 finished with value: 0.7113163972286374 and parameters: {'iterations': 514, 'learning_rate': 0.15194132703849916, 'depth': 10, 'l2_leaf_reg': 6.081800959954002, 'min_child_samples': 12, 'max_bin': 307, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6344402	total: 384ms	remaining: 3m 4s
100:	learn: 0.1172712	total: 38.1s	remaining: 2m 23s
200:	learn: 0.0463840	total: 1m 14s	remaining: 1m 44s
300:	learn: 0.0247235	total: 1m 51s	remaining: 1m 6s
400:	learn: 0.0164205	total: 2m 28s	remaining: 29.3s
479:	learn: 0.0127568	total: 2m 57s	remaining: 0us


[I 2025-01-06 13:57:17,419] Trial 8 finished with value: 0.7073133179368746 and parameters: {'iterations': 480, 'learning_rate': 0.18549316743532782, 'depth': 11, 'l2_leaf_reg': 7.107426006673945, 'min_child_samples': 8, 'max_bin': 300, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6459398	total: 419ms	remaining: 3m 6s
100:	learn: 0.1628868	total: 39.8s	remaining: 2m 16s
200:	learn: 0.0769975	total: 1m 19s	remaining: 1m 36s
300:	learn: 0.0435044	total: 1m 58s	remaining: 57.4s
400:	learn: 0.0285141	total: 2m 37s	remaining: 18.1s
446:	learn: 0.0243526	total: 2m 55s	remaining: 0us


[I 2025-01-06 14:12:50,167] Trial 9 finished with value: 0.7113163972286374 and parameters: {'iterations': 447, 'learning_rate': 0.15577343345243624, 'depth': 11, 'l2_leaf_reg': 9.728561488960356, 'min_child_samples': 12, 'max_bin': 331, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6263850	total: 246ms	remaining: 2m 25s
100:	learn: 0.1445097	total: 23.7s	remaining: 1m 55s
200:	learn: 0.0524969	total: 46.5s	remaining: 1m 31s
300:	learn: 0.0278023	total: 1m 9s	remaining: 1m 8s
400:	learn: 0.0169182	total: 1m 33s	remaining: 45.4s
500:	learn: 0.0124629	total: 1m 57s	remaining: 22s
594:	learn: 0.0098932	total: 2m 17s	remaining: 0us


[I 2025-01-06 14:22:58,922] Trial 10 finished with value: 0.7006928406466513 and parameters: {'iterations': 595, 'learning_rate': 0.21159514835411367, 'depth': 10, 'l2_leaf_reg': 6.0081858346024735, 'min_child_samples': 16, 'max_bin': 362, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6331129	total: 484ms	remaining: 4m 35s
100:	learn: 0.1166960	total: 42.2s	remaining: 3m 16s
200:	learn: 0.0475680	total: 1m 24s	remaining: 2m 34s
300:	learn: 0.0284066	total: 2m 5s	remaining: 1m 52s
400:	learn: 0.0187678	total: 2m 47s	remaining: 1m 10s
500:	learn: 0.0145606	total: 3m 29s	remaining: 28.8s
569:	learn: 0.0122629	total: 3m 57s	remaining: 0us


[I 2025-01-06 14:42:56,515] Trial 11 finished with value: 0.7065434949961509 and parameters: {'iterations': 570, 'learning_rate': 0.19798304111731785, 'depth': 11, 'l2_leaf_reg': 8.774959894410248, 'min_child_samples': 12, 'max_bin': 332, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6434850	total: 874ms	remaining: 6m 7s
100:	learn: 0.1114412	total: 1m 27s	remaining: 4m 38s
200:	learn: 0.0451776	total: 2m 54s	remaining: 3m 11s
300:	learn: 0.0266051	total: 4m 21s	remaining: 1m 44s
400:	learn: 0.0180204	total: 5m 51s	remaining: 17.6s
420:	learn: 0.0167847	total: 6m 9s	remaining: 0us


[I 2025-01-06 15:15:07,976] Trial 12 finished with value: 0.7063895304080061 and parameters: {'iterations': 421, 'learning_rate': 0.1529115724953487, 'depth': 12, 'l2_leaf_reg': 7.593648998629042, 'min_child_samples': 15, 'max_bin': 329, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6268868	total: 102ms	remaining: 55.3s
100:	learn: 0.1986649	total: 9.69s	remaining: 42.7s
200:	learn: 0.0953336	total: 19.4s	remaining: 33.3s
300:	learn: 0.0538561	total: 29.1s	remaining: 23.7s
400:	learn: 0.0352747	total: 38.9s	remaining: 14.1s
500:	learn: 0.0244434	total: 48.5s	remaining: 4.36s
545:	learn: 0.0214681	total: 52.9s	remaining: 0us


[I 2025-01-06 15:18:32,072] Trial 13 finished with value: 0.7020785219399539 and parameters: {'iterations': 546, 'learning_rate': 0.22529017582418726, 'depth': 9, 'l2_leaf_reg': 8.904508583226104, 'min_child_samples': 14, 'max_bin': 351, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6363957	total: 206ms	remaining: 1m 18s
100:	learn: 0.1761823	total: 20.6s	remaining: 57.6s
200:	learn: 0.0813387	total: 40.5s	remaining: 36.7s
300:	learn: 0.0478536	total: 1m	remaining: 16.5s
382:	learn: 0.0349683	total: 1m 16s	remaining: 0us


[I 2025-01-06 15:24:16,555] Trial 14 finished with value: 0.7042340261739801 and parameters: {'iterations': 383, 'learning_rate': 0.19160462740122866, 'depth': 10, 'l2_leaf_reg': 9.892070750656751, 'min_child_samples': 11, 'max_bin': 321, 'verbose': 100}. Best is trial 7 with value: 0.7113163972286374.


0:	learn: 0.6401056	total: 458ms	remaining: 3m 26s
100:	learn: 0.1380259	total: 44s	remaining: 2m 33s
200:	learn: 0.0559322	total: 1m 27s	remaining: 1m 49s
300:	learn: 0.0307073	total: 2m 11s	remaining: 1m 6s
400:	learn: 0.0205095	total: 2m 55s	remaining: 22.7s
452:	learn: 0.0172874	total: 3m 17s	remaining: 0us


[I 2025-01-06 15:40:48,868] Trial 15 finished with value: 0.71270207852194 and parameters: {'iterations': 453, 'learning_rate': 0.17079423012043105, 'depth': 11, 'l2_leaf_reg': 8.338621256706169, 'min_child_samples': 12, 'max_bin': 345, 'verbose': 100}. Best is trial 15 with value: 0.71270207852194.


0:	learn: 0.6397852	total: 90.5ms	remaining: 48.8s
100:	learn: 0.2392075	total: 9.92s	remaining: 43.1s
200:	learn: 0.1301606	total: 19.8s	remaining: 33.4s
300:	learn: 0.0789576	total: 29.8s	remaining: 23.6s
400:	learn: 0.0525590	total: 39.7s	remaining: 13.8s
500:	learn: 0.0374058	total: 49.7s	remaining: 3.87s
539:	learn: 0.0328481	total: 53.5s	remaining: 0us


[I 2025-01-06 15:44:08,229] Trial 16 finished with value: 0.7096227867590454 and parameters: {'iterations': 540, 'learning_rate': 0.1729325477916729, 'depth': 9, 'l2_leaf_reg': 8.287728988111189, 'min_child_samples': 14, 'max_bin': 352, 'verbose': 100}. Best is trial 15 with value: 0.71270207852194.


0:	learn: 0.6272988	total: 950ms	remaining: 6m 11s
100:	learn: 0.0722866	total: 1m 35s	remaining: 4m 34s
200:	learn: 0.0283104	total: 3m 9s	remaining: 2m 59s
300:	learn: 0.0160611	total: 4m 43s	remaining: 1m 25s
391:	learn: 0.0112594	total: 6m 9s	remaining: 0us


[I 2025-01-06 16:17:08,133] Trial 17 finished with value: 0.7083910700538876 and parameters: {'iterations': 392, 'learning_rate': 0.20962486328878244, 'depth': 12, 'l2_leaf_reg': 7.356122938041867, 'min_child_samples': 11, 'max_bin': 361, 'verbose': 100}. Best is trial 15 with value: 0.71270207852194.


0:	learn: 0.6208118	total: 443ms	remaining: 3m 28s
100:	learn: 0.0872171	total: 46.6s	remaining: 2m 50s
200:	learn: 0.0358523	total: 1m 34s	remaining: 2m 7s
300:	learn: 0.0196501	total: 2m 21s	remaining: 1m 19s
400:	learn: 0.0134136	total: 3m 9s	remaining: 33s
470:	learn: 0.0107969	total: 3m 41s	remaining: 0us


[I 2025-01-06 16:35:02,023] Trial 18 finished with value: 0.7091608929946113 and parameters: {'iterations': 471, 'learning_rate': 0.24863305180878753, 'depth': 11, 'l2_leaf_reg': 8.901025719443721, 'min_child_samples': 13, 'max_bin': 342, 'verbose': 100}. Best is trial 15 with value: 0.71270207852194.


0:	learn: 0.6433236	total: 228ms	remaining: 1m 59s
100:	learn: 0.1817468	total: 23.5s	remaining: 1m 38s
200:	learn: 0.0854421	total: 46.7s	remaining: 1m 15s
300:	learn: 0.0496636	total: 1m 10s	remaining: 52.3s
400:	learn: 0.0316539	total: 1m 33s	remaining: 28.8s
500:	learn: 0.0236715	total: 1m 56s	remaining: 5.56s
524:	learn: 0.0223075	total: 2m 1s	remaining: 0us


[I 2025-01-06 16:44:23,919] Trial 19 finished with value: 0.7065434949961509 and parameters: {'iterations': 525, 'learning_rate': 0.16848194324615595, 'depth': 10, 'l2_leaf_reg': 8.145432123276661, 'min_child_samples': 10, 'max_bin': 372, 'verbose': 100}. Best is trial 15 with value: 0.71270207852194.


({'iterations': 453,
  'learning_rate': 0.17079423012043105,
  'depth': 11,
  'l2_leaf_reg': 8.338621256706169,
  'min_child_samples': 12,
  'max_bin': 345,
  'verbose': 100},
 0.71270207852194)

In [89]:
params = {
    "iterations": 413,
    "learning_rate": 0.24110432469185597,
    "depth": 10,
    "l2_leaf_reg": 8.905869555950142,
    "min_child_samples": 12,
    "max_bin": 342,
    "verbose": 100,
}

cat_final = CatBoostClassifier(**params, cat_features=cat_features)
cat_final.fit(cat_train_x, cat_train_y)
print(classification_report(cat_valid_y, cat_final.predict(cat_valid_x)))

0:	learn: 0.6223546	total: 208ms	remaining: 1m 25s
100:	learn: 0.1408273	total: 20.8s	remaining: 1m 4s
200:	learn: 0.0563243	total: 41.6s	remaining: 43.9s
300:	learn: 0.0305627	total: 1m 2s	remaining: 23.1s
400:	learn: 0.0194663	total: 1m 22s	remaining: 2.48s
412:	learn: 0.0186581	total: 1m 25s	remaining: 0us
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      1060
           1       0.78      0.77      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



# 앙상블

In [91]:
from sklearn.ensemble import StackingClassifier

estimators = [
    # ("lr", lr_final),
    ("lgbm", lgbm_final),
    ("rf", rf_final),
    ("hgbc", hgbc_final),
    # ("ada", ada_final),
    # ("svc", svc_final),
    ("xgb", xgb_final),
]

final_estimator = LogisticRegression(random_state=SEED)
stacking_clf = StackingClassifier(estimators, final_estimator)
stacking_clf.fit(train_x, train_y)
print(classification_report(valid_y, stacking_clf.predict(valid_x)))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79      1060
           1       0.78      0.78      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114



In [92]:
stacking_proba = stacking_clf.predict_proba(valid_x)
cat_proba = cat_final.predict_proba(cat_valid_x)

final_proba = 0.5 * stacking_proba + 0.5 * cat_proba

final_pred = (final_proba[:, 1] >= 0.5).astype(int)
print(classification_report(valid_y, final_pred))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      1060
           1       0.79      0.79      0.79      1054

    accuracy                           0.79      2114
   macro avg       0.79      0.79      0.79      2114
weighted avg       0.79      0.79      0.79      2114

