In [15]:
import pandas as pd
import numpy as np

DATA_PATH = "LoLesports_data/"
SEED = 42

teams_train = pd.read_csv(f"{DATA_PATH}teams_train.csv")
teams_test = pd.read_csv(f"{DATA_PATH}teams_test.csv")
players_train = pd.read_csv(f"{DATA_PATH}players_train.csv")
players_test = pd.read_csv(f"{DATA_PATH}players_test.csv")

teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")
players_train_target = pd.read_csv(f"{DATA_PATH}players_train_target.csv")
players_test_target = pd.read_csv(f"{DATA_PATH}players_test_target.csv")

# 컬럼 추가

## 상대 팀 추가

In [16]:
temp_opp_teams = teams_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_train = pd.concat([teams_train, temp_opp_teams], axis=1)
temp_opp_teams = teams_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
teams_test = pd.concat([teams_test, temp_opp_teams], axis=1)

temp_opp_players = players_train.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_train = pd.concat([players_train, temp_opp_players], axis=1)
temp_opp_players = players_test.groupby("gameid")["teamname"].transform(lambda x: x.iloc[::-1].values).to_frame("opp_teamname")
players_test = pd.concat([players_test, temp_opp_players], axis=1)

## 날짜 추가

In [17]:
teams_train["date"] = pd.to_datetime(teams_train["date"])
teams_test["date"] = pd.to_datetime(teams_test["date"])

players_train["date"] = pd.to_datetime(players_train["date"])
players_test["date"] = pd.to_datetime(players_test["date"])

teams_train["year"] = teams_train["date"].dt.year
teams_train["month"] = teams_train["date"].dt.month
teams_train["day"] = teams_train["date"].dt.day

players_train["year"] = players_train["date"].dt.year
players_train["month"] = players_train["date"].dt.month
players_train["day"] = players_train["date"].dt.day

teams_test["year"] = teams_test["date"].dt.year
teams_test["month"] = teams_test["date"].dt.month
teams_test["day"] = teams_test["date"].dt.day

players_test["year"] = players_test["date"].dt.year
players_test["month"] = players_test["date"].dt.month
players_test["day"] = players_test["date"].dt.day

## 데이터 타입 변경

In [18]:
cols = ["league", "split", "teamname", "opp_teamname", "ban1", "ban2", "ban3", "ban4", "ban5", "pick1", "pick2", "pick3", "pick4", "pick5"]

teams_train[cols] = teams_train[cols].astype("category")
teams_test[cols] = teams_test[cols].astype("category")

In [19]:
teams_train.head()

Unnamed: 0,gameid,league,split,playoffs,date,game,patch,side,teamname,ban1,...,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15,opp_teamname,year,month,day
0,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Blue,DRX,Diana,...,4.0,7.0,1.0,1.0,1.0,4.0,BNK FEARX,2022,1,12
1,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 06:20:00,1,12.01,Red,BNK FEARX,Renekton,...,1.0,1.0,4.0,4.0,7.0,1.0,DRX,2022,1,12
2,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Blue,DRX,Diana,...,2.0,5.0,4.0,4.0,5.0,2.0,BNK FEARX,2022,1,12
3,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 09:02:00,2,12.01,Red,BNK FEARX,Renekton,...,4.0,5.0,2.0,2.0,5.0,4.0,DRX,2022,1,12
4,ESPORTSTMNT01_2690705,LCK,Spring,0,2022-01-12 10:07:00,1,12.01,Blue,T1,Lee Sin,...,3.0,2.0,1.0,1.0,1.0,3.0,Kwangdong Freecs,2022,1,12


# 특성 추가

## df에 포함되어 있는 특성을 이용한 토대 작성

In [20]:
pre_game_features = [
    "gameid",
    "patch",
    "side",
    "league",
    "teamname",
    "opp_teamname",
    "ban1",
    "ban2",
    "ban3",
    "ban4",
    "ban5",
    "pick1",
    "pick2",
    "pick3",
    "pick4",
    "pick5",
    "year",
    "month",
    "day",
]

train_ft = teams_train[pre_game_features]
test_ft = teams_test[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 19), (2324, 19))

### 팀별 최근 10경기 지표 계산, 상대팀 최근 10경기 지표 계산

In [21]:
stats_columns = [
    "result",
    "gamelength",
    "kills",
    "deaths",
    "assists",
    "firstblood",
    "team kpm",
    "ckpm",
    "firstdragon",
    "firstherald",
    "void_grubs",
    "firstbaron",
    "firsttower",
    "towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "inhibitors",
    "damagetochampions",
    "dpm",
    "damagetakenperminute",
    "damagemitigatedperminute",
    "wardsplaced",
    "wpm",
    "wardskilled",
    "wcpm",
    "controlwardsbought",
    "visionscore",
    "vspm",
]

In [22]:
# 팀별 최근 승률 계산을 위한 데이터 정렬
temp_train = teams_train.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)
temp_test = teams_test.sort_values(['teamname', 'year', 'month', 'day']).reset_index(drop=True)

# 팀별 최근 10경기 평균 계산
for col in stats_columns:
    # 승률 계산
    recent10_train = temp_train.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    train_ft = train_ft.assign(**{f'recent10_{col}': recent10_train})
    
    # 테스트 데이터의 지표 계산을 위해 훈련 데이터와 테스트 데이터 결합
    combined_data = pd.concat([temp_train, temp_test], ignore_index=True).sort_values(['teamname', 'year', 'month', 'day'])
    recent10_combined = combined_data.groupby('teamname', observed=True)[col].transform(
        lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
    )
    combined_data = combined_data.assign(**{f'recent10_{col}': recent10_combined})

    # 테스트 데이터의 지표 업데이트
    recent10_test = combined_data.tail(len(temp_test))[f'recent10_{col}'].values
    test_ft = test_ft.assign(**{f'recent10_{col}': recent10_test})
    
    # 상대팀 최근 지표 계산
    merged_train = train_ft.merge(
        train_ft[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    train_ft = train_ft.assign(**{f'opp_recent10_{col}': merged_train[f'recent10_{col}_opp']})
    
    merged_test = test_ft.merge(
        combined_data[['teamname', 'year', 'month', 'day', f'recent10_{col}']], 
        left_on=['opp_teamname', 'year', 'month', 'day'],
        right_on=['teamname', 'year', 'month', 'day'],
        suffixes=('', '_opp')
    )
    test_ft = test_ft.assign(**{f'opp_recent10_{col}': merged_test[f'recent10_{col}_opp']})
    
    # NaN값 처리 (첫 경기인 경우)
    default_value = 0.5 if col == 'result' else 0
    train_ft = train_ft.assign(**{
        f'recent10_{col}': train_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': train_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    test_ft = test_ft.assign(**{
        f'recent10_{col}': test_ft[f'recent10_{col}'].fillna(default_value),
        f'opp_recent10_{col}': test_ft[f'opp_recent10_{col}'].fillna(default_value)
    })
    
    # 특성 리스트에 새로운 지표 추가
    pre_game_features.extend([f'recent10_{col}', f'opp_recent10_{col}'])

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 77), (2324, 77))

In [23]:
train_ft.head()

Unnamed: 0,gameid,patch,side,league,teamname,opp_teamname,ban1,ban2,ban3,ban4,...,recent10_wardskilled,opp_recent10_wardskilled,recent10_wcpm,opp_recent10_wcpm,recent10_controlwardsbought,opp_recent10_controlwardsbought,recent10_visionscore,opp_recent10_visionscore,recent10_vspm,opp_recent10_vspm
0,ESPORTSTMNT01_2700815,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Twisted Fate,LeBlanc,...,0.0,35.0,0.0,1.3166,0.0,30.0,0.0,207.0,0.0,7.7868
1,ESPORTSTMNT01_2700815,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Leona,Jayce,...,35.0,45.666667,1.3166,1.464567,30.0,40.0,207.0,250.333333,7.7868,8.068033
2,ESPORTSTMNT01_2690695,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Yuumi,Samira,...,42.0,0.0,1.40225,0.0,38.5,0.0,242.5,0.0,8.11405,0.0
3,ESPORTSTMNT01_2690695,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Twisted Fate,Viktor,...,45.666667,42.0,1.464567,1.40225,40.0,38.5,250.333333,242.5,8.068033,8.11405
4,ESPORTSTMNT01_2690705,12.01,Blue,LCK,T1,Kwangdong Freecs,Lee Sin,Ryze,Viktor,LeBlanc,...,49.5,35.0,1.61275,1.3166,40.75,30.0,247.0,207.0,8.04935,7.7868


### 상대 전적

In [24]:
# 팀별 맞대결 기록을 시간순으로 계산
h2h_records = {}

# 훈련 데이터와 테스트 데이터 결합 후 시간순 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values(['year', 'month', 'day'])

# 각 경기마다 이전 맞대결 기록 계산
h2h_winrates = []

for idx, match in combined_data.iterrows():
    team1, team2 = match['teamname'], match['opp_teamname']
    year = match['year']
    key = (team1, team2, year)
    
    # 현재 시점까지의 맞대결 기록 저장
    if key not in h2h_records:
        h2h_records[key] = {'wins': 0, 'total': 0}
        h2h_winrates.append(0.5)  # 첫 맞대결인 경우 0.5 반환
    else:
        record = h2h_records[key]
        h2h_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    h2h_records[key]['total'] += 1
    if result == 1:
        h2h_records[key]['wins'] += 1
        
    # 상대팀 관점의 기록도 업데이트
    key_reverse = (team2, team1, year)
    if key_reverse not in h2h_records:
        h2h_records[key_reverse] = {'wins': 0, 'total': 0}
    h2h_records[key_reverse]['total'] += 1
    if result == 0:
        h2h_records[key_reverse]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['h2h_winrate'] = h2h_winrates[:len(teams_train)]
test_ft['h2h_winrate'] = h2h_winrates[len(teams_train):]

# 특성 리스트에 h2h_winrate 추가
pre_game_features.append('h2h_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 78), (2324, 78))

### 각 팀의 리그별 승률

In [25]:
# 팀별 리그 승률 기록을 저장할 딕셔너리
league_records = {}
league_winrates = []

# 날짜순으로 정렬
combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
combined_data = combined_data.sort_values('date')

# 훈련 데이터에서 팀별 리그 승률 계산
for idx, match in combined_data.iterrows():
    team = match['teamname']
    league = match['league']
    year = match['year']
    key = (team, league, year)
    
    # 현재 시점까지의 리그 승률 계산
    if key not in league_records:
        league_records[key] = {'wins': 0, 'total': 0}
        league_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
    else:
        record = league_records[key]
        league_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
    # 현재 경기 결과 반영
    result = match['result']
    league_records[key]['total'] += 1
    if result == 1:
        league_records[key]['wins'] += 1

# 계산된 승률을 훈련/테스트 데이터에 할당
train_ft['league_winrate'] = league_winrates[:len(teams_train)]
test_ft['league_winrate'] = league_winrates[len(teams_train):]

# 특성 리스트에 league_winrate 추가
pre_game_features.append('league_winrate')

# 입력 데이터 업데이트
train_ft = train_ft[pre_game_features]
test_ft = test_ft[pre_game_features]

train_ft.shape, test_ft.shape

((9913, 79), (2324, 79))

### 각 패치 버전 사이드별 승률

In [26]:
# # 패치 버전 사이드별 승률 기록을 저장할 딕셔너리
# patch_side_records = {}
# patch_side_winrates = []

# # 날짜순으로 정렬
# combined_data = pd.concat([teams_train, teams_test], ignore_index=True)
# combined_data = combined_data.sort_values('date')

# # 패치/사이드별 승률 계산
# for idx, match in combined_data.iterrows():
#     patch = match['patch']
#     side = match['side']
#     key = (patch, side)
    
#     # 현재 시점까지의 패치/사이드별 승률 계산
#     if key not in patch_side_records:
#         patch_side_records[key] = {'wins': 0, 'total': 0}
#         patch_side_winrates.append(0.5)  # 첫 경기인 경우 0.5 반환
#     else:
#         record = patch_side_records[key]
#         patch_side_winrates.append(record['wins'] / record['total'] if record['total'] > 0 else 0.5)
    
#     # 현재 경기 결과 반영
#     result = match['result']
#     patch_side_records[key]['total'] += 1
#     if result == 1:
#         patch_side_records[key]['wins'] += 1

# # 계산된 승률을 훈련/테스트 데이터에 할당
# train_ft['patch_side_winrate'] = patch_side_winrates[:len(teams_train)]
# test_ft['patch_side_winrate'] = patch_side_winrates[len(teams_train):]

# # 특성 리스트에 patch_side_winrate 추가
# pre_game_features.append('patch_side_winrate')

# # 입력 데이터 업데이트
# train_ft = train_ft[pre_game_features]
# test_ft = test_ft[pre_game_features]

# train_ft.shape, test_ft.shape

### 픽 챔피언 지표

In [29]:
# df = teams_train.copy()
# df = df.sort_values(["teamname", "date", "gameid"])  # 시계열 정렬

# for slot in ["pick1", "pick2", "pick3", "pick4", "pick5"]:
#     # 1) 챔피언 컬럼 만들기
#     df_pick = df[["gameid", "teamname", "date", "result", slot]].copy()
#     df_pick.rename(columns={slot: "champion"}, inplace=True)

#     # 2) 챔피언을 category로 바꾸면 메모리 절약에 도움
#     df_pick["champion"] = df_pick["champion"].astype("category")

#     # 3) groupby + cumsum + shift(1)로 "직전까지" 누적
#     df_pick["pick_ind"] = 1
#     df_pick["win_ind"] = (df_pick["result"] == 1).astype(int)

#     df_pick["cum_pick_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["pick_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_count"] = (
#         df_pick.groupby(["teamname", "champion"], observed=True)["win_ind"].cumsum().shift(1)
#     )
#     df_pick["cum_win_rate"] = (
#         df_pick["cum_win_count"] / df_pick["cum_pick_count"]
#     ).fillna(0)

#     # 4) 필요한 컬럼만 남겨서, 컬럼 이름으로 바꾸기
#     df_pick = df_pick[
#         ["gameid", "teamname", "date", "champion", "cum_pick_count", "cum_win_rate"]
#     ].copy()

#     df_pick.rename(
#         columns={
#             "champion": f"{slot}_champion",  # 구분용
#             "cum_pick_count": f"{slot}_cum_pick_count",
#             "cum_win_rate": f"{slot}_cum_win_rate",
#         },
#         inplace=True,
#     )
    
#     # 5) 원본 df와 merge
#     df = pd.merge(
#         df,
#         df_pick[
#             [
#                 "gameid",
#                 "teamname",
#                 "date",
#                 f"{slot}_champion",
#                 f"{slot}_cum_pick_count",
#                 f"{slot}_cum_win_rate",
#             ]
#         ],
#         left_on=["gameid", "teamname", "date", f"{slot}"],  # 조인 키
#         right_on=["gameid", "teamname", "date", f"{slot}_champion"],
#         how="left",
#     )

# 인코딩

In [30]:
train_ft["side"] = train_ft["side"].map({"Blue": 0, "Red": 1}) # 진영 인코딩
test_ft["side"] = test_ft["side"].map({"Blue": 0, "Red": 1})

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cat_train_ft = train_ft.copy()
cat_test_ft = test_ft.copy()

def preprocess(train_ft, test_ft):
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5'] # 챔피언 레이블인코딩

    champions = pd.concat([
        train_ft[champion_columns_teams],
        test_ft[champion_columns_teams],
    ]).stack().unique()

    champions_df = pd.DataFrame({'champion': champions})
    champions_df = champions_df.dropna().reset_index(drop=True)

    le = LabelEncoder()
    champions_df['champion_encoded'] = le.fit_transform(champions_df['champion'])

    for col in champion_columns_teams:
        train_ft[col] = le.transform(train_ft[col])
        test_ft[col] = le.transform(test_ft[col])
        
    encoder = OneHotEncoder() # 리그 원핫인코딩
    league_encoded = encoder.fit_transform(train_ft[["league"]]).toarray()
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    train_ft = pd.concat(
        [train_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    train_ft.drop("league", axis=1, inplace=True)

    league_encoded = encoder.transform(test_ft[["league"]]).toarray()
    test_ft = pd.concat(
        [test_ft, pd.DataFrame(league_encoded, columns=league_cols)], axis=1
    )
    test_ft.drop("league", axis=1, inplace=True)

    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            train_ft["teamname"],
            test_ft["teamname"],
            train_ft["opp_teamname"],
            test_ft["opp_teamname"],
        ]
    )
    le_team.fit(all_team_names)

    train_ft["teamname"] = le_team.transform(train_ft["teamname"])
    train_ft["opp_teamname"] = le_team.transform(train_ft["opp_teamname"])

    test_ft["teamname"] = le_team.transform(test_ft["teamname"])
    test_ft["opp_teamname"] = le_team.transform(test_ft["opp_teamname"])
    
    return train_ft, test_ft

train_ft, test_ft = preprocess(train_ft, test_ft)

In [32]:
train_ft.select_dtypes("object").columns, test_ft.select_dtypes("object").columns

(Index(['gameid'], dtype='object'), Index(['gameid'], dtype='object'))

# 스케일링

In [33]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


def scale(train_ft, test_ft):
    train_ft[train_ft.select_dtypes("number").columns] = scaler.fit_transform(
        train_ft[train_ft.select_dtypes("number").columns]
    )
    test_ft[test_ft.select_dtypes("number").columns] = scaler.transform(
        test_ft[test_ft.select_dtypes("number").columns]
    )
    return train_ft, test_ft


train_ft, test_ft = scale(train_ft, test_ft)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)

# 모델 학습 및 검증

In [34]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

cutoff_patch = cat_train_ft["patch"].quantile(0.8)
train_games = cat_train_ft[cat_train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = cat_train_ft[cat_train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = cat_train_ft[cat_train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = cat_train_ft[cat_train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

In [35]:
cat_features = list(train_x.select_dtypes("category").columns)
catboost = CatBoostClassifier(
    cat_features=cat_features,
)
catboost.fit(train_x, train_y)

pred_train = catboost.predict(train_x)
pred_valid = catboost.predict(valid_x)

print("\n훈련 정확도:", accuracy_score(train_y, pred_train))
print("검증 정확도:", accuracy_score(valid_y, pred_valid))

print(classification_report(valid_y, pred_valid))

Learning rate set to 0.024765
0:	learn: 0.6849554	total: 180ms	remaining: 3m
1:	learn: 0.6788794	total: 227ms	remaining: 1m 53s
2:	learn: 0.6716180	total: 297ms	remaining: 1m 38s
3:	learn: 0.6629636	total: 347ms	remaining: 1m 26s
4:	learn: 0.6555672	total: 386ms	remaining: 1m 16s
5:	learn: 0.6499161	total: 426ms	remaining: 1m 10s
6:	learn: 0.6443131	total: 465ms	remaining: 1m 6s
7:	learn: 0.6358465	total: 496ms	remaining: 1m 1s
8:	learn: 0.6287746	total: 532ms	remaining: 58.6s
9:	learn: 0.6257696	total: 587ms	remaining: 58.2s
10:	learn: 0.6205789	total: 622ms	remaining: 55.9s
11:	learn: 0.6181660	total: 662ms	remaining: 54.5s
12:	learn: 0.6154932	total: 704ms	remaining: 53.4s
13:	learn: 0.6130947	total: 743ms	remaining: 52.4s
14:	learn: 0.6102506	total: 783ms	remaining: 51.4s
15:	learn: 0.6073753	total: 820ms	remaining: 50.4s
16:	learn: 0.6052921	total: 857ms	remaining: 49.5s
17:	learn: 0.6018585	total: 894ms	remaining: 48.7s
18:	learn: 0.5980499	total: 932ms	remaining: 48.1s
19:	learn

In [36]:
pre_game_features.remove("league")

cutoff_patch = train_ft["patch"].quantile(0.8)
train_games = train_ft[train_ft["patch"] < cutoff_patch]["gameid"].unique()
valid_games = train_ft[train_ft["patch"] >= cutoff_patch]["gameid"].unique()

train_x = train_ft[train_ft["gameid"].isin(train_games)][pre_game_features]
valid_x = train_ft[train_ft["gameid"].isin(valid_games)][pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

train_x.drop(columns=["gameid"], inplace=True)
valid_x.drop(columns=["gameid"], inplace=True)

In [37]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


models = [
    LGBMClassifier(random_state=SEED, n_jobs=-1),
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    HistGradientBoostingClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    XGBClassifier(random_state=SEED, n_jobs=-1),
    CatBoostClassifier(random_state=SEED, verbose=0),
    LogisticRegression(random_state=SEED),
]

for model in models:
    model.fit(train_x, train_y)
    pred_train = model.predict(train_x)
    pred_valid = model.predict(valid_x)

    print(f"\n{model.__class__.__name__} 훈련 정확도:", accuracy_score(train_y, pred_train))
    print(f"{model.__class__.__name__} 검증 정확도:", accuracy_score(valid_y, pred_valid))

    print(classification_report(valid_y, pred_valid))

[LightGBM] [Info] Number of positive: 3903, number of negative: 3896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11886
[LightGBM] [Info] Number of data points in the train set: 7799, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500449 -> initscore=0.001795
[LightGBM] [Info] Start training from score 0.001795

LGBMClassifier 훈련 정확도: 0.9626875240415438
LGBMClassifier 검증 정확도: 0.7753074739829706
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1060
           1       0.77      0.79      0.78      1054

    accuracy                           0.78      2114
   macro avg       0.78      0.78      0.78      2114
weighted avg       0.78      0.78      0.78      2114


RandomForestClassifier 훈련 정확도: 1.0
RandomForestClassifier 검증 정확도: 0.6929990539262062
            




AdaBoostClassifier 훈련 정확도: 0.7121425823823567
AdaBoostClassifier 검증 정확도: 0.727530747398297
              precision    recall  f1-score   support

           0       0.72      0.75      0.73      1060
           1       0.74      0.70      0.72      1054

    accuracy                           0.73      2114
   macro avg       0.73      0.73      0.73      2114
weighted avg       0.73      0.73      0.73      2114


SVC 훈련 정확도: 0.7461212976022567
SVC 검증 정확도: 0.6778618732261117
              precision    recall  f1-score   support

           0       0.69      0.64      0.67      1060
           1       0.66      0.72      0.69      1054

    accuracy                           0.68      2114
   macro avg       0.68      0.68      0.68      2114
weighted avg       0.68      0.68      0.68      2114


XGBClassifier 훈련 정확도: 0.9997435568662649
XGBClassifier 검증 정확도: 0.761116367076632
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1060
