In [1]:
# Mount Google Drive at /content/drive (Colab only); DATA_PATH points inside
# this mount, so this cell has to run before any data is read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Directory inside the mounted Drive that holds the input CSV files.
# The trailing slash matters: file names are appended to it directly.
DATA_PATH = "/content/drive/MyDrive/Datathon_leeya/data/"
# Seed passed as random_state to the models for repeatable runs.
SEED = 42



In [3]:
# Load the train/test team tables and their matching target tables.
# The target tables are later filtered by "gameid" and read for "result".
teams_train = pd.read_csv(f"{DATA_PATH}train_FTdata.csv")
teams_test = pd.read_csv(f"{DATA_PATH}test_FTdata.csv")
teams_train_target = pd.read_csv(f"{DATA_PATH}teams_train_target.csv")
teams_test_target = pd.read_csv(f"{DATA_PATH}teams_test_target.csv")

In [4]:
# Sanity check: (rows, columns) of the four loaded tables.
teams_train.shape, teams_train_target.shape, teams_test.shape, teams_test_target.shape

((9913, 87), (9913, 3), (2324, 87), (2324, 3))

In [5]:
# Preview the first rows of the training table.
teams_train.head(50)

Unnamed: 0,gameid,patch,side,league,teamname,opp_teamname,ban1,ban2,ban3,ban4,...,recent10_wcpm,opp_recent10_wcpm,recent10_controlwardsbought,opp_recent10_controlwardsbought,recent10_visionscore,opp_recent10_visionscore,recent10_vspm,opp_recent10_vspm,h2h_winrate,league_winrate
0,ESPORTSTMNT01_2700815,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Twisted Fate,LeBlanc,...,0.0,1.3166,0.0,30.0,0.0,207.0,0.0,7.7868,0.5,0.5
1,ESPORTSTMNT01_2700815,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Leona,Jayce,...,1.3166,1.464567,30.0,40.0,207.0,250.333333,7.7868,8.068033,1.0,0.5
2,ESPORTSTMNT01_2690695,12.01,Blue,LCK,DRX,BNK FEARX,Diana,Caitlyn,Yuumi,Samira,...,1.40225,0.0,38.5,0.0,242.5,0.0,8.11405,0.0,0.0,0.0
3,ESPORTSTMNT01_2690695,12.01,Red,LCK,BNK FEARX,DRX,Renekton,Lee Sin,Twisted Fate,Viktor,...,1.464567,1.40225,40.0,38.5,250.333333,242.5,8.068033,8.11405,1.0,1.0
4,ESPORTSTMNT01_2690705,12.01,Blue,LCK,T1,Kwangdong Freecs,Lee Sin,Ryze,Viktor,LeBlanc,...,1.61275,1.3166,40.75,30.0,247.0,207.0,8.04935,7.7868,0.5,0.5
5,ESPORTSTMNT01_2690705,12.01,Red,LCK,Kwangdong Freecs,T1,Renekton,Twisted Fate,Vex,Jayce,...,1.61042,1.464567,42.6,40.0,255.8,250.333333,8.01886,8.068033,0.0,0.5
6,ESPORTSTMNT01_2690725,12.01,Blue,LCK,Kwangdong Freecs,T1,Irelia,Twisted Fate,Ezreal,Zoe,...,1.5014,0.0,42.166667,0.0,238.333333,0.0,7.573767,0.0,0.0,0.0
7,ESPORTSTMNT01_2690725,12.01,Red,LCK,T1,Kwangdong Freecs,Renekton,Ryze,Lux,LeBlanc,...,1.500129,1.40225,43.857143,38.5,252.142857,242.5,7.682271,8.11405,1.0,1.0
8,ESPORTSTMNT01_2701248,12.01,Blue,LCK,Nongshim RedForce,Hanwha Life Esports,Yuumi,Karma,Caitlyn,LeBlanc,...,1.471425,1.61042,43.5,42.6,250.25,255.8,7.662962,8.01886,0.5,0.5
9,ESPORTSTMNT01_2701248,12.01,Red,LCK,Hanwha Life Esports,Nongshim RedForce,Twisted Fate,Renekton,Aphelios,Lulu,...,1.552344,1.5014,44.333333,42.166667,256.777778,238.333333,7.805256,7.573767,0.0,0.5


In [6]:
# Encode the team side as an integer: Blue -> 0, Red -> 1.
# Series.map returns NaN for values missing from the mapping, so mapping an
# already-encoded 0/1 column would wipe it out. The numeric check makes the
# cell safe to re-run.
SIDE_CODES = {"Blue": 0, "Red": 1}
for _frame in (teams_train, teams_test):
    if not pd.api.types.is_numeric_dtype(_frame["side"]):
        _frame["side"] = _frame["side"].map(SIDE_CODES)

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Copies taken before preprocess() runs: they keep the raw string columns
# (champions, league, team names), while "side" is already encoded as 0/1.
# NOTE(review): presumably meant for CatBoost's cat_features - confirm.
cat_train_ft = teams_train.copy()
cat_test_ft = teams_test.copy()

def preprocess(teams_train, teams_test):
    """Encode the categorical columns of the train and test team tables.

    Steps:
      * ``ban1``..``ban5`` and ``pick1``..``pick5`` are label-encoded with a
        single encoder fitted on every champion seen in either table.
      * ``league`` is one-hot encoded (encoder fitted on the training table
        only) and replaced by ``league_<name>`` columns appended at the end.
      * ``teamname`` and ``opp_teamname`` are label-encoded with a single
        encoder fitted on every team name seen in either table.

    The input frames are not modified; encoded copies are returned.

    Args:
        teams_train: Training DataFrame containing the columns listed above.
        teams_test: Test DataFrame containing the same columns.

    Returns:
        Tuple ``(teams_train, teams_test)`` of encoded DataFrames that keep
        the row index of their inputs.

    Raises:
        ValueError: Propagated from the fitted encoders when a value cannot
            be encoded, e.g. a league present only in the test table or a
            missing value in a champion column.
    """
    champion_columns_teams = ['ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'pick1', 'pick2', 'pick3', 'pick4', 'pick5']

    # Work on copies so the caller's frames are left untouched.
    teams_train = teams_train.copy()
    teams_test = teams_test.copy()

    # Champion label encoding: one vocabulary shared by all ban/pick columns.
    champions = pd.concat([
        teams_train[champion_columns_teams],
        teams_test[champion_columns_teams],
    ]).stack().unique()

    le = LabelEncoder()
    le.fit(pd.Series(champions).dropna())

    for col in champion_columns_teams:
        teams_train[col] = le.transform(teams_train[col])
        teams_test[col] = le.transform(teams_test[col])

    # League one-hot encoding, fitted on the training table only.
    encoder = OneHotEncoder()
    train_league = encoder.fit_transform(teams_train[["league"]]).toarray()
    league_cols = [f"league_{col}" for col in encoder.categories_[0]]
    test_league = encoder.transform(teams_test[["league"]]).toarray()

    # The one-hot frames reuse the source index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and introduce
    # NaN rows) whenever an input is not indexed 0..n-1.
    teams_train = pd.concat(
        [
            teams_train.drop(columns="league"),
            pd.DataFrame(train_league, columns=league_cols, index=teams_train.index),
        ],
        axis=1,
    )
    teams_test = pd.concat(
        [
            teams_test.drop(columns="league"),
            pd.DataFrame(test_league, columns=league_cols, index=teams_test.index),
        ],
        axis=1,
    )

    # Team label encoding: one vocabulary shared by both team columns.
    le_team = LabelEncoder()
    all_team_names = pd.concat(
        [
            teams_train["teamname"],
            teams_test["teamname"],
            teams_train["opp_teamname"],
            teams_test["opp_teamname"]
        ]
    )
    le_team.fit(all_team_names)

    for frame in (teams_train, teams_test):
        frame["teamname"] = le_team.transform(frame["teamname"])
        frame["opp_teamname"] = le_team.transform(frame["opp_teamname"])

    return teams_train, teams_test

# Rebind the working frames to their encoded versions.
teams_train, teams_test = preprocess(teams_train, teams_test)


In [8]:
# List the columns that are still object-typed after encoding.
teams_train.select_dtypes("object").columns, teams_test.select_dtypes("object").columns

(Index(['gameid', 'time'], dtype='object'),
 Index(['gameid', 'time'], dtype='object'))

In [9]:
def _to_epoch_seconds(values):
    """Return ``values`` as float seconds derived from parsed timestamps.

    Args:
        values: Series of timestamp strings, or an already converted
            numeric Series.

    Returns:
        The parsed timestamps cast to 64-bit integers and divided by 10**9,
        or ``values`` unchanged when it is already numeric.
    """
    # Already-numeric input is returned untouched so that re-running the cell
    # does not re-parse the converted numbers as timestamps.
    if pd.api.types.is_numeric_dtype(values):
        return values
    # Explicit "int64": the builtin int maps to a platform-dependent width.
    return pd.to_datetime(values).astype("int64") / 10**9


teams_train['time'] = _to_epoch_seconds(teams_train['time'])
teams_test['time'] = _to_epoch_seconds(teams_test['time'])

  teams_train['time'] = pd.to_datetime(teams_train['time']).astype(int) / 10**9
  teams_test['time'] = pd.to_datetime(teams_test['time']).astype(int) / 10**9


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Module-level scaler shared by scale(); every scale() call refits it.
scaler = MinMaxScaler()

def scale(train_ft, test_ft):
    """Min-max scale the numeric columns of both frames in place.

    The numeric column set is taken from ``train_ft``. The module-level
    ``scaler`` is fitted on the training frame and then applied to the test
    frame with the same fitted statistics.

    Args:
        train_ft: Training DataFrame; its numeric columns are overwritten.
        test_ft: Test DataFrame containing the same numeric columns; they
            are overwritten as well.

    Returns:
        The same two frames, ``(train_ft, test_ft)``.
    """
    numeric_cols = train_ft.select_dtypes(include="number").columns
    # Order matters: the training frame must be processed first because that
    # step is the one that fits the scaler.
    steps = ((train_ft, scaler.fit_transform), (test_ft, scaler.transform))
    for frame, apply_scaler in steps:
        frame[numeric_cols] = apply_scaler(frame[numeric_cols])
    return train_ft, test_ft

# Scale the encoded frames first, then the raw-categorical copies. The second
# call refits the shared scaler, so afterwards it holds the statistics of
# cat_train_ft.
teams_train, teams_test = scale(teams_train, teams_test)
cat_train_ft, cat_test_ft = scale(cat_train_ft, cat_test_ft)


In [11]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [12]:
import optuna
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

class HyperparameterTuner:
    """Optuna hyperparameter search scored by time-series cross-validation.

    Args:
        model: Estimator class (not an instance); it is instantiated once per
            trial with the sampled parameters.
        params: Mapping of parameter name to a search-space spec. Each spec
            is a dict with a ``"type"`` key: ``"int"`` and ``"float"`` also
            need ``"min"`` and ``"max"``; ``"categorical"`` needs
            ``"values"``.
        train: Feature matrix handed to ``cross_val_score``.
        target: Labels handed to ``cross_val_score`` together with ``train``.
        cat_features: Forwarded as ``cat_features`` when ``model`` is
            ``CatBoostClassifier``; ignored for every other model.
        seed: Optional seed for the Optuna sampler. ``None`` (the default)
            keeps Optuna's default sampler.
    """

    def __init__(self, model, params, train, target, cat_features=None, seed=None):
        self.model = model
        self.params = params
        self.train = train
        self.target = target
        self.cat_features = cat_features
        self.cv = TimeSeriesSplit(n_splits=5)
        # sampler=None lets Optuna pick its default sampler.
        sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
        self.study = optuna.create_study(direction="maximize", sampler=sampler)

    def objective(self, trial):
        """Sample one parameter set and return its mean CV accuracy.

        Args:
            trial: Optuna trial used to draw the parameter values.

        Returns:
            Mean accuracy over the ``TimeSeriesSplit`` folds.

        Raises:
            ValueError: If a search-space spec has an unsupported ``"type"``.
        """
        params = {}

        for param_name, param_range in self.params.items():
            param_type = param_range["type"]
            if param_type == "int":
                params[param_name] = trial.suggest_int(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_type == "float":
                params[param_name] = trial.suggest_float(
                    param_name, param_range["min"], param_range["max"]
                )
            elif param_type == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_range["values"]
                )
            else:
                # A typo in the spec would otherwise drop the parameter
                # silently and tune a different search space than intended.
                raise ValueError(
                    f"Unsupported parameter type {param_type!r} for {param_name!r}"
                )

        # Compared by class name so tuning other models does not require
        # CatBoostClassifier to be imported.
        if getattr(self.model, "__name__", None) == "CatBoostClassifier":
            model = self.model(**params, cat_features=self.cat_features, logging_level="Silent")
        else:
            model = self.model(**params)

        # No fit on the full data here: cross_val_score clones the estimator
        # and fits the clone on each training fold.
        scores = cross_val_score(
            model, self.train, self.target, cv=self.cv, scoring="accuracy", n_jobs=-1
        ).mean()
        return scores

    def optimize(self, n_trials):
        """Run ``n_trials`` optimization trials on the study."""
        self.study.optimize(self.objective, n_trials=n_trials)

    def best_params(self):
        """Return the parameter dict of the best trial so far."""
        return self.study.best_params

    def best_score(self):
        """Return the objective value of the best trial so far."""
        return self.study.best_value

In [13]:
# pre_game_features.remove("league")

# Hold-out split by patch: games whose patch is at or above the
# 80th-percentile patch value form the validation set, the rest the training set.
cutoff_patch = teams_train["patch"].quantile(0.8)
train_games = teams_train[teams_train["patch"] < cutoff_patch]["gameid"].unique()
valid_games = teams_train[teams_train["patch"] >= cutoff_patch]["gameid"].unique()

# Labels are selected from teams_train_target by gameid and then paired with
# the feature rows purely by position, so both tables must list the games in
# the same row order. Fail loudly instead of training on shuffled labels.
assert len(teams_train) == len(teams_train_target), "feature/target row counts differ"
assert (
    teams_train["gameid"].to_numpy() == teams_train_target["gameid"].to_numpy()
).all(), "feature and target tables are not in the same gameid order"

train_x = teams_train[teams_train["gameid"].isin(train_games)]#[pre_game_features]
valid_x = teams_train[teams_train["gameid"].isin(valid_games)]#[pre_game_features]

train_y = teams_train_target[teams_train_target["gameid"].isin(train_games)]["result"]
valid_y = teams_train_target[teams_train_target["gameid"].isin(valid_games)]["result"]

# gameid is only a grouping key, not a model feature.
train_x = train_x.drop(columns=["gameid"], errors="ignore")
valid_x = valid_x.drop(columns=["gameid"], errors="ignore")

assert len(train_x) == len(train_y), "train features and labels differ in length"
assert len(valid_x) == len(valid_y), "validation features and labels differ in length"


In [14]:
%pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [15]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    AdaBoostClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Baseline estimators with library-default hyperparameters, all seeded.
_seeded = {"random_state": SEED}
_seeded_parallel = {"random_state": SEED, "n_jobs": -1}

models = [
    LogisticRegression(**_seeded),
    LGBMClassifier(**_seeded_parallel),
    RandomForestClassifier(**_seeded_parallel),
    HistGradientBoostingClassifier(**_seeded),
    AdaBoostClassifier(**_seeded),
    SVC(**_seeded),
    XGBClassifier(**_seeded_parallel),
    CatBoostClassifier(verbose=0, **_seeded),
]

# NOTE(review): TimeSeriesSplit treats row order as time order - confirm that
# train_x is sorted chronologically.
tscv = TimeSeriesSplit(n_splits=5)

# A model that fails is reported and skipped so the rest still get scored.
for model in models:
    try:
        scores = cross_val_score(
            estimator=model,
            X=train_x,
            y=train_y,
            scoring="accuracy",
            cv=tscv,
            n_jobs=-1,
        )
        print(f"{model.__class__.__name__} : {np.mean(scores)}, {np.std(scores)}")
    except Exception as e:
        print(f"Error with {model.__class__.__name__}: {e}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



LogisticRegression : 0.6788298691301001, 0.024866115144846726
LGBMClassifier : 0.7288683602771362, 0.05079524874586115
RandomForestClassifier : 0.6729792147806004, 0.0353763586861277
HistGradientBoostingClassifier : 0.7324095458044649, 0.051555382017624135
AdaBoostClassifier : 0.6792917628945342, 0.030648622657412368
SVC : 0.6762124711316397, 0.03237505913669147
Error with XGBClassifier: 'super' object has no attribute '__sklearn_tags__'
CatBoostClassifier : 0.722247882986913, 0.03889975573308902


In [18]:
from sklearn.metrics import classification_report

# Model list (same line-up as the CV cell, but SVC has probability=True).
_seeded = {"random_state": SEED}
_seeded_parallel = {"random_state": SEED, "n_jobs": -1}

models = [
    LogisticRegression(**_seeded),
    LGBMClassifier(**_seeded_parallel),
    RandomForestClassifier(**_seeded_parallel),
    HistGradientBoostingClassifier(**_seeded),
    AdaBoostClassifier(**_seeded),
    SVC(probability=True, **_seeded),
    XGBClassifier(**_seeded_parallel),
    CatBoostClassifier(verbose=0, **_seeded),
]

# Fit each model on the training split and print its report on the
# validation split.
for model in models:
    model_name = type(model).__name__
    valid_pred = model.fit(train_x, train_y).predict(valid_x)

    print(f"\n=== {model_name} ===")
    print(classification_report(valid_y, valid_pred))



=== LogisticRegression ===
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      1060
           1       0.70      0.75      0.72      1054

    accuracy                           0.72      2114
   macro avg       0.72      0.72      0.72      2114
weighted avg       0.72      0.72      0.72      2114

[LightGBM] [Info] Number of positive: 3903, number of negative: 3896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12339
[LightGBM] [Info] Number of data points in the train set: 7799, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500449 -> initscore=0.001795
[LightGBM] [Info] Start training from score 0.001795

=== LGBMClassifier ===
              precision    recall  f1-score   support

           0       0.83      0.78      0.80      1060
           1       0.7

In [19]:
from sklearn.metrics import accuracy_score
results = dict()

# Refit every model on the training split and record its accuracy on the
# validation split, keyed by the model's class name.
for model in models:
    model_name = type(model).__name__
    valid_pred = model.fit(train_x, train_y).predict(valid_x)
    results[model_name] = score = accuracy_score(valid_y, valid_pred)

# Print one line per model, in the order the models were evaluated.
print("Validation Accuracy Scores:")
for name, score in results.items():
    print(f"{name}: {score:.4f}")

[LightGBM] [Info] Number of positive: 3903, number of negative: 3896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12339
[LightGBM] [Info] Number of data points in the train set: 7799, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500449 -> initscore=0.001795
[LightGBM] [Info] Start training from score 0.001795
Validation Accuracy Scores:
LogisticRegression: 0.7167
LGBMClassifier: 0.8098
RandomForestClassifier: 0.7204
HistGradientBoostingClassifier: 0.8098
AdaBoostClassifier: 0.7540
SVC: 0.7114
XGBClassifier: 0.7952
CatBoostClassifier: 0.8032


In [17]:
# from catboost import CatBoostClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# # CatBoostClassifier 모델 정의
# model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, logging_level="Silent")

# # 모델 학습
# model.fit(train_x, train_y)

# # Train 데이터 예측
# train_pred = model.predict(train_x)
# train_accuracy = accuracy_score(train_y, train_pred)
# train_precision = precision_score(train_y, train_pred)
# train_recall = recall_score(train_y, train_pred)
# train_f1 = f1_score(train_y, train_pred)
# train_roc_auc = roc_auc_score(train_y, model.predict_proba(train_x)[:, 1])

# print("Train Performance:")
# print(f"Accuracy: {train_accuracy}")
# print(f"Precision: {train_precision}")
# print(f"Recall: {train_recall}")
# print(f"F1 Score: {train_f1}")
# print(f"ROC-AUC: {train_roc_auc}")

# # Validation 데이터 예측
# valid_pred = model.predict(valid_x)
# valid_accuracy = accuracy_score(valid_y, valid_pred)
# valid_precision = precision_score(valid_y, valid_pred)
# valid_recall = recall_score(valid_y, valid_pred)
# valid_f1 = f1_score(valid_y, valid_pred)
# valid_roc_auc = roc_auc_score(valid_y, model.predict_proba(valid_x)[:, 1])

# print("\nValidation Performance:")
# print(f"Accuracy: {valid_accuracy}")
# print(f"Precision: {valid_precision}")
# print(f"Recall: {valid_recall}")
# print(f"F1 Score: {valid_f1}")
# print(f"ROC-AUC: {valid_roc_auc}")
