In [None]:
!pip install ace_tools_open


In [None]:
!pip list



In [None]:
!pip install pytorch_tabnet

In [None]:
!pip install torch==2.5.1

In [None]:
import traceback
from itertools import product

import ace_tools_open as tools
import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold


# Load the train/validation split and the held-out test split.
# NOTE(review): these paths assume Google Drive is already mounted at
# /content/drive — nothing in this notebook mounts it.
DATA_DIR = "/content/drive/MyDrive"
data = pd.read_csv(f"{DATA_DIR}/train_val.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Earlier feature subsets that were tried:
# features =  ['away_prob_5', 'HTAG_5', 'B365H', 'PSH', 'HSRA', 'ASRA']
# features =  ['B365H', 'PSH', 'HSRA', 'ASRA']
# features =  ['away_prob_5', 'HTAG_5', 'B365H', 'HSRA', 'ASRA']
# features =  ['away_prob_5', 'elo_home', 'HSA', 'B365H', 'PSD', 'B365D']

# Full feature set.
features = [
    'home_prob_5', 'away_prob_5', 'xg_home_5', 'xg_away_5', 'HTHG_5', 'HTAG_5', 'HST_5', 'AST_5',
    'elo_home', 'elo_away', 'home_advantage', 'HPA', 'APA', 'HSA', 'ASA',
    'B365H', 'B365D', 'B365A', 'PSH', 'PSD', 'PSA', 'HSRA', 'ASRA',
]

target = 'result'

# Integer class id assigned to each outcome label.
LABEL_TO_ID = {'home': 0, 'away': 1, 'draw': 2}


def _encode_labels(labels):
    """Convert a Series of outcome labels into an int64 NumPy array of class ids.

    Labels found in LABEL_TO_ID are mapped to their id; values that are already
    valid ids are kept as they are. Anything else (including missing values)
    raises instead of silently ending up in the target array.

    Unlike ``Series.replace`` with a str -> int dict, this does not rely on
    pandas silently downcasting the object column to an integer dtype.

    Raises:
        ValueError: if a value is neither a known label nor a known class id.
    """
    encoded = labels.map(lambda value: LABEL_TO_ID.get(value, value))
    is_unknown = ~encoded.isin(list(LABEL_TO_ID.values()))
    if is_unknown.any():
        offenders = sorted({str(value) for value in labels[is_unknown]})
        raise ValueError(f"Unexpected values in target column '{labels.name}': {offenders}")
    return encoded.astype(np.int64).to_numpy()


# Train/validation data (used for pretraining and cross-validation).
X = data[features].values
y = _encode_labels(data[target])

# Held-out test data.
X_t = test[features].values
y_t = _encode_labels(test[target])

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# TabNet hyperparameter candidates. The trailing comments presumably record
# the previously selected value for each parameter — confirm with the author.
n_d_list = [4] #4
n_a_list = [4] #4
n_steps_list = [2]#2
gamma_list = [0.9] #0.9
lr_list = [0.04] #0.04
batch_size_list = [32] #32

# One tuple per evaluated hyperparameter combination is appended here.
results = []

# Every possible combination of the candidate values (grid search).
param_combinations = list(product(n_d_list, n_a_list, n_steps_list, gamma_list, lr_list, batch_size_list))

def _build_classifier(n_d, n_a, n_steps, gamma, lr):
    """Create an untrained TabNetClassifier for one hyperparameter combination.

    Used for both the per-fold models and the final model so that the two are
    guaranteed to share the same configuration.
    """
    return TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        #optimizer_fn=None,
        optimizer_params={"lr": lr},
        mask_type='sparsemax',
        # scheduler_params={"step_size":20, "gamma":0.9},
        # scheduler_fn=torch.optim.lr_scheduler.StepLR,
        verbose=0,
        seed=42,
    )


def _score(y_true, y_pred):
    """Return (accuracy, balanced accuracy, weighted F1) for the predictions."""
    return (
        accuracy_score(y_true, y_pred),
        balanced_accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
    )


# Grid search: for every combination, pretrain once, estimate performance with
# stratified K-fold CV, then refit on all training data and score the test set.
for n_d, n_a, n_steps, gamma, lr, batch_size in param_combinations:
    try:
        # Self-supervised pretraining whose weights initialise every classifier below.
        pretrainer = TabNetPretrainer(
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            mask_type='sparsemax',
            optimizer_params={'lr': lr},
            verbose=0,
            seed=42
        )

        # NOTE(review): the pretrainer is fit on all of X, which includes the
        # rows later used as validation folds, and its eval_set is the same
        # data it trains on. Consider pretraining per fold on X_train only.
        pretrainer.fit(
            X_train=X,
            eval_set=[X],
            max_epochs=100,
            batch_size=batch_size,
            virtual_batch_size=16,
            patience=10,
            num_workers=0,
            drop_last=False
        )

        # Training options shared by the per-fold fits and the final fit.
        fit_options = dict(
            max_epochs=100,
            patience=10,
            batch_size=batch_size,
            virtual_batch_size=16,
            num_workers=0,
            drop_last=False,
            from_unsupervised=pretrainer,  # load the pretrained weights
        )

        acc_scores = []
        bal_acc_scores = []
        f1_scores = []

        for train_idx, test_idx in kf.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = _build_classifier(n_d, n_a, n_steps, gamma, lr)

            # NOTE(review): the validation fold is passed as eval_set and is
            # also the fold being scored below, so the CV scores are likely
            # optimistic if early stopping selects on it.
            model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                eval_name=['valid'],
                eval_metric=['accuracy'],
                **fit_options
            )

            # Predict on the validation fold.
            model.network.eval()
            y_pred = model.predict(X_test)

            fold_acc, fold_bal, fold_f1 = _score(y_test, y_pred)
            acc_scores.append(fold_acc)
            bal_acc_scores.append(fold_bal)
            f1_scores.append(fold_f1)

        # Cross-validation averages.
        avg_acc = np.mean(acc_scores)
        avg_bal = np.mean(bal_acc_scores)
        avg_f1 = np.mean(f1_scores)

        # Refit on the full training data. No eval_set is passed here, so
        # presumably `patience` has nothing to monitor — confirm.
        model_t = _build_classifier(n_d, n_a, n_steps, gamma, lr)
        model_t.fit(X, y, **fit_options)

        # Predict on the held-out test set.
        model_t.network.eval()
        y_pred_t = model_t.predict(X_t)

        acc_t, bal_t, f1_t = _score(y_t, y_pred_t)

        results.append((n_d, n_a, n_steps, gamma, lr, batch_size, avg_acc, avg_bal, avg_f1, acc_t, bal_t, f1_t))

    except Exception as e:
        # Best-effort search: a failing combination is reported and skipped,
        # with the full traceback so the cause is not lost.
        print(f"Error with params {n_d, n_a, n_steps, gamma, lr, batch_size} due to {e}")
        traceback.print_exc()

# Results table: one row per hyperparameter combination that completed.
results_df = pd.DataFrame(
    results,
    columns=['n_d','n_a','n_steps','gamma','lr','batch_size',
             'avg_acc','avg_bal','avg_f1','acc_t','bal_t','f1_t']
)

# Rank candidates by mean cross-validated accuracy. Ranking by the test-set
# score (acc_t) would select hyperparameters on the held-out data, so the
# *_t columns are kept for reporting only.
results_df = results_df.sort_values(by='avg_acc', ascending=False)

# Display (ace_tools_open is imported as `tools` in the import block).
tools.display_dataframe_to_user(name='TabNet Hyperparameter Tuning Results', dataframe=results_df)

# Uncomment once tuning has identified the best hyperparameters -> prints the feature importances
# feature_importance = pd.DataFrame({
#     "feature": features,
#     "importance": model_t.feature_importances_
# }).sort_values(by="importance", ascending=False)

# print(feature_importance)

# clf: a TabNetClassifier that has finished training (named model_t in this notebook)
# torch.save(model_t.network.state_dict(), "/content/drive/MyDrive/tabnet_model_5759.pth")