In [None]:
from models import BootstrapModel
import pandas as pd
import numpy as np
import os

import my_functions as mf
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from skopt.space import Real, Categorical, Integer

from xgboost import XGBClassifier

import joblib

In [None]:
# Read the training and test tables from CSV files (paths are relative to
# the notebook's working directory).
train_csv_path = "../data/train.csv"
test_csv_path = "../data/test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Separate the label column from the feature columns of the training table.
target_column = "ProdTaken"

# Features: every column except the label. `columns=` already selects the
# axis, so no explicit `axis` argument is needed.
x_train = train.drop(columns=[target_column])

# Label: cast to float.
y_train = train[target_column].astype(float)

In [None]:
# Preprocessing

# train_test_split is not needed here because it is run inside the BootstrapModel class.
# A scaler is not needed here either.

In [None]:
# Convert the pandas objects to NumPy arrays.
#
# np.asarray is used instead of .to_numpy() so that this cell is safe to
# re-run: the names are rebound to their own converted value, so on a second
# execution they already hold ndarrays (which have no .to_numpy method).
# np.asarray converts a DataFrame/Series to an array and returns an existing
# ndarray unchanged.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
test = np.asarray(test)

# 推論

* モデルの名前を変更する！
* しっかりと .test_pred の第二引数を設定する。（.val_pred の引数と一緒でいい）

In [None]:
# Build a BootstrapModel around a StandardScaler, an RBF-kernel SVC with
# probability estimates enabled, and roc_auc_score.
# NOTE(review): unlike the other model cells in this notebook, no
# random_state is passed here — confirm this is intentional.
svm = BootstrapModel(
    x_train,
    y_train,
    StandardScaler(),
    SVC(kernel='rbf', probability=True),
    roc_auc_score,
)

# Load a saved object with joblib and attach it as the wrapper's
# bstrap_models_dict (presumably the already-fitted bootstrap models — verify).
# NOTE(review): the file name in front of ".pkl" is empty, which looks like a
# placeholder to be replaced with the saved SVM file — confirm.
# joblib.load unpickles the file, so only load files from a trusted source.
model_save_path = '../model_save/.pkl'
svm.bstrap_models_dict = joblib.load(model_save_path)

# Predict on the test array; as the last expression of the cell, the return
# value (if any) is displayed.
# NOTE(review): the meaning of the arguments 10 and True is defined by
# BootstrapModel.test_pred, which is not visible here — confirm.
svm.test_pred(test, 10, True)

In [None]:
# Build a BootstrapModel around a StandardScaler, a liblinear
# LogisticRegression, and roc_auc_score, with random_state fixed to 42.
logistic = BootstrapModel(
    x_train,
    y_train,
    StandardScaler(),
    LogisticRegression(solver='liblinear'),
    roc_auc_score,
    random_state=42,
)

# Load a saved object with joblib and attach it as the wrapper's
# bstrap_models_dict (presumably the already-fitted bootstrap models — verify).
# NOTE(review): the file name in front of ".pkl" is empty, which looks like a
# placeholder to be replaced with the saved logistic-regression file — confirm.
# joblib.load unpickles the file, so only load files from a trusted source.
model_save_path = '../model_save/.pkl'
logistic.bstrap_models_dict = joblib.load(model_save_path)

# Predict on the test array; as the last expression of the cell, the return
# value (if any) is displayed.
# NOTE(review): the meaning of the arguments 10 and True is defined by
# BootstrapModel.test_pred, which is not visible here — confirm.
logistic.test_pred(test, 10, True)

In [None]:
# Build a BootstrapModel around a StandardScaler, a default
# KNeighborsClassifier, and roc_auc_score, with random_state fixed to 42.
knn = BootstrapModel(
    x_train,
    y_train,
    StandardScaler(),
    KNeighborsClassifier(),
    roc_auc_score,
    random_state=42,
)

# Load a saved object with joblib and attach it as the wrapper's
# bstrap_models_dict (presumably the already-fitted bootstrap models — verify).
# NOTE(review): the file name in front of ".pkl" is empty, which looks like a
# placeholder to be replaced with the saved k-NN file — confirm.
# joblib.load unpickles the file, so only load files from a trusted source.
model_save_path = '../model_save/.pkl'
knn.bstrap_models_dict = joblib.load(model_save_path)

# Predict on the test array; as the last expression of the cell, the return
# value (if any) is displayed.
# NOTE(review): the meaning of the arguments 10 and True is defined by
# BootstrapModel.test_pred, which is not visible here — confirm.
knn.test_pred(test, 10, True)

In [None]:
# Build a BootstrapModel around a StandardScaler, a default XGBClassifier,
# and roc_auc_score, with random_state fixed to 42.
xgb = BootstrapModel(
    x_train,
    y_train,
    StandardScaler(),
    XGBClassifier(),
    roc_auc_score,
    random_state=42,
)

# Load a saved object with joblib and attach it as the wrapper's
# bstrap_models_dict (presumably the already-fitted bootstrap models — verify).
# NOTE(review): the file name in front of ".pkl" is empty, which looks like a
# placeholder to be replaced with the saved XGBoost file — confirm.
# joblib.load unpickles the file, so only load files from a trusted source.
model_save_path = '../model_save/.pkl'
xgb.bstrap_models_dict = joblib.load(model_save_path)

# Predict on the test array; as the last expression of the cell, the return
# value (if any) is displayed.
# NOTE(review): the meaning of the arguments 10 and True is defined by
# BootstrapModel.test_pred, which is not visible here — confirm.
xgb.test_pred(test, 10, True)

# アンサンブル

最後に各モデルの推論値の平均をとって出力する

In [None]:
# Average the per-model submission files element-wise and write the result
# next to them as ensemble_pred.csv.
submission_dir = '../submissions'
output_name = 'ensemble_pred.csv'
output_path = os.path.join(submission_dir, output_name)

# Collect every CSV in the directory EXCEPT a previous ensemble output: the
# result is written into this same directory, so without the exclusion a
# re-run of this cell would average the old ensemble in as if it were another
# model. Sorting makes the read order (and so the floating-point summation
# order) independent of the arbitrary order os.listdir returns.
csv_files = sorted(
    f for f in os.listdir(submission_dir)
    if f.endswith('.csv') and f != output_name
)
if not csv_files:
    # Without this guard, sum([]) is 0 and the division below would fail
    # with an unhelpful ZeroDivisionError.
    raise FileNotFoundError(f"No submission CSV files found in {submission_dir}")

# Read each file as a header-less table.
dataframes = [
    pd.read_csv(os.path.join(submission_dir, csv_file), header=None)
    for csv_file in csv_files
]

# Adding frames of different shapes aligns on labels and silently fills the
# non-overlapping cells with NaN, so fail loudly instead.
shapes = {frame.shape for frame in dataframes}
if len(shapes) != 1:
    raise ValueError(
        f"Submission files in {submission_dir} have differing shapes: {sorted(shapes)}"
    )

# Element-wise sum of all frames, then the mean.
# NOTE(review): every column is averaged, including any identifier column;
# an integer column comes out as float after the division — confirm this
# matches the expected submission format.
sum_df = sum(dataframes)
ensemble_df = sum_df / len(dataframes)

# Write the averaged predictions without index or header, as they were read.
ensemble_df.to_csv(output_path, index=False, header=False)

print(f"Ensemble predictions saved to {output_path}")