In [8]:
# データ処理と可視化のライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 機械学習のライブラリ
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Gradient-boosting libraries (XGBoost, LightGBM, CatBoost) and Optuna
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from optuna.integration import LightGBMPruningCallback
import optuna
import optuna.visualization as vis

# モデルの保存とSHAP値の計算のライブラリ
from joblib import dump
from joblib import load
from joblib import dump
import shap

In [12]:
# Load the preprocessed training/test tables and the raw test file
# (the raw file's `id` column is used later when building the submission).
train_data = pd.read_csv('../data_processed/train_data_processed.csv')
test_data = pd.read_csv('../data_processed/test_data_processed.csv')
submit_data = pd.read_csv('../data/test.csv')

# Separate the regression target from the feature columns.
y = train_data['attendance']
X = train_data.drop(columns=['attendance'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Restore the three previously trained models from ../models
# (presumably Optuna-tuned, judging by the file names).

# LightGBM: stored as a joblib dump. joblib.load unpickles the file, so only
# load model files that come from a trusted source.
LightGBM = load('../models/LightGBM(optuna)')

# CatBoost: create an empty regressor, then load the saved model into it.
CatBoost = CatBoostRegressor()
CatBoost.load_model('../models/CatBoost(optuna)')

# XGBoost: restored as a native Booster (not the sklearn wrapper).
XGBoost = xgb.Booster()
XGBoost.load_model('../models/XGBoost(optuna)')

In [15]:
# --- Evaluate the ensemble on the hold-out split ---------------------------

# Predict on the hold-out split with CatBoost and LightGBM.
prediction1 = CatBoost.predict(X_test)
prediction2 = LightGBM.predict(X_test)

# The native XGBoost Booster predicts from a DMatrix, so wrap the frame first.
dtest = xgb.DMatrix(X_test)
prediction3 = XGBoost.predict(dtest)

# Ensemble: unweighted mean of the three models' predictions.
predictions = (prediction1 + prediction2 + prediction3) / 3

# RMSE on the hold-out split.
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root of the MSE explicitly. For this
# single-target problem the value is identical, and it works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

# --- Predict on the competition test set -----------------------------------

# Same three models, applied to the preprocessed test features.
test_predictions1 = CatBoost.predict(test_data)
test_predictions2 = LightGBM.predict(test_data)

# Wrap the test features in a DMatrix for the XGBoost Booster.
dtest_data = xgb.DMatrix(test_data)
test_predictions3 = XGBoost.predict(dtest_data)

# Ensemble: unweighted mean of the three models' predictions.
test_predictions = (test_predictions1 + test_predictions2 + test_predictions3) / 3

# Build the submission frame: ids from the raw test file, averaged predictions.
# NOTE(review): assumes test_data rows are in the same order as submit_data —
# confirm against the preprocessing step.
submission = pd.DataFrame({
    'id': submit_data['id'],
    'attendance': test_predictions
})

# Write the submission to ../submission as CSV with no header row and no index.
submission.to_csv('../submission/14ensemble.csv', index=False, header=False)



RMSE: 932.2617252711383
