In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from joblib import dump, load

# Load the three input tables in one pass: the preprocessed training set,
# the preprocessed test set, and the raw test file.
train_data, test_data, submit_data = map(
    pd.read_csv,
    (
        '../data_processed/train_data_processed.csv',
        '../data_processed/test_data_processed.csv',
        '../data/test.csv',
    ),
)

In [2]:
# Separate the regression target ('attendance') from the remaining feature columns.
y = train_data['attendance']
X = train_data.drop(columns=['attendance'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Candidate values for each random-forest hyperparameter (3 * 4 * 3 * 3 = 108 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 5-fold cross-validation.
# The scorer is the *negative* RMSE, so a larger score means a smaller error.
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_root_mean_squared_error',
                           cv=5)

# Fit the model for every parameter combination on the training split.
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Estimator that GridSearchCV refit with the best parameters.
best_model = grid_search.best_estimator_

# Predict on the held-out evaluation split.
predictions = best_model.predict(X_test)

# Compute RMSE as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so passing it raises
# TypeError on current releases; taking the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

# Predict on the preprocessed competition test set.
test_predictions = best_model.predict(test_data)

# Build the submission frame: ids from the raw test file, predicted attendance.
submission = pd.DataFrame({
    'id': submit_data['id'],
    'attendance': test_predictions
})

# Save as CSV without a header row or index column (written to the ../submission folder).
submission.to_csv('../submission/6random_forest_regression.csv', index=False, header=False)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RMSE: 4570.785193816896


In [8]:
# Persist the selected model to disk with joblib.
dump(value=best_model, filename='../predict/model.joblib')

['../predict/model.joblib']