参考：https://www.salesanalytics.co.jp/datascience/datascience158/

# ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# パイプライン構築
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# 変換器
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer
import xgboost as xgb

# データの読み込み・確認

In [2]:
# Load the dataset; rows are indexed by the parsed `datetime` column.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
df = pd.read_csv(url, index_col='datetime', parse_dates=True)
# Features (explanatory variables): everything except 'casual', 'registered' and 'count'.
X = df.drop(columns=['casual', 'registered', 'count'])
# Target variable: the 'casual' column.
y = df['casual']

In [3]:
# Preview the first rows of the target series.
y.head()

datetime
2011-01-01 00:00:00    3
2011-01-01 01:00:00    8
2011-01-01 02:00:00    5
2011-01-01 03:00:00    3
2011-01-01 04:00:00    0
Name: casual, dtype: int64

In [4]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0


In [5]:
# 量的変数
nums = ['temp','atemp','humidity','windspeed']
# 質的変数
cats = ['season','holiday','workingday','weather']

In [6]:
# Preview the quantitative (numeric) columns only.
X[nums].head()

Unnamed: 0_level_0,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01 00:00:00,9.84,14.395,81,0.0
2011-01-01 01:00:00,9.02,13.635,80,0.0
2011-01-01 02:00:00,9.02,13.635,80,0.0
2011-01-01 03:00:00,9.84,14.395,75,0.0
2011-01-01 04:00:00,9.84,14.395,75,0.0


In [7]:
# Preview the qualitative (categorical) columns only.
X[cats].head()

Unnamed: 0_level_0,season,holiday,workingday,weather
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01 00:00:00,1,0,0,1
2011-01-01 01:00:00,1,0,0,1
2011-01-01 02:00:00,1,0,0,1
2011-01-01 03:00:00,1,0,0,1
2011-01-01 04:00:00,1,0,0,1


In [8]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# パイプラインの構築

## 量的変数のみ

パイプライン構築方法
- make_pipelineで構築  
  ステップ名を指定しない（クラス名から自動で付与される）  
- Pipelineで構築  
  ステップ名を自分で指定する

### make_pipelineで構築

In [9]:
# Pipeline definition for the numeric columns. Steps, in order:
#   1. SimpleImputer  - fill missing values with the column mean
#   2. StandardScaler - standardize each feature (zero mean, unit variance)
#   3. XGBRegressor   - XGBoost regressor as the final estimator
num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
    xgb.XGBRegressor(),
)

In [12]:
# Fit the pipeline on the numeric training columns
num_pipeline.fit(X_train[nums], y_train) 

In [14]:
# Predict the target variable y for the test set
pred_y = num_pipeline.predict(X_test[nums])

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.29607110087009625

### Pipelineで構築

In [16]:
# Same numeric pipeline as above, built with Pipeline so every step has an explicit name.
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),  # fill missing values with the column mean
    ("scale", StandardScaler()),  # standardize each feature
    ("regressor", xgb.XGBRegressor()),  # XGBoost regressor as the final estimator
])

In [17]:
# Fit the pipeline on the numeric training columns
num_pipeline.fit(X_train[nums], y_train)

In [18]:
# Predict the target variable y for the test set
pred_y = num_pipeline.predict(X_test[nums])

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.29607110087009625

## 質的変数のみ

### make_pipelineで構築

In [22]:
# Pipeline definition for the categorical columns. Steps, in order:
#   1. SimpleImputer - fill missing values with the most frequent value
#   2. OneHotEncoder - dummy-code the categories as a dense array
#   3. XGBRegressor  - XGBoost regressor as the final estimator
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    xgb.XGBRegressor(),
)

In [23]:
# Fit the pipeline on the categorical training columns
cat_pipeline.fit(X_train[cats], y_train)

In [24]:
# Predict the target variable y for the test set
pred_y = cat_pipeline.predict(X_test[cats])

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.21983990337434034

### Pipelineで構築

In [25]:
# Same categorical pipeline as above, built with Pipeline so every step has an explicit name.
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),  # fill with the most frequent value
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),  # dummy coding
    ("regressor", xgb.XGBRegressor()),  # XGBoost regressor as the final estimator
])

In [26]:
# Fit the pipeline on the categorical training columns
cat_pipeline.fit(X_train[cats], y_train)

In [27]:
# Predict the target variable y for the test set
pred_y = cat_pipeline.predict(X_test[cats])

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.21983990337434034

## 量的変数と質的変数が混合

### 変換器
個々の変換器を定義し、それらをColumnTransformerで連結して全体の変換器を完成させる

In [28]:
# Transformer-only pipeline for the quantitative (numeric) columns:
# mean imputation followed by standardization (no estimator at the end).
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler()),
])

# Transformer-only pipeline for the qualitative (categorical) columns:
# most-frequent imputation followed by one-hot encoding (no estimator at the end).
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

In [30]:
# Combined transformer: route the numeric columns through num_pipeline and the
# categorical columns through cat_pipeline.
# `remainder` controls the columns not listed in any transformer:
#   "drop"        - discard those columns
#   "passthrough" - output those columns unchanged
trans = ColumnTransformer(
    [
        ("num_trans", num_pipeline, nums),
        ("cat_trans", cat_pipeline, cats),
    ],
    remainder="drop",
)

In [33]:
# Fit the combined transformer on the training features and transform them
X_transformed = trans.fit_transform(X_train)

# Show the transformed array
print(X_transformed)

[[-1.43568905 -1.71483781 -1.55587057 ...  0.          0.
   0.        ]
 [ 0.24126905  0.24576744  0.3177967  ...  0.          0.
   0.        ]
 [ 0.76531845  0.6021342   1.0984914  ...  0.          0.
   0.        ]
 ...
 [-1.12125941 -1.18028767 -0.72312956 ...  0.          0.
   0.        ]
 [-0.59721    -0.46755414 -2.33656527 ...  0.          0.
   0.        ]
 [ 0.24126905  0.24576744  1.35872296 ...  1.          0.
   0.        ]]


### 推定器と連結

In [34]:
# Full pipeline: the column-wise preprocessor followed by the XGBoost regressor.
full_pipeline = Pipeline([
    ("preprocessor", trans),
    ("regressor", xgb.XGBRegressor()),
])

In [35]:
# Fit the full pipeline on all training features
full_pipeline.fit(X_train, y_train)

In [36]:
# Predict the target variable y for the test set
pred_y = full_pipeline.predict(X_test)

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.6229186379215021

## ハイパラチューニング

パイプライン内の変換器や推定器の名前（ステップ名）を、探索するハイパーパラメータ名の頭に `__`（アンダースコア2つ）で区切って付ける（例：`regressor__max_depth`）

In [37]:
# Search grid. Each key is "<pipeline step name>__<parameter name>", so these
# parameters are applied to the "regressor" step of the pipeline.
params = {
    'regressor__max_depth': list(range(2, 11, 2)),  # 2, 4, 6, 8, 10
    'regressor__n_estimators': list(range(10, 100, 10)),  # 10, 20, ..., 90
    'regressor__min_child_weight': [1, *range(2, 11, 2)],  # 1, 2, 4, 6, 8, 10
}

In [38]:
# Create the grid search over `params` for the full pipeline:
# 10-fold cross-validation, run in parallel on all available cores.
gs = GridSearchCV(estimator=full_pipeline, param_grid=params, cv=10, n_jobs=-1)

# Run the grid search on the training data.
gs.fit(X_train, y_train)

In [39]:
# Show the best hyperparameter combination found by the grid search
print(gs.best_params_)

{'regressor__max_depth': 6, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 30}


In [40]:
# Best model (pipeline) found by the grid search
best_pipeline = gs.best_estimator_

# Predict the target variable y for the test set
pred_y = best_pipeline.predict(X_test)

# R2 (coefficient of determination) on the test set
r2_score(y_test, pred_y)

0.6291158513235733