## グリッドサーチの使用

### データの読み込み

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_excel("./2019-summer-match-data-OraclesElixir-2019-11-10.xlsx")

In [2]:
df.head()

Unnamed: 0,gameid,url,league,split,date,week,game,patchno,playerid,side,...,gdat15,xpat10,oppxpat10,xpdat10,csat10,oppcsat10,csdat10,csat15,oppcsat15,csdat15
0,1070340,https://matchhistory.euw.leagueoflegends.com/e...,WC,2019-W,43740.286146,PI-RR,1,9.19,1,Blue,...,-798,4530,5051,-521,76,90,-14,116,122,-6
1,1070340,https://matchhistory.euw.leagueoflegends.com/e...,WC,2019-W,43740.286146,PI-RR,1,9.19,2,Blue,...,-1366,3679,3928,-249,60,64,-4,88,105,-17
2,1070340,https://matchhistory.euw.leagueoflegends.com/e...,WC,2019-W,43740.286146,PI-RR,1,9.19,3,Blue,...,-629,4751,4533,218,88,85,3,131,150,-19
3,1070340,https://matchhistory.euw.leagueoflegends.com/e...,WC,2019-W,43740.286146,PI-RR,1,9.19,4,Blue,...,-1829,3526,3532,-6,87,76,11,119,134,-15
4,1070340,https://matchhistory.euw.leagueoflegends.com/e...,WC,2019-W,43740.286146,PI-RR,1,9.19,5,Blue,...,-1156,2731,2376,355,4,3,1,6,5,1


In [3]:
print(type(df))
df.shape

<class 'pandas.core.frame.DataFrame'>


(1428, 98)

### データクレンジング
- "champion", "side", "cspm", "wards", "gamelength"

In [4]:
# 欠測値の確認
df.isnull().sum()

gameid       0
url          0
league       0
split        0
date         0
            ..
oppcsat10    0
csdat10      0
csat15       0
oppcsat15    0
csdat15      0
Length: 98, dtype: int64

##### playeridが3桁の列を削除
playeridが三桁のものはチーム合計のため削除を行う

In [5]:
dropPlayer = df[ df['playerid'] >= 100 ].index
df.drop(dropPlayer, inplace=True)
df['playerid'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

### 特徴量の抽出

In [6]:
X = df[['side', 'cspm', 'wards', 'gamelength', 'teambaronkills', 'elders']]
y = df['result']

print("X is \n{}".format(X))

X is 
      side      cspm  wards  gamelength  teambaronkills  elders
0     Blue  7.085427     11   26.533333               0       0
1     Blue  4.937186     24   26.533333               0       0
2     Blue  7.236181     13   26.533333               0       0
3     Blue  9.007538      9   26.533333               0       0
4     Blue  0.527638     47   26.533333               0       0
...    ...       ...    ...         ...             ...     ...
1421   Red  8.759605     15   30.366667               0       0
1422   Red  4.544457     36   30.366667               0       0
1423   Red  7.804610     14   30.366667               0       0
1424   Red  9.121844     16   30.366667               0       0
1425   Red  2.107574     51   30.366667               0       0

[1190 rows x 6 columns]


##### one-hot エンコーディング

In [7]:
from sklearn.preprocessing import OneHotEncoder

X_ohe = pd.get_dummies(X, columns=["side"])
X = X_ohe

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
print(X_train.shape)
print(type(X_train))
print(y_train.shape)
print(type(y_train))

(833, 7)
<class 'pandas.core.frame.DataFrame'>
(833,)
<class 'pandas.core.series.Series'>


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def build_decision_tree(X, y, depth=None):
    dt = DecisionTreeClassifier(random_state=42, max_depth=depth)
    # 交差検証スコア
    print('CV score:', cross_val_score(estimator=dt, X=X, y=y).mean())
    # 決定木予測モデルを構築
    dt.fit(X, y)
    return dt

In [11]:
dt = build_decision_tree(X=X_train, y=y_train, depth=10)

CV score: 0.8811413317942428


##### GridSearchCV及びRandomizedSearchCV

In [12]:
import warnings
warnings.filterwarnings('ignore')

import scipy.stats
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


# グリッドサーチ用にモデルとパラメーターセットをまとめた辞書を用意
# 辞書のkeyにはオブジェクトのインスタンスを指定することができます
model_param_set_grid = {
    LogisticRegression(): {
        "C": [10 ** i for i in range(-5, 5)],
        "random_state": [42]
    },
    LinearSVC(): {
        "C": [10 ** i for i in range(-5, 5)],
        "multi_class": ["ovr", "crammer_singer"],
        "random_state": [42],
        "max_iter": [1000]
    },
    SVC(): {
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": [10 ** i for i in range(-5, 5)],
        "decision_function_shape": ["ovr", "ovo"],
        "random_state": [42],
        "max_iter": [1000]
    },
    DecisionTreeClassifier(): {
        "max_depth": [i for i in range(1, 20)],
    },
    RandomForestClassifier(): {
        "n_estimators": [i for i in range(10, 20)],
        "max_depth": [i for i in range(1, 10)],
    },
    KNeighborsClassifier(): {
        "n_neighbors": [i for i in range(1, 10)]
    }
}

# ランダムサーチ用にモデルとパラメーターセットをまとめた辞書を用意
model_param_set_random = {
    LogisticRegression(): {
        "C": scipy.stats.uniform(0.00001, 1000),
        "random_state": scipy.stats.randint(0, 100)
    },
    LinearSVC(): {
        "C": scipy.stats.uniform(0.00001, 1000),
        "multi_class": ["ovr", "crammer_singer"],
        "random_state": scipy.stats.randint(0, 100),
        "max_iter": [1000]
    },
    SVC(): {
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": scipy.stats.uniform(0.00001, 1000),
        "decision_function_shape": ["ovr", "ovo"],
        "random_state": scipy.stats.randint(0, 100),
        "max_iter": [1000]
    },
    DecisionTreeClassifier(): {
        "max_depth": scipy.stats.randint(1, 20),
    },
    RandomForestClassifier(): {
        "n_estimators": scipy.stats.randint(10, 100),
        "max_depth": scipy.stats.randint(1, 20),
    },
    KNeighborsClassifier(): {
        "n_neighbors": scipy.stats.randint(1, 20)
    }
}

# スコア比較用に変数を用意
max_score = 0
best_model = None
best_param = None

# グリッドサーチでパラメーターサーチ
for model, param in model_param_set_grid.items():
    clf = GridSearchCV(model, param)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average="micro")
    # 最高評価更新時にモデルやパラメーターも更新
    if max_score < score:
        max_score = score
        best_model = model.__class__.__name__
        best_param = clf.best_params_

# ランダムサーチでパラメーターサーチ
for model, param in model_param_set_random.items():
    clf = RandomizedSearchCV(model, param)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average="micro")
    # 最高評価更新時にモデルやパラメーターも更新
    if max_score < score:
        max_score = score
        best_model = model.__class__.__name__
        best_param = clf.best_params_
        
print("学習モデル:{},\nパラメーター:{}".format(best_model, best_param))
# 最も成績のいいスコアを出力してください。
print("ベストスコア:",max_score)

学習モデル:LinearSVC,
パラメーター:{'C': 465.3870932107256, 'max_iter': 1000, 'multi_class': 'ovr', 'random_state': 30}
ベストスコア: 0.9047619047619048
