In [55]:
import pandas as pd
import scipy as sc
import numpy as np
import sklearn
import pickle
import pathlib as Path
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
sb.set(font='IPAexGothic')

import multiprocessing
import itertools
import collections
import datetime
import gc

from tqdm._tqdm_notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 50)

data_path = Path.Path("../data")
result_path = Path.Path("../result")

prefix = 'ana301'

# データ抽出
> イニング結果単位：ana202_all_FTR_dup_df.pkl <br>
> 全体データ：ana201_all_FTR.pkl <br>

In [56]:
# イニング結果
all_dup_df = pd.read_pickle(data_path / "ana202_all_FTR_dup_df.pkl")

# 全体データ
all_df = pd.read_pickle(data_path / "ana201_all_FTR_df.pkl")

# 訓練データ抽出
> ・イニング結果データ（all_dup_df）<br>

In [57]:
train_df = all_dup_df[all_dup_df['flg_train'] == 1]
train_df.shape

(4366, 77)

# 　

# モデル構築
> ・目的変数：is_hit0 <br>
> ・説明変数：FTR_: <br>
> ・モデル <br>
>　　・ロジスティック <br>
>　　・LGBM <br>
> ・チューニング：CV

## 下準備

In [58]:
# データの分割

# 目的変数
y = train_df['is_hit0']

# 説明変数
X = train_df.filter(like='FTR_', axis=1)

In [59]:
# LightGBMのインストール
# ! pip install lightgbm

In [60]:
# ロジスティック回帰（分類ではない = 出力は確率）
from sklearn.linear_model import LogisticRegression

# LGBM
import lightgbm as lgbm

In [61]:
# チューニング
# CV
from sklearn.model_selection import GridSearchCV

In [62]:
LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
# Lightgbm
lgbm.LGBMClassifier()

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## モデル推定

### ロジスティック回帰モデル

In [64]:
# ハイパラ設定
pram_logit = {
    
    'penalty':['l1', 'l2'],
    'C':[10**i for i in range(-2, 3)],
    'max_iter':[10000],
    'random_state':[123]
}

In [65]:
# ロジスティック回帰
model_logit = GridSearchCV(
            LogisticRegression(),
            pram_logit,
            cv=20
)

In [66]:
model_logit.fit(X, y)













GridSearchCV(cv=20, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [10000],
                         'penalty': ['l1', 'l2'], 'random_state': [123]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [67]:
model_logit.best_estimator_

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
# 精度の確認
model_logit.best_estimator_.score(X, y)

0.951442968392121

### LIGHTgbm

In [69]:
# ハイパラ設定
pram_lgbm = {

    'max_depth':[i for i in range(2, 20)],
    'n_estimators':[1000],
    'num_leaves':[2**i for i in range(5, 10)],
    'random_state':[123]
        
}

In [70]:
# モデル推定
# model_lgbm = GridSearchCV(
#
#            lgbm.LGBMClassifier(),
#            pram_lgbm,
#            cv=4
#)

In [71]:
# %%time
# model_lgbm.fit(X, y)

In [72]:
# model_lgbm.best_estimator_

In [73]:
# 精度の確認
# model_lgbm.best_estimator_.score(X, y)

# 　

# モデル予測
ヒット確率は特徴量の一つにする
> ・ロジスティック回帰（logit）<br>
> ・LGBM <br>

### ロジスティック回帰モデル

In [74]:
# 全体データ（not イニングデータ）
logit_score = model_logit.best_estimator_.predict_proba(all_df.filter(like="FTR_", axis=1))

In [75]:
# DFに加工
FTR_prob_logit = pd.DataFrame(
                    
                    # ヒット確率
                    logit_score[:,1]
        
                # 列名変更
                ).rename(columns={0:'FTR_prob_logit'})

FTR_prob_logit

Unnamed: 0,FTR_prob_logit
0,0.197120
1,0.076125
2,0.066053
3,0.008747
4,0.926282
...,...
54203,0.115175
54204,0.783062
54205,0.947061
54206,0.985848


### LIGHTgbm

In [76]:
# 全体データ（not イニングデータ）
# lgbm_score = model_lgbm.best_estimator_.predict_proba(all_df.filter(like="FTR_", axis=1))

In [77]:
# lgbm_score

In [78]:
# DFに加工
# FTR_prob_lgbm = pd.DataFrame(
#                    
#                    # ヒット確率
#                    lgbm_score[:,1]
#        
#                # 列名変更
#                ).rename(columns={0:'FTR_prob_lgbm'})
#
# FTR_prob_lgbm

# 　

# データ結合
> ・ヒット予測確率を特徴量として全体データ（all_df）に結合

In [79]:
# データ結合
all_FTR_df = pd.concat(

                # 順番は変わっていないので、単純に横結合
                [
                    # 全体データ
                    all_df, 
                    
                    # ロジットスコア
                    FTR_prob_logit
                    
                    # LIGHTgbm
                    # FTR_prob_lgbm
                
                ],

                axis=1
            )

In [80]:
all_FTR_df

Unnamed: 0,id,gameID,inning,batter,pitcher,y,is_hit0,is_hit1,is_hit2,is_hit3,is_hit4,flg_train,FTR_cnt_B,FTR_cnt_O,FTR_cnt_S,FTR_pting_cnt,FTR_inning,FTR_bt_btingavg00,FTR_bt_btingavg01,FTR_bt_btingavg02,FTR_bt_btingavg03,FTR_bt_btingavg04,FTR_bt_btingavg10,FTR_bt_btingavg11,FTR_bt_btingavg12,...,FTR_pt_btingavg11,FTR_pt_btingavg12,FTR_pt_btingavg13,FTR_pt_btingavg14,FTR_pt_btingavg20,FTR_pt_btingavg21,FTR_pt_btingavg22,FTR_pt_btingavg23,FTR_pt_btingavg24,FTR_pt_btingavg30,FTR_pt_btingavg31,FTR_pt_btingavg32,FTR_pt_btingavg33,FTR_pt_btingavg34,FTR_pt_btingavg40,FTR_pt_btingavg41,FTR_pt_btingavg42,FTR_pt_btingavg43,FTR_pt_btingavg44,FTR_pt_btingavg50,FTR_pt_btingavg51,FTR_pt_btingavg52,FTR_pt_btingavg53,FTR_pt_btingavg54,FTR_prob_logit
0,0,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1,0,0,0,1,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.197120
1,1,20202173,1回表,ピレラ,今永 昇太,1.0,False,False,False,False,False,1,1,0,0,2,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.076125
2,2,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1,1,0,1,3,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.111111,0.111111,0.000000,0.0,0.000000,0.066053
3,3,20202173,1回表,ピレラ,今永 昇太,2.0,False,False,False,False,False,1,2,0,1,4,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.008747
4,4,20202173,1回表,ピレラ,今永 昇太,4.0,True,True,False,False,False,1,2,0,2,5,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.500000,0.500000,0.000000,0.0,0.000000,0.926282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54203,33803,20202023,5回表,坂口 智隆,床田 寛樹,,False,False,False,False,False,0,0,0,1,2,5,0.226006,0.154799,0.043344,0.000000,0.030960,0.250000,0.200000,0.025000,...,0.714286,0.142857,0.000000,0.142857,0.666667,0.666667,0.000000,0.000000,0.000000,0.714286,0.714286,0.142857,0.0,0.142857,0.714286,0.571429,0.142857,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.115175
54204,33804,20202640,9回表,メヒア,堀岡 隼人,,False,False,False,False,False,0,0,0,0,1,9,0.269122,0.184136,0.036827,0.005666,0.042493,0.121212,0.090909,0.000000,...,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.783062
54205,33805,20202864,7回裏,鈴木 誠也,ディプラン,,False,False,False,False,False,0,0,0,0,1,7,0.269122,0.184136,0.036827,0.005666,0.042493,0.325000,0.150000,0.025000,...,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.947061
54206,33806,20202806,8回裏,周東 佑京,田村 伊知郎,,False,False,False,False,False,0,3,1,1,5,8,0.190981,0.119363,0.037135,0.005305,0.029178,0.000000,0.000000,0.000000,...,1.000000,0.500000,0.000000,0.000000,1.000000,0.500000,0.500000,0.000000,0.000000,1.000000,1.000000,0.500000,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.500000,0.0,0.000000,0.985848


# データ保存

In [81]:
all_FTR_df.to_pickle(data_path / ("%s_all_FTR_prob_df.pkl" % prefix))