In [1]:
import pandas as pd
import scipy as sc
import numpy as np
import sklearn
import pickle
import pathlib as Path
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
sb.set(font='IPAexGothic')

import multiprocessing
import itertools
import collections
import datetime
import gc

from tqdm._tqdm_notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Render up to 50 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 50

# NOTE: in this notebook `Path` is the pathlib *module* (`import pathlib as Path`),
# which is why the class is spelled `Path.Path` here.
data_path, result_path = (Path.Path(rel_dir) for rel_dir in ("../data", "../result"))

# Prefix used when naming the files this notebook writes.
prefix = 'ana201'

# データ抽出
> ・軸データ：データ加工済み（ana005_all_df.pkl）<br>
> ・特徴量 <br>
> ：基礎情報 <br>
> ：打者打率 <br>
> ：投手被打率 <br>

In [2]:
# Backbone ("axis") dataset: the pre-processed table the features are attached to.
all_df = pd.read_pickle(data_path.joinpath("ana005_all_df.pkl"))
all_df

Unnamed: 0,id,B,O,S,b1,b2,b3,ballPositionLabel,ballX,ballY,batter,batterHand,battingType,dir,dist,flg_train,gameID,inning,isOuts,pitchType,pitcher,pitcherHand,speed,totalPitchingCount,y,topTeam,bottomTeam,batterTeam,pitcherTeam,is_hit0,is_hit1,is_hit2,is_hit3,is_hit4
0,0,0,0,0,False,False,False,内角低め,17.0,J,ピレラ,R,,,,1,20202173,1回表,,ストレート,今永 昇太,L,149km/h,1,0.0,広島,DeNA,広島,DeNA,False,False,False,False,False
1,1,1,0,0,False,False,False,内角低め,14.0,I,ピレラ,R,,,,1,20202173,1回表,,ストレート,今永 昇太,L,149km/h,2,1.0,広島,DeNA,広島,DeNA,False,False,False,False,False
2,2,1,0,1,False,False,False,外角高め,8.0,D,ピレラ,R,,,,1,20202173,1回表,,チェンジアップ,今永 昇太,L,137km/h,3,0.0,広島,DeNA,広島,DeNA,False,False,False,False,False
3,3,2,0,1,False,False,False,内角中心,21.0,G,ピレラ,R,,,,1,20202173,1回表,,スライダー,今永 昇太,L,138km/h,4,2.0,広島,DeNA,広島,DeNA,False,False,False,False,False
4,4,2,0,2,False,False,False,外角中心,7.0,F,ピレラ,R,G,S,38.3,1,20202173,1回表,False,チェンジアップ,今永 昇太,L,136km/h,5,4.0,広島,DeNA,広島,DeNA,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54203,33803,0,0,1,False,False,False,,,,坂口 智隆,L,,,,0,20202023,5回表,,,床田 寛樹,L,,2,,ヤクルト,広島,ヤクルト,広島,False,False,False,False,False
54204,33804,0,0,0,False,False,False,,,,メヒア,R,,,,0,20202640,9回表,,,堀岡 隼人,R,,1,,広島,巨人,広島,巨人,False,False,False,False,False
54205,33805,0,0,0,True,False,False,,,,鈴木 誠也,R,,,,0,20202864,7回裏,,,ディプラン,R,,1,,巨人,広島,広島,巨人,False,False,False,False,False
54206,33806,3,1,1,False,True,False,,,,周東 佑京,L,,,,0,20202806,8回裏,,,田村 伊知郎,R,,5,,西武,ソフトバンク,ソフトバンク,西武,False,False,False,False,False


In [3]:
# Feature tables.
# They are joined column-wise onto the backbone later, so each table's own
# `id` column would be a duplicate and is dropped on load.

# Basic information
FTR_base = pd.read_pickle(data_path / 'ana100_FTR_baseinf.pkl').drop(columns=['id'])

# Batter batting average
FTR_bt = pd.read_pickle(data_path / 'ana101_FTR_bt_btingavg.pkl').drop(columns=['id'])

# Pitcher batting average against
FTR_pt = pd.read_pickle(data_path / 'ana102_FTR_pt_btingavg.pkl').drop(columns=['id'])

# 　

# 必要な列定義
> ・軸データ <br>
> ：id, 試合ID（gameID）, イニング（inning）, 目的変数（y）, 疑似目的変数たち（is_hit0~4）<br>
> ：バッター、投手 <br>
> ：データ分割フラグ（flg_train）
> <br>
> ※必要になり次第追加予定

In [4]:
# Columns kept from the backbone: keys, batter/pitcher, the target `y`,
# the pseudo-targets is_hit0..is_hit4, and the train/test split flag.
cols = (
    ['id', 'gameID', 'inning', 'batter', 'pitcher', 'y']
    + ['is_hit%d' % hit_idx for hit_idx in range(5)]
    + ['flg_train']
)
cols

['id',
 'gameID',
 'inning',
 'batter',
 'pitcher',
 'y',
 'is_hit0',
 'is_hit1',
 'is_hit2',
 'is_hit3',
 'is_hit4',
 'flg_train']

In [5]:
# Restrict the backbone to the columns listed in `cols`.
base = all_df.loc[:, cols]
base

Unnamed: 0,id,gameID,inning,batter,pitcher,y,is_hit0,is_hit1,is_hit2,is_hit3,is_hit4,flg_train
0,0,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1
1,1,20202173,1回表,ピレラ,今永 昇太,1.0,False,False,False,False,False,1
2,2,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1
3,3,20202173,1回表,ピレラ,今永 昇太,2.0,False,False,False,False,False,1
4,4,20202173,1回表,ピレラ,今永 昇太,4.0,True,True,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...
54203,33803,20202023,5回表,坂口 智隆,床田 寛樹,,False,False,False,False,False,0
54204,33804,20202640,9回表,メヒア,堀岡 隼人,,False,False,False,False,False,0
54205,33805,20202864,7回裏,鈴木 誠也,ディプラン,,False,False,False,False,False,0
54206,33806,20202806,8回裏,周東 佑京,田村 伊知郎,,False,False,False,False,False,0


# 　

# データ結合
> ・特徴量の結合 <br>
> ※インデックス結合

In [6]:
# Column-wise join of the backbone with the three feature tables.
#
# pd.concat(axis=1) aligns rows on the index (outer join), not by position.
# Tables whose rows do not line up are therefore NOT rejected: the result is
# silently padded with NaN rows. The row-count checks below turn that silent
# failure into an immediate error.
feature_tables = [('FTR_base', FTR_base), ('FTR_bt', FTR_bt), ('FTR_pt', FTR_pt)]
for table_name, table in feature_tables:
    assert len(table) == len(base), (
        '%s has %d rows but the backbone has %d'
        % (table_name, len(table), len(base))
    )

all_df_FTR = pd.concat([base] + [table for _, table in feature_tables], axis=1)

# A longer result means the indexes differed and extra rows were introduced.
assert len(all_df_FTR) == len(base), (
    'index mismatch: joined frame has %d rows, backbone has %d'
    % (len(all_df_FTR), len(base))
)

# The count-specific features (FTR_bt_btingavg50..54 / FTR_pt_btingavg50..54)
# are kept; the step that used to drop them is disabled.
all_df_FTR

Unnamed: 0,id,gameID,inning,batter,pitcher,y,is_hit0,is_hit1,is_hit2,is_hit3,is_hit4,flg_train,FTR_cnt_B,FTR_cnt_O,FTR_cnt_S,FTR_pting_cnt,FTR_inning,FTR_bt_btingavg00,FTR_bt_btingavg01,FTR_bt_btingavg02,FTR_bt_btingavg03,FTR_bt_btingavg04,FTR_bt_btingavg10,FTR_bt_btingavg11,FTR_bt_btingavg12,...,FTR_pt_btingavg10,FTR_pt_btingavg11,FTR_pt_btingavg12,FTR_pt_btingavg13,FTR_pt_btingavg14,FTR_pt_btingavg20,FTR_pt_btingavg21,FTR_pt_btingavg22,FTR_pt_btingavg23,FTR_pt_btingavg24,FTR_pt_btingavg30,FTR_pt_btingavg31,FTR_pt_btingavg32,FTR_pt_btingavg33,FTR_pt_btingavg34,FTR_pt_btingavg40,FTR_pt_btingavg41,FTR_pt_btingavg42,FTR_pt_btingavg43,FTR_pt_btingavg44,FTR_pt_btingavg50,FTR_pt_btingavg51,FTR_pt_btingavg52,FTR_pt_btingavg53,FTR_pt_btingavg54
0,0,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1,0,0,0,1,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.538462,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,1,20202173,1回表,ピレラ,今永 昇太,1.0,False,False,False,False,False,1,1,0,0,2,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.538462,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,2,20202173,1回表,ピレラ,今永 昇太,0.0,False,False,False,False,False,1,1,0,1,3,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.538462,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.111111,0.111111,0.000000,0.0,0.000000
3,3,20202173,1回表,ピレラ,今永 昇太,2.0,False,False,False,False,False,1,2,0,1,4,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.538462,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,4,20202173,1回表,ピレラ,今永 昇太,4.0,True,True,False,False,False,1,2,0,2,5,1,0.269122,0.184136,0.036827,0.005666,0.042493,0.333333,0.238095,0.047619,...,0.538462,0.461538,0.076923,0.153846,0.000000,0.538462,0.461538,0.000000,0.076923,0.000000,0.800000,0.600000,0.200000,0.4,0.000000,0.538462,0.384615,0.000000,0.153846,0.000000,0.500000,0.500000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54203,33803,20202023,5回表,坂口 智隆,床田 寛樹,,False,False,False,False,False,0,0,0,1,2,5,0.226006,0.154799,0.043344,0.000000,0.030960,0.250000,0.200000,0.025000,...,0.714286,0.714286,0.142857,0.000000,0.142857,0.666667,0.666667,0.000000,0.000000,0.000000,0.714286,0.714286,0.142857,0.0,0.142857,0.714286,0.571429,0.142857,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
54204,33804,20202640,9回表,メヒア,堀岡 隼人,,False,False,False,False,False,0,0,0,0,1,9,0.269122,0.184136,0.036827,0.005666,0.042493,0.121212,0.090909,0.000000,...,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111
54205,33805,20202864,7回裏,鈴木 誠也,ディプラン,,False,False,False,False,False,0,0,0,0,1,7,0.269122,0.184136,0.036827,0.005666,0.042493,0.325000,0.150000,0.025000,...,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111,0.505051,0.404040,0.121212,0.000000,0.111111,0.505051,0.404040,0.121212,0.0,0.111111
54206,33806,20202806,8回裏,周東 佑京,田村 伊知郎,,False,False,False,False,False,0,3,1,1,5,8,0.190981,0.119363,0.037135,0.005305,0.029178,0.000000,0.000000,0.000000,...,1.000000,1.000000,0.500000,0.000000,0.000000,1.000000,0.500000,0.500000,0.000000,0.000000,1.000000,1.000000,0.500000,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.500000,0.0,0.000000


In [7]:
# Sanity check: number of missing values per column, in column order.
all_df_FTR.isna().sum().tolist()

[0,
 0,
 0,
 5,
 1,
 33808,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

☝：投手（1件）・バッター（5件）の欠損値は想定内（もともと）<br>
　：テストデータの目的変数（y：33808件）はnp.nanでOK

# データの保存

In [8]:
# Persist the joined table as "<prefix>_all_FTR_df.pkl" under the data directory.
output_file = data_path / '{}_all_FTR_df.pkl'.format(prefix)
all_df_FTR.to_pickle(output_file)