In [52]:
import pandas as pd
import scipy as sc
import numpy as np
import sklearn
import pickle
import pathlib as Path
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
sb.set(font='IPAexGothic')

import multiprocessing
import itertools
import collections
import datetime
import gc

from tqdm._tqdm_notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show up to 50 columns when rendering a DataFrame.
pd.set_option('display.max_columns', 50)

# NOTE: `Path` is the pathlib *module* (imported above as `import pathlib as Path`),
# so `Path.Path` is the `pathlib.Path` class.
data_path = Path.Path("../data")
result_path = Path.Path("../result")

# Prefix used to build this notebook's output file name.
prefix = 'ana100'

# データ抽出
> ・全体データ（ana005_all_df.pkl）

In [53]:
# Load the full dataset from data_path.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
all_df = pd.read_pickle(data_path / "ana005_all_df.pkl")

In [54]:
all_df

Unnamed: 0,id,B,O,S,b1,b2,b3,ballPositionLabel,ballX,ballY,batter,batterHand,battingType,dir,dist,flg_train,gameID,inning,isOuts,pitchType,pitcher,pitcherHand,speed,totalPitchingCount,y,topTeam,bottomTeam,batterTeam,pitcherTeam,is_hit0,is_hit1,is_hit2,is_hit3,is_hit4
0,0,0,0,0,False,False,False,内角低め,17.0,J,ピレラ,R,,,,1,20202173,1回表,,ストレート,今永 昇太,L,149km/h,1,0.0,広島,DeNA,広島,DeNA,False,False,False,False,False
1,1,1,0,0,False,False,False,内角低め,14.0,I,ピレラ,R,,,,1,20202173,1回表,,ストレート,今永 昇太,L,149km/h,2,1.0,広島,DeNA,広島,DeNA,False,False,False,False,False
2,2,1,0,1,False,False,False,外角高め,8.0,D,ピレラ,R,,,,1,20202173,1回表,,チェンジアップ,今永 昇太,L,137km/h,3,0.0,広島,DeNA,広島,DeNA,False,False,False,False,False
3,3,2,0,1,False,False,False,内角中心,21.0,G,ピレラ,R,,,,1,20202173,1回表,,スライダー,今永 昇太,L,138km/h,4,2.0,広島,DeNA,広島,DeNA,False,False,False,False,False
4,4,2,0,2,False,False,False,外角中心,7.0,F,ピレラ,R,G,S,38.3,1,20202173,1回表,False,チェンジアップ,今永 昇太,L,136km/h,5,4.0,広島,DeNA,広島,DeNA,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54203,33803,0,0,1,False,False,False,,,,坂口 智隆,L,,,,0,20202023,5回表,,,床田 寛樹,L,,2,,ヤクルト,広島,ヤクルト,広島,False,False,False,False,False
54204,33804,0,0,0,False,False,False,,,,メヒア,R,,,,0,20202640,9回表,,,堀岡 隼人,R,,1,,広島,巨人,広島,巨人,False,False,False,False,False
54205,33805,0,0,0,True,False,False,,,,鈴木 誠也,R,,,,0,20202864,7回裏,,,ディプラン,R,,1,,巨人,広島,広島,巨人,False,False,False,False,False
54206,33806,3,1,1,False,True,False,,,,周東 佑京,L,,,,0,20202806,8回裏,,,田村 伊知郎,R,,5,,西武,ソフトバンク,ソフトバンク,西武,False,False,False,False,False


# 必要な列定義
> ・id <br>
> ・ボールカウント <br>
> ・投球数 <br>

In [55]:
# Columns carried forward: row id, ball count (B, O, S),
# pitch count (totalPitchingCount) and the inning label.
cols = ['id', 'B', 'O', 'S', 'totalPitchingCount', 'inning']
cols

['id', 'B', 'O', 'S', 'totalPitchingCount', 'inning']

In [56]:
# Work on an independent copy restricted to the required columns
_all_df = all_df.loc[:, cols].copy()
_all_df

Unnamed: 0,id,B,O,S,totalPitchingCount,inning
0,0,0,0,0,1,1回表
1,1,1,0,0,2,1回表
2,2,1,0,1,3,1回表
3,3,2,0,1,4,1回表
4,4,2,0,2,5,1回表
...,...,...,...,...,...,...
54203,33803,0,0,1,2,5回表
54204,33804,0,0,0,1,9回表
54205,33805,0,0,0,1,7回裏
54206,33806,3,1,1,5,8回裏


# 　

# 特徴量作成
> ・ボールカウント（B O S）⇒名称変更のみ <br>
> ・イニング（inning）⇒数値が必要<br>
> ・投球数（totalPitchingCount）⇒名称変更のみ<br>

In [68]:
# イニングの文字列処理
import re

def get_inning(var):
    """Extract the leading inning number from an inning label.

    Args:
        var: Inning label that starts with the inning number,
            e.g. "1回表" or "12回裏".

    Returns:
        int or None: The leading number as an int, or None when ``var`` is
        not a string (e.g. NaN/None for a missing value) or does not start
        with a digit.
    """
    # Non-string values (such as NaN/None for missing data) would make
    # re.match raise TypeError, so treat them as "no inning".
    if not isinstance(var, str):
        return None

    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    result = re.match(r"(\d+)", var)
    if result is None:
        return None

    return int(result.group(1))

In [69]:
# Parse the numeric inning out of each inning label
_all_df['FTR_inning'] = _all_df['inning'].apply(get_inning)

In [70]:
_all_df

Unnamed: 0,id,B,O,S,totalPitchingCount,inning,FTR_inning
0,0,0,0,0,1,1回表,1
1,1,1,0,0,2,1回表,1
2,2,1,0,1,3,1回表,1
3,3,2,0,1,4,1回表,1
4,4,2,0,2,5,1回表,1
...,...,...,...,...,...,...,...
54203,33803,0,0,1,2,5回表,5
54204,33804,0,0,0,1,9回表,9
54205,33805,0,0,0,1,7回裏,7
54206,33806,3,1,1,5,8回裏,8


In [71]:
_all_df.dtypes

id                     int64
B                      int64
O                      int64
S                      int64
totalPitchingCount     int64
inning                object
FTR_inning             int64
dtype: object

In [72]:
# Sanity check: list the distinct (inning, FTR_inning) pairs to confirm the parsing.
# Both selected columns are used as group keys, so the result carries only the index.
_all_df[['inning', 'FTR_inning']].groupby(['inning', 'FTR_inning']).count()

inning,FTR_inning
1回表,1
1回裏,1
2回表,2
2回裏,2
3回表,3
3回裏,3
4回表,4
4回裏,4
5回表,5
5回裏,5


☝：イニングの処理よし！

# 　

# 名称変更

In [73]:
# Rename the raw columns to FTR_-prefixed feature names, keep only the
# FTR columns, then append id as the last column.
_tmp = (
    _all_df
    .rename(columns=dict(
        # ball count (B / O / S)
        B='FTR_cnt_B',
        O='FTR_cnt_O',
        S='FTR_cnt_S',
        # pitch count
        totalPitchingCount='FTR_pting_cnt',
    ))
    .filter(like='FTR', axis=1)
    .assign(id=_all_df['id'])
)

In [74]:
_tmp

Unnamed: 0,FTR_cnt_B,FTR_cnt_O,FTR_cnt_S,FTR_pting_cnt,FTR_inning,id
0,0,0,0,1,1,0
1,1,0,0,2,1,1
2,1,0,1,3,1,2
3,2,0,1,4,1,3
4,2,0,2,5,1,4
...,...,...,...,...,...,...
54203,0,0,1,2,5,33803
54204,0,0,0,1,9,33804
54205,0,0,0,1,7,33805
54206,3,1,1,5,8,33806


# 　

# データ保存

In [75]:
# Save the feature table (FTR_* columns plus id) as "<prefix>_FTR_baseinf.pkl" under data_path.
_tmp.to_pickle(data_path / ("%s_FTR_baseinf.pkl"  % prefix))