## 데이터 전처리

In [21]:
import pandas as pd
import numpy as np

### 1. 크롤링된 데이터 불러오기 및 전처리

#### 1-1. df_midterm (경기결과 + 날씨)

##### 1) 데이터 불러오기

In [22]:
# Load the game-result + weather table and preview its last rows.
df_midterm = pd.read_excel('numerized_merge3.xlsx')
df_midterm.tail()

Unnamed: 0,label,ymd,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도
2142,1.0,20201030,6,18.5,2,0,3,1,12.4,0.0,7.6,42.3
2143,0.0,20201030,4,18.5,5,0,7,1,15.0,0.0,11.9,57.3
2144,0.0,20201030,8,18.5,4,0,6,1,11.7,0.0,6.8,70.1
2145,1.0,20201030,9,18.5,1,0,8,1,10.9,0.0,3.6,68.1
2146,1.0,20201031,5,17.0,4,0,5,1,13.6,0.0,2.5,59.1


##### 2) 데이터 전처리

In [23]:
# Keep only the games in which KIA (team code 5) takes part.
not_kia = (df_midterm['team0'] != 5) & (df_midterm['team1'] != 5)
idx_num = df_midterm[not_kia].index
df_midterm = df_midterm.drop(idx_num).reset_index(drop=True)  # renumber rows from 0

print(f'Length of dataframe KIA: {len(df_midterm)}')
print(f'Removed data length : {len(idx_num)}')

Length of dataframe KIA: 432
Removed data length : 1715


In [24]:
# Rearrange every game so that KIA (team code 5) is team0 and the opponent
# is team1.

# Games where KIA is already team0 need no change.
df_midterm0 = df_midterm[df_midterm['team0'] == 5].copy()
# Games where KIA is team1 have to be mirrored.
# .copy() avoids SettingWithCopyWarning on the in-place edits below.
df_midterm1 = df_midterm[df_midterm['team1'] == 5].copy()

# Mirror the 0/1 columns (result label and both home/away flags):
# add 1, then wrap the resulting 2 back to 0, i.e. 0 -> 1 and 1 -> 0.
# NOTE(review): presumably 'label' is the result seen from team0's side,
# which is why it flips together with the teams -- confirm.
for col in ('label', 'H/A0', 'H/A1'):
    df_midterm1.loc[:, col] += 1
    df_midterm1.loc[df_midterm1[col] == 2, col] -= 2

# Swap the team codes: the old team0 becomes the opponent, KIA becomes team0.
df_midterm1.loc[:, 'team1'] = df_midterm1.loc[:, 'team0']
df_midterm1.loc[:, 'team0'] = 5

# Recombine both halves. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; the original row labels are kept so that a later
# sort_index() restores the original game order.
df_midterm = pd.concat([df_midterm0, df_midterm1])

df_midterm1.head()

Unnamed: 0,label,ymd,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도
0,1.0,20180324,5,14.0,5,1,1,0,9.8,0.0,4.0,65.0
1,0.0,20180325,5,14.0,5,1,1,0,12.0,0.0,5.0,75.1
5,0.0,20180330,1,18.5,5,0,2,1,14.8,0.0,6.1,55.6
6,1.0,20180331,1,17.0,5,0,2,1,14.0,0.0,6.8,52.9
7,1.0,20180401,1,14.0,5,0,2,1,15.6,0.0,5.4,68.8


In [25]:
# Rename 'ymd' to 'date' and derive a 'year' column from it.
df_midterm = df_midterm.rename(columns={'ymd': 'date'})

# 'date' holds YYYYMMDD integers; parse them and keep only the year.
game_dates = pd.to_datetime(df_midterm['date'].astype(str), format='%Y%m%d')
df_midterm['year'] = game_dates.dt.year

# Put the rows back into ascending index order.
df_midterm = df_midterm.sort_index()

df_midterm.head()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도,year
0,1.0,20180324,5,14.0,5,1,1,0,9.8,0.0,4.0,65.0,2018
1,0.0,20180325,5,14.0,5,1,1,0,12.0,0.0,5.0,75.1,2018
2,0.0,20180327,5,18.5,5,1,6,0,14.5,0.0,5.0,63.1,2018
3,1.0,20180328,5,18.5,5,1,6,0,15.6,0.0,6.5,70.1,2018
4,0.0,20180329,5,18.5,5,1,6,0,17.0,0.0,4.3,71.8,2018


In [26]:
# Tidy up: 'H/A1' is the mirror image of 'H/A0' (when H/A0 is 0, H/A1 is 1),
# so it adds no information and is dropped.
df_midterm = df_midterm.drop(columns=['H/A1'])

df_midterm.head()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,상대습도,year
0,1.0,20180324,5,14.0,5,1,1,9.8,0.0,4.0,65.0,2018
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,75.1,2018
2,0.0,20180327,5,18.5,5,1,6,14.5,0.0,5.0,63.1,2018
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,70.1,2018
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,71.8,2018


#### 1-2. df_entry (선발라인업)

In [27]:
# Load the starting-lineup table and preview its last rows.
df_entry = pd.read_excel('player_entry.xlsx')
df_entry.tail()

Unnamed: 0.1,Unnamed: 0,team,date,player1,position1,type1,player2,position2,type2,player3,...,type8,player9,position9,type9,pitcher,position,type,result,opponent_p,type.1
4459,4459,18 kt,2018-03-29,오태곤,LF,R,강백호,DH,L,로하스,...,R,정현,SS,R,류희운,P,우투,kt 7:1 SK,문승원,우투
4460,4460,18 kt,2018-03-28,이진영,DH,L,강백호,LF,L,로하스,...,R,정현,SS,R,금민철,P,좌투,kt 8:5 SK,박종훈,우언
4461,4461,18 kt,2018-03-27,심우준,SS,R,박경수,2B,R,로하스,...,L,장성우,C,R,고영표,P,우언,kt 5:8 SK,산체스,우투
4462,4462,18 kt,2018-03-25,정현,SS,R,오정복,DH,R,로하스,...,L,장성우,C,R,주권,P,우투,kt 1:14 KIA,양현종,좌투
4463,4463,18 kt,2018-03-24,심우준,SS,R,이진영,DH,L,로하스,...,L,장성우,C,R,피어밴드,P,좌투,kt 5:4 KIA,헥터,우투


##### 2) 데이터 전처리

In [28]:
# Drop the first column (the saved index, 'Unnamed: 0') and the last three
# columns (result, opposing pitcher and that pitcher's type) by position.
kept_columns = df_entry.columns[1:-3]
df_entry = df_entry[kept_columns]

In [29]:
# Remove dummy rows and non-regular-season games.
# Such rows are recognisable because the pitcher..type columns all hold the
# same value, so 'pitcher' equals 'position'.
is_dummy = df_entry['pitcher'] == df_entry['position']
idx_num = df_entry[is_dummy].index
df_entry = df_entry.drop(idx_num)

print(f'Length after removing dummy : {len(df_entry)}')
print(f'Removed data length : {len(idx_num)}')

Length after removing dummy : 4323
Removed data length : 141


In [30]:
# Drop columns that are not needed.
# The pitcher's 'position' is always 'P', so it carries no information.
df_entry = df_entry.drop(columns='position')

# The fielding positions of the nine batters: position1 .. position9.
pp = ['position' + str(slot) for slot in range(1, 10)]
df_entry = df_entry.drop(columns=pp)

df_entry.head()

Unnamed: 0,team,date,player1,type1,player2,type2,player3,type3,player4,type4,...,player6,type6,player7,type7,player8,type8,player9,type9,pitcher,type
0,20 KIA,2020-10-31,최정민,L,이진영,R,황대인,R,이우성,R,...,최정용,L,백용환,R,김규성,L,김호령,R,임기영,우언
1,20 KIA,2020-10-30,최원준,L,김선빈,R,터커,L,최형우,L,...,김민식,L,김태진,L,유민상,L,박찬호,R,가뇽,우투
2,20 KIA,2020-10-29,최원준,L,김선빈,R,터커,L,최형우,L,...,황대인,R,한승택,R,김태진,L,박찬호,R,양현종,좌투
3,20 KIA,2020-10-28,최원준,L,김선빈,R,터커,L,최형우,L,...,김민식,L,김태진,L,유민상,L,박찬호,R,김현수,우투
4,20 KIA,2020-10-27,최원준,L,김선빈,R,터커,L,최형우,L,...,김태진,L,김민식,L,유민상,L,박찬호,R,장현식,우투


In [31]:
# Collapse type1..type9 into a single 'LR ratio' column: the share of the
# nine lineup slots whose type is 'L'.
tp = ['type' + str(slot) for slot in range(1, 10)]

# Count the 'L' entries per row, then divide by the nine lineup slots.
left_count = (df_entry[tp] == 'L').sum(axis=1)
df_entry['LR ratio'] = left_count.div(9)

# The individual type columns are no longer needed.
df_entry = df_entry.drop(columns=tp)

df_entry.head()

Unnamed: 0,team,date,player1,player2,player3,player4,player5,player6,player7,player8,player9,pitcher,type,LR ratio
0,20 KIA,2020-10-31,최정민,이진영,황대인,이우성,한승택,최정용,백용환,김규성,김호령,임기영,우언,0.333333
1,20 KIA,2020-10-30,최원준,김선빈,터커,최형우,나지완,김민식,김태진,유민상,박찬호,가뇽,우투,0.666667
2,20 KIA,2020-10-29,최원준,김선빈,터커,최형우,나지완,황대인,한승택,김태진,박찬호,양현종,좌투,0.444444
3,20 KIA,2020-10-28,최원준,김선빈,터커,최형우,나지완,김민식,김태진,유민상,박찬호,김현수,우투,0.666667
4,20 KIA,2020-10-27,최원준,김선빈,터커,최형우,나지완,김태진,김민식,유민상,박찬호,장현식,우투,0.666667


In [32]:
# Convert 'YYYY-MM-DD' strings to YYYYMMDD integers, the format used by
# df_midterm['date'].
compact_dates = df_entry['date'].str.replace("-", "")
df_entry['date'] = compact_dates.astype(int)

In [33]:
# Convert the 'YY TEAM' labels in df_entry['team'] to integer team codes.

# Distinct labels present in df_entry, e.g. '20 KIA', '18 넥센', '18 kt'.
team_list = df_entry['team'].unique().tolist()
print('team_list : ' + str(team_list) + '   , len(' + str(len(team_list)) + ')')

# Team codes 1-10: KT, LG, SK, NC, KIA, 삼성, 롯데, 한화, 두산, 키움.
# '넥센' shares code 10 with '키움'.
team_code_by_name = {'KT': 1, 'LG': 2, 'SK': 3, 'NC': 4, 'KIA': 5, '삼성': 6,
                     '롯데': 7, '한화': 8, '두산': 9, '키움': 10, '넥센': 10}

# dict_team maps every raw label to its team code. The code is looked up from
# the team name in the label (last token, case-insensitive so that 'kt' and
# 'KT' agree) instead of from the label's position in team_list, so the result
# no longer depends on row order or on the number of seasons per team.
dict_team = {}
for label in team_list:
    team_name = str(label).split()[-1].upper()
    if team_name not in team_code_by_name:
        raise ValueError('Unknown team label in df_entry: ' + repr(label))
    dict_team[label] = team_code_by_name[team_name]

# Replace the labels with their codes in one pass (gives an integer column).
df_entry['team'] = df_entry['team'].map(dict_team)

team_list : ['20 KIA', '19 KIA', '18 KIA', '20 삼성', '19 삼성', '18 삼성', '20 롯데', '19 롯데', '18 롯데', '20 두산', '19 두산', '18 두산', '20 SK', '19 SK', '18 SK', '20 LG', '19 LG', '18 LG', '20 한화', '19 한화', '18 한화', '20 NC', '19 NC', '18 NC', '20 키움', '19 키움', '18 넥센', '20 KT', '19 KT', '18 kt']   , len(30)


In [34]:
# Split the lineup table into KIA rows and opponent rows.
kia_rows = df_entry['team'] == 5

# df_entry_team: KIA lineups, keyed by 'team0'.
df_entry_team = df_entry[kia_rows].copy().rename(columns={'team': 'team0'})

# df_entry_opponent: lineups of every other team, keyed by 'team1', with all
# lineup columns prefixed by 'opponent '.
df_entry_opponent = df_entry[~kia_rows].copy()
entry_opponent_rename_dict = {
    'player' + str(slot): 'opponent player' + str(slot) for slot in range(1, 10)
}
entry_opponent_rename_dict.update({
    'pitcher': 'opponent pitcher',
    'type': 'opponent type',
    'LR ratio': 'opponent LR ratio',
})
df_entry_opponent = df_entry_opponent.rename(
    columns={**entry_opponent_rename_dict, 'team': 'team1'}
)

df_entry_opponent.tail()

Unnamed: 0,team1,date,opponent player1,opponent player2,opponent player3,opponent player4,opponent player5,opponent player6,opponent player7,opponent player8,opponent player9,opponent pitcher,opponent type,opponent LR ratio
4459,1,20180329,오태곤,강백호,로하스,황재균,유한준,윤석민,박경수,장성우,정현,류희운,우투,0.111111
4460,1,20180328,이진영,강백호,로하스,황재균,유한준,윤석민,박경수,장성우,정현,금민철,좌투,0.222222
4461,1,20180327,심우준,박경수,로하스,윤석민,황재균,유한준,오태곤,강백호,장성우,고영표,우언,0.111111
4462,1,20180325,정현,오정복,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,주권,우투,0.111111
4463,1,20180324,심우준,이진영,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,피어밴드,좌투,0.222222


In [35]:
# Preview the last rows of the cleaned lineup table.
df_entry.tail()

Unnamed: 0,team,date,player1,player2,player3,player4,player5,player6,player7,player8,player9,pitcher,type,LR ratio
4459,1,20180329,오태곤,강백호,로하스,황재균,유한준,윤석민,박경수,장성우,정현,류희운,우투,0.111111
4460,1,20180328,이진영,강백호,로하스,황재균,유한준,윤석민,박경수,장성우,정현,금민철,좌투,0.222222
4461,1,20180327,심우준,박경수,로하스,윤석민,황재균,유한준,오태곤,강백호,장성우,고영표,우언,0.111111
4462,1,20180325,정현,오정복,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,주권,우투,0.111111
4463,1,20180324,심우준,이진영,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,피어밴드,좌투,0.222222


#### 1-3. df_pitcher (투수 데이터)

##### 1) 데이터 불러오기

In [36]:
# Load the pitcher records split by batter type ('구분': 좌타자 / 우타자)
# and preview the first rows.
df_pitcher_situ = pd.read_excel('pitchers_situation.xlsx')
df_pitcher_situ.head()

Unnamed: 0,팀명,이름,구분,H,2B,3B,HR,BB,HBP,SO,WP,BK,AVG
0,두산,배창현,좌타자,-,-,-,-,-,-,-,-,-,-
1,두산,배창현,우타자,0,0,0,0,2,0,0,0,0,0.000
2,두산,이동원,좌타자,0,0,0,0,1,0,0,0,0,-
3,두산,이동원,우타자,0,0,0,0,1,0,0,0,0,-
4,두산,알칸타라,좌타자,74,18,1,4,16,2,86,1,1,0.226


In [37]:
# Load the per-year ('연도') pitcher totals and preview the first rows.
df_pitcher_tot = pd.read_excel('pitchers_total.xlsx')
df_pitcher_tot.head()

Unnamed: 0,팀명,이름,연도,ERA,G,CG,SHO,W,L,SV,...,WPCT,TBF,IP,H,HR,BB,HBP,SO,R,ER
0,두산,배창현,2018,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,두산,배창현,2019,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,두산,배창현,2020,0.00,1,0,0,0,0,0,...,-,3,1/3,0,0,2,0,0,0,0
3,두산,이동원,2018,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,두산,이동원,2019,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


##### 2) 데이터 전처리

In [38]:
# Team name -> team code (as a string). '넥센' and '키움' share code '10'.
dict_team_num = {
    'KT': '1', 'LG': '2', 'SK': '3', 'NC': '4', 'KIA': '5', '삼성': '6',
    '롯데': '7', '한화': '8', '두산': '9', '넥센': '10', '키움': '10',
    '드림': '11', '나눔': '12',
}

# 1. Replace the team names in both pitcher tables with their codes.
df_pitcher_situ['팀명'] = df_pitcher_situ['팀명'].replace(dict_team_num)
df_pitcher_tot['팀명'] = df_pitcher_tot['팀명'].replace(dict_team_num)

In [39]:
# Build a temporary '<team code><name>' key (e.g. '김현수' -> '2김현수') because
# pitchers on different teams may share the same name.
df_pitcher_situ['임시'] = df_pitcher_situ['팀명'] + df_pitcher_situ['이름']
df_pitcher_tot['임시'] = df_pitcher_tot['팀명'] + df_pitcher_tot['이름']

# Report the row counts and the number of NaN values in the column at
# position 3 (the first stat column in the previews: 'H' / 'ERA').
# .iloc makes the positional lookup explicit; integer keys on a
# label-indexed Series (`series[3]`) are deprecated in recent pandas.
print('타자유형별 기록 :', len(df_pitcher_situ), ',  Nan :', df_pitcher_situ.iloc[:, 3].isnull().sum())
print('통산 기록 :', len(df_pitcher_tot), ',  Nan :', df_pitcher_tot.iloc[:, 3].isnull().sum())
# Number of distinct pitcher names.
print(len(df_pitcher_situ['이름'].unique().tolist()))

타자유형별 기록 : 928 ,  Nan : 0
통산 기록 : 1392 ,  Nan : 0
418


In [40]:
# 2. Keep only the pitchers that appear as starters in df_entry.
# Build the de-duplicated '<team code><name>' keys (e.g. '2김현수') of every
# starter listed in df_entry.
starter_keys = [
    str(team) + name for team, name in zip(df_entry['team'], df_entry['pitcher'])
]
temp_list = list(np.unique(np.array(starter_keys)))

# Keep only the rows of each pitcher table whose temporary key is in temp_list.
situ_is_starter = df_pitcher_situ['임시'].isin(temp_list)
tot_is_starter = df_pitcher_tot['임시'].isin(temp_list)
df_pitcher_situ = df_pitcher_situ[situ_is_starter]
df_pitcher_tot = df_pitcher_tot[tot_is_starter]

In [41]:
# Report the row counts after the starter filter, plus the NaN count in the
# column at position 3 (the first stat column in the previews: 'H' / 'ERA').
# The former bare `df_pitcher_situ.tail()` was dropped: it was not the cell's
# last expression, so its result was never shown.
# .iloc makes the positional lookup explicit; integer keys on a
# label-indexed Series (`series[3]`) are deprecated in recent pandas.
print('타자유형별 기록 :', len(df_pitcher_situ), ',  Nan :', df_pitcher_situ.iloc[:, 3].isnull().sum())
print('통산 기록 :', len(df_pitcher_tot), ',  Nan :', df_pitcher_tot.iloc[:, 3].isnull().sum())
# Number of distinct pitcher names.
print(len(df_pitcher_situ['이름'].unique().tolist()))

타자유형별 기록 : 444 ,  Nan : 0
통산 기록 : 666 ,  Nan : 0
211


In [42]:
# Print every starter name from df_entry that has no row in df_pitcher_situ.
list1 = df_entry['pitcher'].unique().tolist()
list2 = df_pitcher_situ['이름'].unique().tolist()
missing_names = [name for name in list1 if name not in list2]
for name in missing_names:
    print(name)

김정훈


In [43]:
# 3. Turn the '-' placeholders into NaN in both pitcher tables.
df_pitcher_situ = df_pitcher_situ.replace(to_replace='-', value=np.nan)
df_pitcher_tot = df_pitcher_tot.replace(to_replace='-', value=np.nan)

In [44]:
# 4. Drop the temporary '<team code><name>' key column.
df_pitcher_situ = df_pitcher_situ.drop(columns='임시')
df_pitcher_tot = df_pitcher_tot.drop(columns='임시')

# Report the row counts and the NaN count in the column at position 3
# (the first stat column in the previews: 'H' / 'ERA').
# .iloc makes the positional lookup explicit; integer keys on a
# label-indexed Series (`series[3]`) are deprecated in recent pandas.
print('타자유형별 기록 :', len(df_pitcher_situ), ',  Nan :', df_pitcher_situ.iloc[:, 3].isnull().sum())
print('통산 기록 :', len(df_pitcher_tot), ',  Nan :', df_pitcher_tot.iloc[:, 3].isnull().sum())
# Number of distinct pitcher names.
print(len(df_pitcher_situ['이름'].unique().tolist()))

타자유형별 기록 : 444 ,  Nan : 130
통산 기록 : 666 ,  Nan : 276
211


In [45]:
# 5. Reshape the situation table from two rows per pitcher (one per batter
# side) into one row per pitcher:
#   팀명 이름 구분 H..AVG  ->  팀명 이름 H_x..AVG_x H_y..AVG_y
stat_columns = list(df_pitcher_situ.columns[3:])
left_data = np.array([name + '_x' for name in stat_columns], dtype=object)
right_data = np.array([name + '_y' for name in stat_columns], dtype=object)
# left_data = ['H_x', '2B_x', '3B_x', 'HR_x', 'BB_x', 'HBP_x', 'SO_x', 'WP_x', 'BK_x', 'AVG_x']
# right_data = ['H_y', '2B_y', '3B_y', 'HR_y', 'BB_y', 'HBP_y', 'SO_y', 'WP_y', 'BK_y', 'AVG_y']

key_columns = ['팀명', '이름']

# Rows against left-handed batters ('좌타자'), stat columns suffixed with _x.
vs_left = df_pitcher_situ['구분'] == '좌타자'
df_pitcher_left = df_pitcher_situ[vs_left].drop('구분', axis='columns')
df_pitcher_left.columns = [*key_columns, *left_data]

# Rows against right-handed batters ('우타자'), stat columns suffixed with _y.
vs_right = df_pitcher_situ['구분'] == '우타자'
df_pitcher_right = df_pitcher_situ[vs_right].drop('구분', axis='columns')
df_pitcher_right.columns = [*key_columns, *right_data]

# One row per (team, name), then attach the left and right stats side by side.
df_pitcher_situ = df_pitcher_situ[key_columns].drop_duplicates(key_columns)
df_pitcher_situ = df_pitcher_situ.merge(df_pitcher_left, on=key_columns, how='left')
df_pitcher_situ = df_pitcher_situ.merge(df_pitcher_right, on=key_columns, how='left')

In [46]:
# 6. Split both pitcher tables into KIA pitchers (team code '5') and the rest.
situ_is_kia = df_pitcher_situ['팀명'] == '5'
df_pitcher_situ_5 = df_pitcher_situ[situ_is_kia]
df_pitcher_situ_n = df_pitcher_situ[~situ_is_kia]

tot_is_kia = df_pitcher_tot['팀명'] == '5'
df_pitcher_tot_5 = df_pitcher_tot[tot_is_kia]
df_pitcher_tot_n = df_pitcher_tot[~tot_is_kia]

# Prefix the stat columns of the non-KIA pitchers, e.g. 'H_x' -> 'opponent H_x'.
left_data = np.array(['opponent ' + name for name in left_data], dtype=object)
right_data = np.array(['opponent ' + name for name in right_data], dtype=object)
tot_data = np.array(
    ['opponent ' + name for name in df_pitcher_tot.columns[3:]], dtype=object
)
# left_data = ['opponent H_x', 'opponent 2B_x', ..., 'opponent AVG_x']
# right_data = ['opponent H_y', 'opponent 2B_y', ..., 'opponent AVG_y']
# tot_data = ['opponent ERA', 'opponent G', ..., 'opponent R', 'opponent ER']
df_pitcher_situ_n.columns = ['팀명', '이름', *left_data, *right_data]
df_pitcher_tot_n.columns = ['팀명', '이름', '연도', *tot_data]


In [47]:
# 7. Rename the key columns to the names used in the merged game table and
# cast the team code from string to float so that it is numeric like the
# team columns of df_midterm.
df_pitcher_situ_5 = (
    df_pitcher_situ_5
    .rename(columns={'팀명': 'team0', '이름': 'pitcher'})
    .astype({'team0': float})
)
df_pitcher_tot_5 = (
    df_pitcher_tot_5
    .rename(columns={'팀명': 'team0', '이름': 'pitcher', '연도': 'year'})
    .astype({'team0': float})
)
df_pitcher_situ_n = (
    df_pitcher_situ_n
    .rename(columns={'팀명': 'team1', '이름': 'opponent pitcher'})
    .astype({'team1': float})
)
df_pitcher_tot_n = (
    df_pitcher_tot_n
    .rename(columns={'팀명': 'team1', '이름': 'opponent pitcher', '연도': 'year'})
    .astype({'team1': float})
)

#### 1-4. df_hitter (타자 데이터)

##### 1) 데이터 불러오기

##### 2) 데이터 전처리

### 2. 데이터 병합

#### 2-1. df_midterm + df_entry + df_pitcher
- df_midterm + df_entry_team + df_pitcher_5 + df_entry_opponent + df_pitcher_n

In [48]:
# Attach KIA's starting lineup to each game, matching on date and team0.
# NOTE(review): (date, team) is not unique if a team plays twice on the same
# date. The final merged frame's index reaches 459 although df_midterm has
# 432 rows, which suggests such games match many-to-many somewhere in the
# merge chain -- confirm and add a per-game key if so.
df = df_midterm.merge(df_entry_team, how="left", on=["date", "team0"])

In [49]:
# Preview the first rows after attaching the KIA lineup.
df.head()
# Optional dump of the intermediate frame for inspection:
#df.to_excel('tempfile.xlsx')

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,player3,player4,player5,player6,player7,player8,player9,pitcher,type,LR ratio
0,1.0,20180324,5,14.0,5,1,1,9.8,0.0,4.0,...,버나디나,최형우,나지완,안치홍,이범호,김민식,김선빈,헥터,우투,0.444444
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,...,김주찬,최형우,나지완,안치홍,이범호,김민식,김선빈,양현종,좌투,0.444444
2,0.0,20180327,5,18.5,5,1,6,14.5,0.0,5.0,...,김주찬,최형우,나지완,안치홍,이범호,김민식,김선빈,팻딘,좌투,0.444444
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,...,안치홍,최형우,나지완,김선빈,이범호,최원준,김민식,이민우,우투,0.555556
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,...,버나디나,최형우,나지완,안치홍,이범호,백용환,김선빈,정용운,좌투,0.333333


In [50]:
# Attach the KIA starting pitcher's records
# (df_pitcher_situ_5, then df_pitcher_tot_5).
# By-batter-side records, matched on team and pitcher name.
df = df.merge(df_pitcher_situ_5, how='left', on=['team0', 'pitcher'])
# Per-year totals, matched additionally on the season year.
df = df.merge(df_pitcher_tot_5, how='left', on=['team0', 'pitcher', 'year'])

In [51]:
# Preview the first rows after attaching the KIA pitcher records.
df.head()
# Optional dump of the intermediate frame for inspection:
#df.to_excel('tempfile1.xlsx')

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,WPCT,TBF,IP,H,HR,BB,HBP,SO,R,ER
0,1.0,20180324,5,14.0,5,1,1,9.8,0.0,4.0,...,,,,,,,,,,
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,...,0.542,772.0,184 1/3,199.0,21.0,43.0,2.0,152.0,88.0,85.0
2,0.0,20180327,5,18.5,5,1,6,14.5,0.0,5.0,...,,,,,,,,,,
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,...,0.4,188.0,37 2/3,52.0,4.0,23.0,5.0,28.0,34.0,30.0
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,...,1.0,45.0,10,10.0,1.0,8.0,0.0,5.0,7.0,7.0


In [52]:
# Attach the opponent's starting lineup, matching on date and team1.
df = df.merge(df_entry_opponent, how="left", on=["date", "team1"])

In [53]:
# Preview the first rows after attaching the opponent lineup.
df.head()
# Optional dump of the intermediate frame for inspection:
#df.to_excel('tempfile2.xlsx')

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,opponent player3,opponent player4,opponent player5,opponent player6,opponent player7,opponent player8,opponent player9,opponent pitcher,opponent type,opponent LR ratio
0,1.0,20180324,5,14.0,5,1,1,9.8,0.0,4.0,...,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,피어밴드,좌투,0.222222
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,...,로하스,윤석민,황재균,유한준,박경수,강백호,장성우,주권,우투,0.111111
2,0.0,20180327,5,18.5,5,1,6,14.5,0.0,5.0,...,구자욱,러프,강민호,이원석,조동찬,강한울,김헌곤,보니야,우투,0.222222
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,...,구자욱,러프,강민호,이원석,박한이,김헌곤,강한울,양창섭,우투,0.444444
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,...,구자욱,러프,강민호,이원석,박한이,김헌곤,강한울,백정현,좌투,0.444444


In [54]:
# Attach the opponent starting pitcher's records
# (df_pitcher_situ_n, then df_pitcher_tot_n).
# By-batter-side records, matched on team and pitcher name.
df = df.merge(df_pitcher_situ_n, how='left', on=['team1', 'opponent pitcher'])
# Per-year totals, matched additionally on the season year.
df = df.merge(df_pitcher_tot_n, how='left', on=['team1', 'opponent pitcher', 'year'])

In [55]:
# Preview the first rows after attaching the opponent pitcher records.
df.head()
# Optional dump of the intermediate frame for inspection:
#df.to_excel('tempfile3.xlsx')

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,opponent WPCT,opponent TBF,opponent IP,opponent H,opponent HR,opponent BB,opponent HBP,opponent SO,opponent R,opponent ER
0,1.0,20180324,5,14.0,5,1,1,9.8,0.0,4.0,...,,,,,,,,,,
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,...,0.25,407.0,88,118.0,16.0,27.0,7.0,54.0,85.0,82.0
2,0.0,20180327,5,18.5,5,1,6,14.5,0.0,5.0,...,,,,,,,,,,
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,...,0.538,400.0,87 1/3,100.0,12.0,34.0,10.0,49.0,52.0,49.0
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,...,0.5,540.0,125 2/3,142.0,18.0,36.0,3.0,99.0,68.0,64.0


In [56]:
# Remove every row that contains at least one missing value
# (df.isnull().sum() shows the per-column counts beforehand).
df = df.dropna()

In [57]:
# Preview the last rows after removing rows with missing values.
df.tail()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,opponent WPCT,opponent TBF,opponent IP,opponent H,opponent HR,opponent BB,opponent HBP,opponent SO,opponent R,opponent ER
455,1.0,20201027,5,18.5,5,1,1,14.6,0.0,1.8,...,0.652,906,207 2/3,233,18,68,8,152,105,100
456,0.0,20201028,5,18.5,5,1,1,15.1,0.0,4.0,...,0.556,668,158,152,16,46,9,110,80,72
457,1.0,20201029,5,18.5,5,1,9,12.2,0.0,5.0,...,0.476,622,136 1/3,190,14,39,3,56,91,76
458,0.0,20201030,4,18.5,5,0,7,15.0,0.0,11.9,...,0.429,590,130,167,14,36,6,87,87,78
459,0.0,20201031,5,17.0,5,1,4,13.6,0.0,2.5,...,0.4,196,42,57,10,13,3,26,27,27


#### 2-2. df_hitter

#### 2-3. 확인

In [58]:
# List the column names of the merged frame and how many there are.
attr = list(df.columns)
print('Attributes : ', attr)
print('\nAttributes Length : ', len(attr))

Attributes :  ['label', 'date', 'field', 'time', 'team0', 'H/A0', 'team1', '평균기온', '강수량', '평균풍속', '상대습도', 'year', 'player1', 'player2', 'player3', 'player4', 'player5', 'player6', 'player7', 'player8', 'player9', 'pitcher', 'type', 'LR ratio', 'H_x', '2B_x', '3B_x', 'HR_x', 'BB_x', 'HBP_x', 'SO_x', 'WP_x', 'BK_x', 'AVG_x', 'H_y', '2B_y', '3B_y', 'HR_y', 'BB_y', 'HBP_y', 'SO_y', 'WP_y', 'BK_y', 'AVG_y', 'ERA', 'G', 'CG', 'SHO', 'W', 'L', 'SV', 'HLD', 'WPCT', 'TBF', 'IP', 'H', 'HR', 'BB', 'HBP', 'SO', 'R', 'ER', 'opponent player1', 'opponent player2', 'opponent player3', 'opponent player4', 'opponent player5', 'opponent player6', 'opponent player7', 'opponent player8', 'opponent player9', 'opponent pitcher', 'opponent type', 'opponent LR ratio', 'opponent H_x', 'opponent 2B_x', 'opponent 3B_x', 'opponent HR_x', 'opponent BB_x', 'opponent HBP_x', 'opponent SO_x', 'opponent WP_x', 'opponent BK_x', 'opponent AVG_x', 'opponent H_y', 'opponent 2B_y', 'opponent 3B_y', 'opponent HR_y', 'opponen

In [59]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,opponent WPCT,opponent TBF,opponent IP,opponent H,opponent HR,opponent BB,opponent HBP,opponent SO,opponent R,opponent ER
1,0.0,20180325,5,14.0,5,1,1,12.0,0.0,5.0,...,0.25,407,88,118,16,27,7,54,85,82
3,1.0,20180328,5,18.5,5,1,6,15.6,0.0,6.5,...,0.538,400,87 1/3,100,12,34,10,49,52,49
4,0.0,20180329,5,18.5,5,1,6,17.0,0.0,4.3,...,0.5,540,125 2/3,142,18,36,3,99,68,64
6,1.0,20180331,1,17.0,5,0,2,14.0,0.0,6.8,...,0.545,762,170,177,27,79,10,136,119,115
8,0.0,20180403,6,18.5,5,0,3,15.5,0.1,11.2,...,0.636,702,159 1/3,158,16,54,20,133,81,74


### 3. 병합된 데이터 전처리

In [60]:
# 결측치 제거 (None)


In [61]:
# Drop the columns that are no longer needed.

# Player names: player1..player9 and their 'opponent ' counterparts, plus
# both starting pitchers.
player = [
    prefix + 'player' + str(slot)
    for slot in range(1, 10)
    for prefix in ('', 'opponent ')
]
player += ['pitcher', 'opponent pitcher']
df = df.drop(columns=player)

# Remaining key columns that are not needed any more.
df = df.drop(columns=['year', 'date', 'team0'])

### 4. 데이터 저장

In [64]:
# Write the final merged frame to an Excel file.
df.to_excel('merged_database(kia).xlsx')