## 데이터 전처리

In [1]:
import pandas as pd
import numpy as np

### 1. 크롤링된 데이터 불러오기 및 전처리

#### 1-1. df_midterm (경기결과 + 날씨)

##### 1) 데이터 불러오기

In [2]:
df_midterm = pd.read_excel('numerized_merge3.xlsx')
df_midterm.tail()

Unnamed: 0,label,ymd,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도
2108,1.0,20201018,1,14.0,2,1,5,0,20.0,0.0,0.0,60.0
2109,0.0,20201018,6,14.0,1,0,3,1,20.0,0.0,0.0,60.0
2110,1.0,20201018,7,14.0,4,1,7,0,20.0,0.0,0.0,60.0
2111,0.0,20201018,2,14.0,9,0,10,1,20.0,0.0,0.0,60.0
2112,0.0,20201018,9,14.0,6,0,8,1,20.0,0.0,0.0,60.0


##### 2) 데이터 전처리

In [3]:
# KIA 이외 경기 제거
idx_num = df_midterm[(df_midterm['team0'] != 5) & (df_midterm['team1'] != 5)].index
df_midterm = df_midterm.drop(idx_num)
df_midterm = df_midterm.reset_index(drop=True) # reorder index

print('Length of dataframe KIA: ' + str(len(df_midterm)))
print('Removed data length : ' + str(len(idx_num)))

Length of dataframe KIA: 422
Removed data length : 1691


In [4]:
# 팀0 에 KIA (5), 팀1 에 상대팀 번호로 데이터 정리

# 팀0가 KIA인 dataframe
df_midterm0 = df_midterm[df_midterm['team0'] == 5].copy()
# 팀1이 KIA인 dataframe
df_midterm1 = df_midterm[df_midterm['team1'] == 5].copy()
# .copy()는 SettingWithCopyWarning을 없애기 위해

df_midterm1.tail()

Unnamed: 0,label,ymd,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도
417,0.0,20201014,7,18.5,4,1,5,0,15.8,0.0,4.7,43.9
418,1.0,20201015,7,18.5,4,1,5,0,15.6,0.0,5.0,38.9
419,0.0,20201016,1,18.5,2,1,5,0,11.8,0.0,5.4,64.3
420,1.0,20201017,1,17.0,2,1,5,0,13.1,0.0,6.5,64.3
421,1.0,20201018,1,14.0,2,1,5,0,20.0,0.0,0.0,60.0


In [5]:
# 이어서

# label 값 변경
df_midterm1.loc[:, 'label'] += 1
df_midterm1.loc[df_midterm1['label'] == 2, 'label'] -= 2

# H/A0, H/A1 값 변경
df_midterm1.loc[:, 'H/A0'] += 1
df_midterm1.loc[df_midterm1['H/A0'] == 2, 'H/A0'] -= 2

df_midterm1.loc[:, 'H/A1'] += 1
df_midterm1.loc[df_midterm1['H/A1'] == 2, 'H/A1'] -= 2

# 팀0, 팀1 변경
df_midterm1.loc[:, 'team1'] = df_midterm1.loc[:, 'team0']
df_midterm1.loc[:, 'team0'] = 5

# df_midterm = df_midterm0 + df_midterm1
df_midterm = df_midterm0.append(df_midterm1)

df_midterm1.tail()

Unnamed: 0,label,ymd,field,time,team0,H/A0,team1,H/A1,평균기온,강수량,평균풍속,상대습도
417,1.0,20201014,7,18.5,5,0,4,1,15.8,0.0,4.7,43.9
418,0.0,20201015,7,18.5,5,0,4,1,15.6,0.0,5.0,38.9
419,1.0,20201016,1,18.5,5,0,2,1,11.8,0.0,5.4,64.3
420,0.0,20201017,1,17.0,5,0,2,1,13.1,0.0,6.5,64.3
421,0.0,20201018,1,14.0,5,0,2,1,20.0,0.0,0.0,60.0


In [6]:
# 데이터 정리
# H/A1 : H/A0와 대칭되는 값 (ex. H/A0가 0이면 H/A1이 1)
df_midterm = df_midterm.drop(['H/A1'], axis = 1)
df_midterm = df_midterm.sort_index() # index 오름차순으로 정렬

df_midterm.rename(columns = {'ymd':'date'}, inplace = True)

df_midterm.tail()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,상대습도
417,1.0,20201014,7,18.5,5,0,4,15.8,0.0,4.7,43.9
418,0.0,20201015,7,18.5,5,0,4,15.6,0.0,5.0,38.9
419,1.0,20201016,1,18.5,5,0,2,11.8,0.0,5.4,64.3
420,0.0,20201017,1,17.0,5,0,2,13.1,0.0,6.5,64.3
421,0.0,20201018,1,14.0,5,0,2,20.0,0.0,0.0,60.0


#### 1-2. df_entry (선발라인업)

#####  1) 데이터 불러오기

In [7]:
df_entry = pd.read_excel('player_entry.xlsx')
df_entry.tail()

Unnamed: 0.1,Unnamed: 0,team,date,player1,position1,type1,player2,position2,type2,player3,...,type8,player9,position9,type9,pitcher,position,type,result,opponent_p,type.1
4459,4459,18 kt,2018-03-29,오태곤,LF,R,강백호,DH,L,로하스,...,R,정현,SS,R,류희운,P,우투,kt 7:1 SK,문승원,우투
4460,4460,18 kt,2018-03-28,이진영,DH,L,강백호,LF,L,로하스,...,R,정현,SS,R,금민철,P,좌투,kt 8:5 SK,박종훈,우언
4461,4461,18 kt,2018-03-27,심우준,SS,R,박경수,2B,R,로하스,...,L,장성우,C,R,고영표,P,우언,kt 5:8 SK,산체스,우투
4462,4462,18 kt,2018-03-25,정현,SS,R,오정복,DH,R,로하스,...,L,장성우,C,R,주권,P,우투,kt 1:14 KIA,양현종,좌투
4463,4463,18 kt,2018-03-24,심우준,SS,R,이진영,DH,L,로하스,...,L,장성우,C,R,피어밴드,P,좌투,kt 5:4 KIA,헥터,우투


##### 2) 데이터 전처리

In [8]:
df_entry = df_entry.iloc[:,1:-3] # opponent pitcher, index(unnamed:0), result 제거

In [9]:
# dummy data 제거 + 정규경기 이외 경기 제거
# 특징 : pitcher~type column 값이 모두 같은 값을 가짐
idx_num = df_entry[df_entry['pitcher'] == df_entry['position']].index
df_entry = df_entry.drop(idx_num)

print('Length after removing dummy : ' + str(len(df_entry)))
print('Removed data length : ' + str(len(idx_num)))

Length after removing dummy : 4323
Removed data length : 141


In [10]:
# pitcher postion 값 제거 (p로 고정)
df_entry = df_entry.drop('position', axis = 1)

In [11]:
# date 값 변경 (df_midterm의 ymd 형태로)
df_entry['date'] = df_entry['date'].str.replace("-","").astype(int)

In [12]:
# team 이름 변경 (int 형태로)

# df_entry의 team_name 확인
team_list = df_entry['team'].unique().tolist()
print('team_list : ' + str(team_list) + '   , len(' + str(len(team_list)) + ')')

# team_name이 key, team_num이 value인 dictionary  dict_team 생성
# 1~10 : KT, LG, SK, NC, KIA, 삼성, 롯데, 한화, 두산, 키움
# team_name = [KIA, 삼성, 롯데, 두산, Sk, LG, 한화, NC, 키움, KT]
team_num = [5, 6, 7, 9, 3, 2, 8, 4, 10, 1] # team_name 과 순서 동일
dict_team = {}
for i in range(10):
    for idx in range(3):
        idx = i * 3 + idx
        dict_team[team_list[idx]] = team_num[i]

# df_entry의 team이름 변환
for i in team_list:
    df_entry.loc[df_entry['team'] == i, 'team'] = dict_team[i]

team_list : ['20 KIA', '19 KIA', '18 KIA', '20 삼성', '19 삼성', '18 삼성', '20 롯데', '19 롯데', '18 롯데', '20 두산', '19 두산', '18 두산', '20 SK', '19 SK', '18 SK', '20 LG', '19 LG', '18 LG', '20 한화', '19 한화', '18 한화', '20 NC', '19 NC', '18 NC', '20 키움', '19 키움', '18 넥센', '20 KT', '19 KT', '18 kt']   , len(30)


In [13]:
# 팀 별로 dataframe 분할
# df_entry_team : kia 경기 entry
df_entry_team = df_entry[df_entry['team'] == 5].copy()
df_entry_team.rename(columns = {'team' : 'team0'}, inplace = True)

# df_entry_opponent : kia 를 제외한 경기 entry
df_entry_opponent = df_entry[df_entry['team'] != 5].copy()
entry_opponent_rename_dict = {}
for i in range(10):
    if i != 9:
        entry_opponent_rename_dict['player' + str(i+1)] = 'opponent player' + str(i+1)
        entry_opponent_rename_dict['type' + str(i+1)] = 'opponent type' + str(i+1)
        entry_opponent_rename_dict['position' + str(i+1)] = 'opponent position' + str(i+1)
    else:
        entry_opponent_rename_dict['pitcher'] = 'opponent pitcher'
        entry_opponent_rename_dict['type'] = 'opponent type'
        
df_entry_opponent.rename(columns = entry_opponent_rename_dict, inplace = True)
df_entry_opponent.rename(columns = {'team' : 'team1'}, inplace = True)

In [14]:
df_entry_opponent.tail() # check df_entry_opponent

Unnamed: 0,team1,date,opponent player1,opponent position1,opponent type1,opponent player2,opponent position2,opponent type2,opponent player3,opponent position3,...,opponent position7,opponent type7,opponent player8,opponent position8,opponent type8,opponent player9,opponent position9,opponent type9,opponent pitcher,opponent type
4459,1,20180329,오태곤,LF,R,강백호,DH,L,로하스,CF,...,2B,R,장성우,C,R,정현,SS,R,류희운,우투
4460,1,20180328,이진영,DH,L,강백호,LF,L,로하스,CF,...,2B,R,장성우,C,R,정현,SS,R,금민철,좌투
4461,1,20180327,심우준,SS,R,박경수,2B,R,로하스,CF,...,LF,R,강백호,DH,L,장성우,C,R,고영표,우언
4462,1,20180325,정현,SS,R,오정복,DH,R,로하스,CF,...,2B,R,강백호,LF,L,장성우,C,R,주권,우투
4463,1,20180324,심우준,SS,R,이진영,DH,L,로하스,CF,...,2B,R,강백호,LF,L,장성우,C,R,피어밴드,좌투


#### 1-3. df_pitcher (투수 데이터)

##### 1) 데이터 불러오기

In [15]:
df_pitcher_situ = pd.read_excel('pitchers_situation.xlsx')
df_pitcher_situ.tail()

Unnamed: 0,팀명,이름,구분,H,2B,3B,HR,BB,HBP,SO,WP,BK,AVG
923,SK,백인식,우타자,2,0,0,1,0,0,0,0,0,0.500
924,SK,최민준,좌타자,-,-,-,-,-,-,-,-,-,-
925,SK,최민준,우타자,-,-,-,-,-,-,-,-,-,-
926,SK,조성훈,좌타자,-,-,-,-,-,-,-,-,-,-
927,SK,조성훈,우타자,-,-,-,-,-,-,-,-,-,-


In [16]:
df_pitcher_tot = pd.read_excel('pitchers_total.xlsx')
df_pitcher_tot.tail()

Unnamed: 0,팀명,이름,연도,ERA,G,CG,SHO,W,L,SV,...,WPCT,TBF,IP,H,HR,BB,HBP,SO,R,ER
1387,SK,최민준,2019,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1388,SK,최민준,2020,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1389,SK,조성훈,2018,27.00,1,0,0,0,0,0,...,-,4,2/3,3,0,0,0,0,2,2
1390,SK,조성훈,2019,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1391,SK,조성훈,2020,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


##### 2) 데이터 전처리

In [24]:
# 선발 투수 제외한 데이터 제거
df_start_pitcher_list = (df_entry_team['pitcher'].unique().tolist()
                         + df_entry_opponent['opponent pitcher'].unique().tolist())



RangeIndex(start=0, stop=928, step=1)


In [18]:
dict_team_num = {'KT':'1', 'LG':'2', 'SK':'3', 'NC':'4', 'KIA':'5', '삼성':'6',
                 '롯데':'7', '한화':'8', '두산':'9', '넥센':'10', '키움':'10', '드림':'11', '나눔':'12'}

# 1. 선발 투수 리스트 #
df_temp = df_entry_opponent[['team1','opponent pitcher']]

for team in dict_team_num.keys():
    df_pitcher_situ['팀명'] = df_pitcher_situ['팀명'].replace(team, dict_team_num[team])
    df_pitcher_tot['팀명'] = df_pitcher_tot['팀명'].replace(team, dict_team_num[team])
###################
    
    
# 2. 선발 투수들에 대한 정보만 따로 추리기 #
df_pitcher_situ['임시'] = df_pitcher_situ['팀명'] + df_pitcher_situ['이름']
df_pitcher_tot['임시'] = df_pitcher_tot['팀명'] + df_pitcher_tot['이름']

temp_list = []
for i,j in zip(df_temp['team1'], df_temp['opponent pitcher']):
    temp_list.append(str(i)+j)
temp_list = np.unique(np.array(temp_list))
    
df_pitcher_situ = df_pitcher_situ[df_pitcher_situ['임시'].isin(list(temp_list))]
df_pitcher_tot = df_pitcher_tot[df_pitcher_tot['임시'].isin(list(temp_list))]
###############################


# 3. replace from '-' to nan #
df_pitcher_situ = df_pitcher_situ.replace('-', np.nan)
df_pitcher_tot = df_pitcher_tot.replace('-', np.nan)
##############################


# # 4. '임시' 컬럼 제거 #
# df_pitcher_situ = df_pitcher_situ.drop('임시', axis=1)
# df_pitcher_tot = df_pitcher_tot.drop('임시', axis=1)
# ###################


df_pitcher_situ.tail()
#df_pitcher_tot.tail()


Unnamed: 0,팀명,이름,구분,H,2B,3B,HR,BB,HBP,SO,WP,BK,AVG,임시
897,3,소사,우타자,,,,,,,,,,,3소사
900,3,이승진,좌타자,27.0,5.0,1.0,1.0,13.0,0.0,28.0,0.0,0.0,0.284,3이승진
901,3,이승진,우타자,28.0,7.0,1.0,2.0,9.0,2.0,26.0,0.0,0.0,0.267,3이승진
912,3,켈리,좌타자,,,,,,,,,,,3켈리
913,3,켈리,우타자,,,,,,,,,,,3켈리


#### 1-4. df_hitter (타자 데이터)

##### 1) 데이터 불러오기

##### 2) 데이터 전처리

### 2. 데이터 병합
 - pd.merge(df1, df2, on = 'key', how = 'right, left, outer, inner') / key : 기준 column

#### 2-1. df_midterm + df_entry

In [19]:
# KIA entry 병합
df = pd.merge(df_midterm, df_entry_team, on = ["date", "team0"], how = "right")

In [20]:
# opponent team entry 병합
df = pd.merge(df, df_entry_opponent, on = ["date", "team1"], how = "right")

In [21]:
# 결측치 제거 (None)
df.isnull().sum() # 결측치 확인
#df = df.dropna(axis = 0, how = 'any')

label                 3468
date                     0
field                 3468
time                  3468
team0                 3468
                      ... 
opponent player9         0
opponent position9       0
opponent type9           0
opponent pitcher         0
opponent type            0
Length: 69, dtype: int64

#### 2-2. df_pitcher

In [22]:
# 좌타, 우타 컬럼 추가 (좌타는 columnName_x, 우타는 columnName_y로 표시)
df = pd.merge(df, df_pitcher_situ.iloc[:,3:-1], how = "outer", left_index = True, right_index = True)
df = pd.merge(df, df_pitcher_situ.iloc[:,3:-1], how = "outer", left_index = True, right_index = True)

# 통산데이터 컬럼 추가
df = pd.merge(df, df_pitcher_tot.iloc[:,3:-1], how = "outer", left_index = True, right_index = True)
df.tail()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,WPCT,TBF,IP,H,HR,BB,HBP,SO,R,ER
3913,,20180331,,,,,1.0,,,,...,,,,,,,,,,
3914,,20180330,,,,,1.0,,,,...,,,,,,,,,,
3915,,20180329,,,,,1.0,,,,...,,,,,,,,,,
3916,,20180328,,,,,1.0,,,,...,,,,,,,,,,
3917,,20180327,,,,,1.0,,,,...,,,,,,,,,,


In [23]:
temp0_list = ['H', '2B', '3B', 'HR', 'BB', 'HBP', 'SO', 'WP', 'BK', 'AVG']
temp1_list = ['H_x', '2B_x', '3B_x', 'HR_x', 'BB_x', 'HBP_x', 'SO_x', 'WP_x', 'BK_x', 'AVG_x']
temp2_list = ['H_y', '2B_y', '3B_y', 'HR_y', 'BB_y', 'HBP_y', 'SO_y', 'WP_y', 'BK_y', 'AVG_y']
temp3_list = ['ERA', 'G', 'CG', 'SHO', 'W', 'L', 'SV', 'HLD', 'WPCT', 'TBF', 'IP', 'H', 'HR', 'BB', 'HBP', 'SO', 'R', 'ER']

temp_array = np.unique(np.array(df_pitcher_situ['임시']))

for idx in range(len(df)):
    row = df.iloc[idx]
    year = int(str(row['date'])[:4])
    team = str(row['team1'])
    name = row['opponent pitcher']
    
    flag = team+name
    if not (flag in temp_array):
        print(flag)
        continue
    
    # 상황별 기록 저장
    new_vals = df_pitcher_situ[(df_pitcher_situ['팀명']==team) & (df_pitcher_situ['이름']==name)].copy()
    # print(df.iloc[idx])
    # print(df.iloc[idx][temp1_list[i]])
    # print('!!!!!!')
    # print(new_vals[new_vals['구분']=='좌타자'][temp0_list[i]].iloc[0])
    # df.loc[idx, temp1_list[i]] = new_vals[new_vals['구분']=='좌타자'][temp0_list[i]].iloc[0]
    # print(df.loc[idx, temp1_list[i]])
    
    for i in range(len(temp0_list)):
        df.loc[idx, temp1_list[i]] = new_vals[new_vals['구분']=='좌타자'][temp0_list[i]].iloc[0]
        df.loc[idx, temp2_list[i]] = new_vals[new_vals['구분']=='우타자'][temp0_list[i]].iloc[0]

    # 통산 기록 저장
    new_vals = df_pitcher_tot[(df_pitcher_tot['팀명']==team) & (df_pitcher_tot['이름']==name)].copy()

    for i in range(len(temp3_list)):
        df.loc[idx, temp3_list[i]] = new_vals[new_vals['연도']==year][temp3_list[i]].iloc[0]


1.0피어밴드
1.0주권
6.0보니야
6.0양창섭
6.0백정현
2.0윌슨
2.0차우찬
2.0김대현
3.0박종훈
3.0문승원
10.0최원태
10.0브리검
10.0신재영
8.0김재영
8.0윤규진
8.0샘슨
7.0윤성빈
2.0김대현
2.0윌슨
2.0차우찬
9.0장원준
9.0후랭코프
9.0유희관
8.0샘슨
8.0휠러
1.0금민철
1.0주권
1.0니퍼트
7.0듀브론트
7.0박시영
7.0레일리
4.0정수민
4.0김정훈
4.0베렛
9.0현도훈
9.0후랭코프
9.0이영하
6.0장원삼
6.0김대우
10.0브리검
10.0신재영
10.0로저스
3.0박종훈
3.0김광현
3.0켈리
1.0주권
1.0니퍼트
1.0고영표
4.0최성영
4.0이재학
4.0구창모
10.0로저스
10.0한현희
10.0최원태
9.0린드블럼
9.0후랭코프
9.0이영하
1.0고영표
1.0피어밴드
1.0금민철
7.0듀브론트
7.0박세웅
3.0김광현
3.0켈리
3.0산체스
2.0윌슨
2.0소사
2.0김영준
4.0베렛
4.0최금강
10.0한현희
10.0최원태
10.0브리검
3.0산체스
9.0유희관
9.0린드블럼
8.0김민우
8.0윤규진
8.0샘슨
2.0차우찬
2.0김대현
2.0윌슨
4.0구창모
4.0왕웨이중
4.0이재학
6.0보니야
6.0양창섭
6.0백정현
1.0금민철
1.0김사율
1.0피어밴드
8.0헤일
8.0윤규진
8.0샘슨
6.0윤성환
6.0보니야
6.0양창섭
7.0김원중
7.0박세웅
7.0레일리
9.0후랭코프
9.0린드블럼
10.0한현희
10.0최원태
7.0노경은
3.0박종훈
3.0산체스
2.0김대현
2.0배재준
7.0듀브론트
9.0린드블럼
9.0유희관
10.0한현희
10.0하영민
6.0양창섭
6.0백정현
4.0이재학
4.0왕웨이중
7.0김원중
3.0켈리
3.0산체스
6.0보니야
6.0윤성환
4.0베렛
4.0이재학
8.0헤일
8.0김진영
1.0금민철
1.0김민
2.0차우찬
2.0윌슨
8.0김민우
8.0김성훈
4.0박진우
6.0양창섭
3.0김광현
3.0박종훈
3.0박종훈
3.0박종훈
3.0박종훈
3.0문승원
3.0문

9.0린드블럼
9.0이영하
9.0이용찬
9.0후랭코프
9.0유희관
9.0이영하
9.0이용찬
9.0유희관
9.0린드블럼
9.0후랭코프
9.0이영하
9.0이용찬
9.0유희관
9.0린드블럼
9.0후랭코프
9.0장원준
9.0이용찬
9.0유희관
9.0린드블럼
9.0후랭코프
9.0유희관
9.0이용찬
9.0후랭코프
9.0장원준
9.0린드블럼
9.0유희관
9.0이용찬
9.0후랭코프
9.0이용찬
9.0후랭코프
9.0장원준
9.0린드블럼
9.0유희관
9.0이용찬
9.0후랭코프
9.0장원준
9.0린드블럼
9.0유희관
9.0이용찬
9.0후랭코프
9.0장원준
9.0린드블럼
9.0유희관
9.0이용찬
9.0이영하
9.0후랭코프
9.0린드블럼
9.0유희관
9.0이용찬
9.0유희관
9.0이용찬
9.0후랭코프
9.0린드블럼
9.0유희관
9.0이용찬
9.0이영하
9.0후랭코프
9.0린드블럼
9.0장원준
9.0이용찬
9.0이영하
9.0후랭코프
9.0린드블럼
9.0장원준
9.0린드블럼
9.0장원준
9.0유희관
9.0후랭코프
9.0린드블럼
9.0이영하
9.0유희관
9.0후랭코프
9.0장원준
9.0린드블럼
9.0이영하
9.0린드블럼
9.0유재유
9.0유희관
9.0후랭코프
9.0장원준
9.0린드블럼
9.0이용찬
9.0유희관
9.0후랭코프
9.0장원준
9.0린드블럼
9.0이용찬
9.0유희관
9.0후랭코프
9.0장원준
9.0린드블럼
9.0이용찬
9.0유희관
9.0후랭코프
9.0장원준
9.0린드블럼
3.0윤희상
3.0핀토
3.0이건욱
3.0정수민
3.0박종훈
3.0조영우
3.0핀토
3.0이건욱
3.0정수민
3.0박종훈
3.0조영우
3.0핀토
3.0조영우
3.0핀토
3.0이건욱
3.0문승원
3.0박종훈
3.0조영우
3.0핀토
3.0이건욱
3.0문승원
3.0박종훈
3.0조영우
3.0핀토
3.0이건욱
3.0문승원
3.0박종훈
3.0조영우
3.0핀토
3.0이건욱
3.0문승원
3.0핀토
3.0이건욱
3.0문승원
3.0박종훈
3.0백승건
3.0핀토
3.0오원석
3.0이건욱
3.0박종훈
3.0박종훈
3.0문승원
3.0핀

4.0루친스키
4.0송명기
4.0김영규
4.0라이트
4.0루친스키
4.0송명기
4.0라이트
4.0김영규
4.0루친스키
4.0김영규
4.0루친스키
4.0박정수
4.0최성영
4.0라이트
4.0송명기
4.0김영규
4.0루친스키
4.0박정수
4.0이재학
4.0라이트
4.0송명기
4.0김영규
4.0루친스키
4.0이재학
4.0라이트
4.0송명기
4.0최성영
4.0루친스키
4.0김영규
4.0박진우
4.0라이트
4.0송명기
4.0이재학
4.0루친스키
4.0김영규
4.0라이트
4.0루친스키
4.0김영규
4.0라이트
4.0송명기
4.0신민혁
4.0루친스키
4.0김영규
4.0송명기
4.0라이트
4.0신민혁
4.0루친스키
4.0김진호
4.0송명기
4.0라이트
4.0신민혁
4.0루친스키
4.0최성영
4.0신민혁
4.0루친스키
4.0최성영
4.0이재학
4.0라이트
4.0신민혁
4.0루친스키
4.0루친스키
4.0이재학
4.0최성영
4.0라이트
4.0루친스키
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0구창모
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0라이트
4.0구창모
4.0루친스키
4.0김진호
4.0이재학
4.0구창모
4.0김진호
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0김영규
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0김영규
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0김영규
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0김영규
4.0이재학
4.0구창모
4.0라이트
4.0루친스키
4.0최성영
4.0루친스키
4.0김영규
4.0구창모
4.0프리드릭
4.0최성영
4.0이재학
4.0루친스키
4.0프리드릭
4.0구창모
4.

In [24]:
print(df)

      label      date  field  time  team0  H/A0  team1  평균기온  강수량  평균풍속  ...  \
0       1.0  20180324    5.0  14.0    5.0   1.0    1.0   9.8  0.0   4.0  ...   
1       0.0  20180325    5.0  14.0    5.0   1.0    1.0  12.0  0.0   5.0  ...   
2       0.0  20180327    5.0  18.5    5.0   1.0    6.0  14.5  0.0   5.0  ...   
3       1.0  20180328    5.0  18.5    5.0   1.0    6.0  15.6  0.0   6.5  ...   
4       0.0  20180329    5.0  18.5    5.0   1.0    6.0  17.0  0.0   4.3  ...   
...     ...       ...    ...   ...    ...   ...    ...   ...  ...   ...  ...   
3913    NaN  20180331    NaN   NaN    NaN   NaN    1.0   NaN  NaN   NaN  ...   
3914    NaN  20180330    NaN   NaN    NaN   NaN    1.0   NaN  NaN   NaN  ...   
3915    NaN  20180329    NaN   NaN    NaN   NaN    1.0   NaN  NaN   NaN  ...   
3916    NaN  20180328    NaN   NaN    NaN   NaN    1.0   NaN  NaN   NaN  ...   
3917    NaN  20180327    NaN   NaN    NaN   NaN    1.0   NaN  NaN   NaN  ...   

      WPCT  TBF   IP    H   HR   BB  HB

#### 2-3. df_hitter

#### 2-4. 확인

In [25]:
attr = df.columns.tolist()
print('Attributes : ', attr)
print('\nAttributes Length : ', len(attr))

Attributes :  ['label', 'date', 'field', 'time', 'team0', 'H/A0', 'team1', '평균기온', '강수량', '평균풍속', '상대습도', 'player1', 'position1', 'type1', 'player2', 'position2', 'type2', 'player3', 'position3', 'type3', 'player4', 'position4', 'type4', 'player5', 'position5', 'type5', 'player6', 'position6', 'type6', 'player7', 'position7', 'type7', 'player8', 'position8', 'type8', 'player9', 'position9', 'type9', 'pitcher', 'type', 'opponent player1', 'opponent position1', 'opponent type1', 'opponent player2', 'opponent position2', 'opponent type2', 'opponent player3', 'opponent position3', 'opponent type3', 'opponent player4', 'opponent position4', 'opponent type4', 'opponent player5', 'opponent position5', 'opponent type5', 'opponent player6', 'opponent position6', 'opponent type6', 'opponent player7', 'opponent position7', 'opponent type7', 'opponent player8', 'opponent position8', 'opponent type8', 'opponent player9', 'opponent position9', 'opponent type9', 'opponent pitcher', 'opponent type', '

In [26]:
df.head()

Unnamed: 0,label,date,field,time,team0,H/A0,team1,평균기온,강수량,평균풍속,...,WPCT,TBF,IP,H,HR,BB,HBP,SO,R,ER
0,1.0,20180324,5.0,14.0,5.0,1.0,1.0,9.8,0.0,4.0,...,,,,,,,,,,
1,0.0,20180325,5.0,14.0,5.0,1.0,1.0,12.0,0.0,5.0,...,,,,,,,,,,
2,0.0,20180327,5.0,18.5,5.0,1.0,6.0,14.5,0.0,5.0,...,,,,,,,,,,
3,1.0,20180328,5.0,18.5,5.0,1.0,6.0,15.6,0.0,6.5,...,,,,,,,,,,
4,0.0,20180329,5.0,18.5,5.0,1.0,6.0,17.0,0.0,4.3,...,,,,,,,,,,


### 3. 병합된 데이터 전처리

In [25]:
# 결측치 제거 (None)


In [27]:
# 필요없는 데이터 제거 (axis = 1 : column)
df_midterm = df_midterm.drop(['team0'], axis = 1)

In [28]:
# hitter, pitcher 수치값 평균

### 4. 데이터 저장

In [29]:
df.to_excel('kia_database.xlsx')