##### Q1. temp.xlsx 파일의 data length 가 1년치 (720)
##### Q2. ~

## 데이터 전처리

In [1]:
import pandas as pd
import numpy as np

### 1. 크롤링된 데이터 불러오기 및 전처리

#### 1-1. df_midterm (경기결과 + 날씨)

##### 1) 데이터 불러오기

In [2]:
# Load the match-result + weather table from temp.xlsx into df_midterm.
df_midterm = pd.read_excel('temp.xlsx')
# Preview the last rows to check that the load worked.
df_midterm.tail()

Unnamed: 0,label,ymd,filed,time,팀0,H/A0,팀1,H/A1,woba0,woba1,평균기온,강수량,평균풍속,상대습도
714,0,20190929,9,14.0,3,0,8,1,0.323111,0.29875,22.7,0.0,2.9,72.3
715,0,20190930,1,18.5,2,1,7,0,0.331,0.317125,22.4,0.0,5.8,61.0
716,0,20190930,9,18.5,3,0,8,1,0.315333,0.2975,22.1,0.0,3.2,76.6
717,1,20191001,1,18.5,4,0,9,1,0.36175,0.374222,23.1,0.0,6.5,67.1
718,1,20191001,4,18.5,7,1,10,0,0.312625,0.316444,22.4,9.8,6.5,95.3


In [3]:
# Show the dtype pandas inferred for each df_midterm column.
df_midterm.dtypes

label      int64
ymd        int64
filed      int64
time     float64
팀0         int64
H/A0       int64
팀1         int64
H/A1       int64
woba0    float64
woba1    float64
평균기온     float64
강수량      float64
평균풍속     float64
상대습도     float64
dtype: object

##### 2) 데이터 전처리

In [5]:
# Keep only games involving KIA (team number 5): drop every row where
# neither 팀0 nor 팀1 is 5.
has_kia = (df_midterm['팀0'] == 5) | (df_midterm['팀1'] == 5)
idx_num = df_midterm.index[~has_kia]  # index labels of the non-KIA games
df_midterm = df_midterm.drop(idx_num)

# Report how many rows remain and how many were dropped.
print(f'Length after only KIA : {len(df_midterm)}')
print(f'Removed data length : {len(idx_num)}')

Length after only KIA : 144
Removed data length : 0


In [6]:
# 팀0 에 KIA (5), 팀1 에 상대팀 번호로 데이터 정리


#### 1-2. df_entry (선발라인업)

##### 1) 데이터 불러오기

In [7]:
# Load the starting-lineup table from player_entry.xlsx into df_entry.
df_entry = pd.read_excel('player_entry.xlsx')
# Preview the last rows to check that the load worked.
df_entry.tail()

Unnamed: 0.1,Unnamed: 0,team,date,player1,position1,type1,player2,position2,type2,player3,...,type8,player9,position9,type9,pitcher,position,type,result,opponent_p,type.1
4459,4459,18 kt,2018-03-29,오태곤,LF,R,강백호,DH,L,로하스,...,R,정현,SS,R,류희운,P,우투,kt 7:1 SK,문승원,우투
4460,4460,18 kt,2018-03-28,이진영,DH,L,강백호,LF,L,로하스,...,R,정현,SS,R,금민철,P,좌투,kt 8:5 SK,박종훈,우언
4461,4461,18 kt,2018-03-27,심우준,SS,R,박경수,2B,R,로하스,...,L,장성우,C,R,고영표,P,우언,kt 5:8 SK,산체스,우투
4462,4462,18 kt,2018-03-25,정현,SS,R,오정복,DH,R,로하스,...,L,장성우,C,R,주권,P,우투,kt 1:14 KIA,양현종,좌투
4463,4463,18 kt,2018-03-24,심우준,SS,R,이진영,DH,L,로하스,...,L,장성우,C,R,피어밴드,P,좌투,kt 5:4 KIA,헥터,우투


##### 2) 데이터 전처리

In [8]:
# Drop the first column (the saved index) and the last two columns
# (the opponent-pitcher values), keeping everything in between.
kept_columns = df_entry.columns[1:-2]
df_entry = df_entry.loc[:, kept_columns]

In [9]:
# Remove dummy rows and non-regular-season games.
# Per the original note, such rows carry the same value in every column
# from 'pitcher' through 'type'; the check below compares only
# 'pitcher' against 'position'.
is_dummy = df_entry['pitcher'].eq(df_entry['position'])
idx_num = df_entry.index[is_dummy]  # index labels of the rows to drop
df_entry = df_entry.drop(idx_num).reset_index(drop=True)  # renumber rows from 0

# Report how many rows remain and how many were dropped.
print(f'Length after removing dummy : {len(df_entry)}')
print(f'Removed data length : {len(idx_num)}')

Length after removing dummy : 4323
Removed data length : 141


In [10]:
# Rewrite 'date' in the same form as df_midterm's ymd column:
# strip the dashes from the date string, then cast the result to int.
date_digits = df_entry['date'].str.replace("-", "")
df_entry['date'] = date_digits.astype(int)

In [11]:
# Convert the team labels in df_entry (e.g. '20 KIA') into integer team numbers.

# Distinct team labels present in df_entry, in order of first appearance.
team_list = df_entry['team'].unique().tolist()
print(f'team_list : {team_list}   , len({len(team_list)})')

# Team name -> team number; team_name and team_num are parallel lists.
team_name = ['KIA', '삼성', '롯데', '두산', 'SK', 'LG', '한화', 'NC', '키움', 'KT']
team_num = [5, 6, 7, 9, 3, 2, 8, 4, 10, 1]  # same order as team_name
code_by_name = dict(zip(team_name, team_num))
# The 2018 label '18 넥센' shares its team number with 키움.
code_by_name['넥센'] = code_by_name['키움']

# dict_team: full label -> team number.
# The lookup is keyed on the name part of each label, not on the label's
# position in team_list, so the result does not depend on the row order of
# the file or on every team having exactly three seasons of data.
dict_team = {}
for label in team_list:
    # Last whitespace-separated token is the team name; upper() folds the
    # lowercase '18 kt' label onto 'KT' (Korean names are unaffected).
    name = str(label).split()[-1].upper()
    if name not in code_by_name:
        # Fail loudly rather than silently assigning a wrong team number.
        raise KeyError(f'unknown team label: {label!r}')
    dict_team[label] = code_by_name[name]

# Replace every label in df_entry with its team number in one pass.
df_entry['team'] = df_entry['team'].map(dict_team)

team_list : ['20 KIA', '19 KIA', '18 KIA', '20 삼성', '19 삼성', '18 삼성', '20 롯데', '19 롯데', '18 롯데', '20 두산', '19 두산', '18 두산', '20 SK', '19 SK', '18 SK', '20 LG', '19 LG', '18 LG', '20 한화', '19 한화', '18 한화', '20 NC', '19 NC', '18 NC', '20 키움', '19 키움', '18 넥센', '20 KT', '19 KT', '18 kt']   , len(30)


In [12]:
# opponent team 값 추출


In [13]:
# Preview the last rows of df_entry after the preprocessing steps above.
df_entry.tail()

Unnamed: 0,team,date,player1,position1,type1,player2,position2,type2,player3,position3,...,player8,position8,type8,player9,position9,type9,pitcher,position,type,result
4318,1,20180329,오태곤,LF,R,강백호,DH,L,로하스,CF,...,장성우,C,R,정현,SS,R,류희운,P,우투,kt 7:1 SK
4319,1,20180328,이진영,DH,L,강백호,LF,L,로하스,CF,...,장성우,C,R,정현,SS,R,금민철,P,좌투,kt 8:5 SK
4320,1,20180327,심우준,SS,R,박경수,2B,R,로하스,CF,...,강백호,DH,L,장성우,C,R,고영표,P,우언,kt 5:8 SK
4321,1,20180325,정현,SS,R,오정복,DH,R,로하스,CF,...,강백호,LF,L,장성우,C,R,주권,P,우투,kt 1:14 KIA
4322,1,20180324,심우준,SS,R,이진영,DH,L,로하스,CF,...,강백호,LF,L,장성우,C,R,피어밴드,P,좌투,kt 5:4 KIA


#### 1-3. df_pitcher (투수 데이터)

##### 1) 데이터 불러오기

##### 2) 데이터 전처리

#### 1-4. df_hitter (타자 데이터)

##### 1) 데이터 불러오기

##### 2) 데이터 전처리

### 2. 데이터 병합

#### 2-1. df_midterm + df_entry

In [None]:
# pd.merge(df1, df2, on = "key") / key = 공통 columns

### 3. 병합된 데이터 전처리

In [None]:
# hitter, pitcher 수치값 평균

In [None]:
# 병합 후 필요없는 데이터 제거

### 4. 데이터 저장