In [1]:
%matplotlib inline
%load_ext autoreload

%autoreload 2

# pandas 모듈은 흔히 pd라는 약칭으로 사용된다. Global 표준에 가깝다.
import pandas as pd
import time
from IPython.display import HTML
from IPython.display import display

# 커스텀 라이브러리를 import한다.
from pbp_plot import *
from misc import *

# Display options. Both settings only change how floats are rendered; the
# values stored in a DataFrame keep their full precision.
# The option is spelled out as 'display.precision' because the bare pattern
# 'precision' also matches 'styler.format.precision' on pandas releases that
# define it, which makes set_option raise OptionError (ambiguous pattern).
pd.set_option('display.precision', 5)
# Floats in rendered frames go through this formatter (one decimal place).
pd.set_option('display.float_format', lambda x: '%.1f' % x)
#set_fonts()

  import pandas.util.testing as tm


In [2]:
import datetime
import requests
from bs4 import BeautifulSoup
import time
from dateutil.relativedelta import relativedelta

In [3]:
def get_game_ids(start_date, end_date):
    """Collect Naver KBO game IDs dated between start_date and end_date.

    One monthly schedule page is downloaded for every calendar month from the
    month of ``start_date`` up to ``end_date``. Game IDs are taken from the
    ``gameId=`` part of the link inside each ``<span class="td_btn">``; the
    first eight characters of an ID are read as ``YYYYMMDD``.

    Args:
        start_date (datetime.date): first day to include.
        end_date (datetime.date): last day to include.

    Returns:
        list[str]: matching game IDs in the order they appear on the pages
        (possibly empty).

    Raises:
        requests.RequestException: if a schedule page cannot be downloaded
            (connection problem, timeout, or HTTP error status).
    """
    timetable_url = 'https://sports.news.naver.com/'\
                    'kbaseball/schedule/index.nhn?month='

    # Enumerate the first day of every month that has to be queried.
    month_cursor = start_date.replace(day=1)
    months = []
    while month_cursor <= end_date:
        months.append(month_cursor)
        month_cursor += relativedelta(months=1)

    game_ids = []

    for month_start in months:
        sch_url = timetable_url + f'{month_start.month}&year={month_start.year}'

        # A timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get(sch_url, timeout=10)
        try:
            # Fail loudly instead of parsing an error page as "no games".
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
        finally:
            response.close()

        for btn in soup.find_all('span', attrs={'class': 'td_btn'}):
            link = btn.a
            href = link.get('href') if link is not None else None
            # Skip buttons that carry no game link at all.
            if not href or 'gameId=' not in href:
                continue

            gid = href.split('gameId=')[1]
            try:
                gid_date = datetime.date(int(gid[:4]),
                                         int(gid[4:6]),
                                         int(gid[6:8]))
            except ValueError:
                # ID does not start with a valid YYYYMMDD date.
                continue

            if start_date <= gid_date <= end_date:
                game_ids.append(gid)
    return game_ids

In [4]:
# Quick check of get_game_ids on a short date window, with wall-clock timing.
start_date = datetime.date(2018, 3, 21)
end_date = datetime.date(2018, 4, 2)

start_time = time.time()
game_ids = get_game_ids(start_date, end_date)
elapsed = time.time() - start_time

print(f'elapsed time: {elapsed:.2f} sec')
print(f'len(gids): {len(game_ids)}')

elapsed time: 0.43 sec
len(gids): 41


In [5]:
from tqdm import tqdm, tqdm_notebook, trange
import json

20180717LGWO02018

In [412]:
# Download the play-by-play ("relay") JSON of one game and collect it in `txt`.
start_date = datetime.date(2017, 7, 8)
end_date = datetime.date(2017, 7, 8)

start_time1 = time.time()
#########################################
relay_url = 'http://m.sports.naver.com/ajax/baseball/'\
            'gamecenter/kbo/relayText.nhn'
record_url = 'http://m.sports.naver.com/ajax/baseball/'\
             'gamecenter/kbo/record.nhn'


def _fetch_relay_json(url, game_id, half, headers, timeout=10):
    """Request one relay page and return the decoded JSON object.

    Args:
        url (str): relay endpoint.
        game_id (str): value sent as the ``gameId`` query parameter.
        half (int | str): value sent (as a string) as the ``half`` parameter.
        headers (dict): HTTP headers for the request.
        timeout (float): seconds to wait before giving up.

    Returns:
        dict | None: the decoded payload, or None when the request fails,
        the server answers with an error status, or the body is not a JSON
        object.
    """
    try:
        response = requests.get(url,
                                params={'gameId': game_id, 'half': str(half)},
                                headers=headers,
                                timeout=timeout)
    except requests.RequestException:
        return None

    try:
        if not response.ok:
            return None
        payload = response.json()
        # The body may be a JSON string that itself contains JSON.
        if isinstance(payload, str):
            payload = json.loads(payload)
    except ValueError:
        return None
    finally:
        response.close()

    return payload if isinstance(payload, dict) else None


now = datetime.datetime.now().date()

game_ids = get_game_ids(start_date, end_date)
end_time1 = time.time()

skipped = 0
done = 0
start_time2 = time.time()
# NOTE: only the second game ID of the day is processed here; `gid` and `txt`
# stay defined after the loop and are read by the following cells.
for gid in tqdm([game_ids[1]]):
    gid_to_date = datetime.date(int(gid[:4]),
                                int(gid[4:6]),
                                int(gid[6:8]))
    # Games dated after today are skipped.
    if gid_to_date > now:
        skipped += 1
        continue

    headers = {
        # The space after "x64)" was missing, which fused two product tokens.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'm.sports.naver.com',
        'Referer': 'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?&gameId='
                   + gid
                   + '&tab=relay'
    }

    # The first page also carries the line-ups, the stadium and the number of
    # pages ('currentInning') that have to be requested in total.
    js = _fetch_relay_json(relay_url, gid, 1, headers)
    last_inning = js.get('currentInning') if js is not None else None

    if last_inning is None:
        skipped += 1
        continue

    txt = {
        'relayList': list(js['relayList']),
        'homeTeamLineUp': js['homeTeamLineUp'],
        'awayTeamLineUp': js['awayTeamLineUp'],
        'stadium': js['schedule']['stadium'],
    }

    all_pages_loaded = True
    for inn in range(2, last_inning + 1):
        js = _fetch_relay_json(relay_url, gid, inn, headers)
        if js is None:
            # Give up on this game; `txt` then holds only the pages read so far.
            skipped += 1
            all_pages_loaded = False
            break
        txt['relayList'].extend(js['relayList'])

    if all_pages_loaded:
        done += 1
end_time2 = time.time()

print(f'schedule get time: {(end_time1 - start_time1):.2f} sec')
print(f'get json time: {(end_time2 - start_time2):.2f} sec')

100%|██████████| 1/1 [00:01<00:00,  1.23s/it]

schedule get time: 0.19 sec
get json time: 1.24 sec





In [411]:
# Display the game IDs returned by get_game_ids for the selected date range.
game_ids

['20170708HHLG02017',
 '20170708HTKT02017',
 '20170708SKLT02017',
 '20170708WOSS02017']

In [413]:
# Display the game ID left in `gid` by the relay download loop; the cells
# below use it.
gid

'20170708HTKT02017'

In [414]:
#######################
### Lineup download ###
#######################

start_time3 = time.time()

lineup_url = 'https://sports.news.naver.com/gameCenter/gameRecord.nhn?category=kbo&gameId='
lurl = lineup_url + gid
# A timeout keeps a stalled request from hanging the notebook.
lreq = requests.get(lurl, timeout=10)
lsoup = BeautifulSoup(lreq.text, 'lxml')
lreq.close()

scripts = lsoup.find_all('script')
team_names = lsoup.find_all('span', attrs={'class': 't_name_txt'})
# The first 't_name_txt' span is used as the away team, the second as the home
# team; only the first word of each is kept.
away_team_name = team_names[0].contents[0].split(' ')[0]
home_team_name = team_names[1].contents[0].split(' ')[0]
contents = None
# Reset explicitly so a value left over from an earlier run is never reused.
cont = None

DATA_CLASS_MARKER = 'DataClass = '
# Number of characters skipped from the start of the marker before looking for
# the opening brace of the JSON object.
# NOTE(review): presumably tuned to the page's JavaScript wrapper - confirm
# against a current page before changing it.
DATA_CLASS_OFFSET = 36

for tag in scripts:
    if len(tag.contents) == 0:
        continue
    script_text = tag.contents[0]
    # Membership test so that a marker at index 0 is also detected.
    if DATA_CLASS_MARKER not in script_text:
        continue

    contents = script_text
    start = contents.find(DATA_CLASS_MARKER) + DATA_CLASS_OFFSET
    end = contents.find('_homeTeam')
    oldjs = contents[start:end].strip()

    # Keep only the text from the first '{' to the last '}'.
    first_brace = oldjs.find('{')
    last_brace = oldjs.rfind('}')
    oldjs = oldjs[first_brace:last_brace + 1] if 0 <= first_brace < last_brace else ''

    try:
        cont = json.loads(oldjs)
        break
    except ValueError as e:
        # Report the failing snippet and keep looking in the remaining scripts.
        print()
        print(gid)
        print(oldjs)
        print(e)

if cont is None:
    raise ValueError(f'could not extract the game record JSON for {gid}')

# First word of the 'result' field of the last 'etcRecords' entry.
# NOTE(review): assumed to be an umpire name - confirm against the page data.
referee = cont.get('etcRecords')[-1]['result'].split(' ')[0]

bbs = cont.get('battersBoxscore')
al = bbs.get('away')
hl = bbs.get('home')

# One-character position code from the box score -> position name / number.
pos_dict = {'중': '중견수', '좌': '좌익수', '우': '우익수', '유': '유격수', '포': '포수', '지': '지명타자',
            '一': '1루수', '二': '2루수', '三': '3루수'}

posnum_dict = {'중': 8, '좌': 7, '우': 9, '유': 6, '포': 2, '지': 0,
               '一': 3, '二': 4, '三': 5}

# Keep each batter's name and the first character of the 'pos' field.
homes = [{'name': player.get('name'), 'pos': player.get('pos')[0]}
         for player in hl]
aways = [{'name': player.get('name'), 'pos': player.get('pos')[0]}
         for player in al]

end_time3 = time.time()
print(f'referee time: {(end_time3 - start_time3):.2f} sec')

referee time: 0.05 sec


In [415]:
# Turn the downloaded relay data and line-ups into DataFrames:
# `merge_df` (text events joined with pitch tracking), `bats` and `pits`.
start_time4 = time.time()

rl = txt['relayList']
tl_keys = ['seqno', 'text', 'type', 'stuff', 'ptsPitchId', 'speed', 'playerChange']
rl_keys = ['no', 'textOptionList', 'inn', 'ptsOptionList',
           'titleStyle', 'homeOrAway', 'title', 'lastSyncIndex']
pts_keys = ['crossPlateX', 'topSz',
            'pitchId', 'vy0', 'vz0', 'vx0',
            'z0', 'ax', 'x0', 'ay', 'az',
            'bottomSz', 'stance']


def _apply_starting_positions(bat_df, starters):
    """Write the box-score position of each listed player into bat_df.

    Rows are matched by player name and updated in place: 'posName' from
    pos_dict and 'pos' from posnum_dict. Entries whose position code is '교'
    are left untouched.
    NOTE(review): '교' presumably marks a substitute - confirm.

    Args:
        bat_df (pd.DataFrame): batter frame with 'name', 'posName', 'pos'.
        starters (list[dict]): dicts with 'name' and one-character 'pos'.
    """
    for player in starters:
        pos_code = player.get('pos')
        if pos_code == '교':
            continue
        same_name = bat_df['name'] == player.get('name')
        bat_df.loc[same_name, 'posName'] = pos_dict.get(pos_code)
        bat_df.loc[same_name, 'pos'] = posnum_dict.get(pos_code)


# --- text events: one row per entry of every relay's 'textOptionList' ---
ts_set = []
stadium = txt['stadium']
for relay in rl:
    # `or []` tolerates a relay without text entries, the same way the pitch
    # loop below tolerates a missing 'ptsOptionList'.
    for ts in relay.get('textOptionList') or []:
        ts_dict = {'textOrder': relay.get('no')}
        for key in tl_keys:
            if key != 'playerChange':
                ts_dict[key] = ts.get(key)
                continue
            # A player change is flattened into up to three player-id columns.
            change = ts.get(key)
            if change is not None:
                for role in ['outPlayer', 'inPlayer', 'shiftPlayer']:
                    if role in change:
                        ts_dict[role] = change.get(role).get('playerId')
        ts_dict['referee'] = referee
        ts_dict['stadium'] = stadium
        ts_set.append(ts_dict)
ts_df = pd.DataFrame(ts_set)
ts_df = ts_df.rename(index=str, columns={'ptsPitchId': 'pitchId'})
ts_df['seqno'] = pd.to_numeric(ts_df['seqno'])

# --- pitch tracking: one row per entry of every relay's 'ptsOptionList' ---
pdata_set = []
pdata_df = None
for relay in rl:
    for pdata in relay.get('ptsOptionList') or []:
        pdata_dict = {'textOrder': relay.get('no')}
        for key in pts_keys:
            pdata_dict[key] = pdata.get(key)
        pdata_set.append(pdata_dict)

if len(pdata_set) > 0:
    pdata_df = pd.DataFrame(pdata_set)
    # No `on=` is given, so the join uses the columns both frames share
    # ('textOrder' and 'pitchId').
    merge_df = pd.merge(ts_df, pdata_df, how='outer').sort_values(['textOrder', 'seqno'])
else:
    merge_df = ts_df.sort_values(['textOrder', 'seqno'])

### Add the line-up data ###
hit_columns = ['name', 'pCode', 'posName', 'pos',
               'hitType', 'seqno', 'batOrder',
               'ab', 'hit', 'run', 'rbi',
               'hr', 'bb', 'so']
pit_columns = ['name', 'pCode', 'hitType', 'seqno',
               'inn', 'run', 'er', 'hit', 'hr',
               'bb', 'kk', 'hbp', 'wp', 'ballCount']

atl = txt.get('awayTeamLineUp')
abat = atl.get('batter')
apit = atl.get('pitcher')
abats = pd.DataFrame(abat, columns=hit_columns).sort_values(['batOrder', 'seqno'])
apits = pd.DataFrame(apit, columns=pit_columns).sort_values('seqno')

htl = txt.get('homeTeamLineUp')
hbat = htl.get('batter')
hpit = htl.get('pitcher')
hbats = pd.DataFrame(hbat, columns=hit_columns).sort_values(['batOrder', 'seqno'])
hpits = pd.DataFrame(hpit, columns=pit_columns).sort_values('seqno')

# Overwrite the positions from the relay line-up with those of the box score.
_apply_starting_positions(abats, aways)
_apply_starting_positions(hbats, homes)

# Tag every row with its side ('a' = away, 'h' = home) and team name.
for frame, side, team in ((abats, 'a', away_team_name),
                          (hbats, 'h', home_team_name),
                          (apits, 'a', away_team_name),
                          (hpits, 'h', home_team_name)):
    frame['homeaway'] = side
    frame['team_name'] = team

bats = pd.concat([abats, hbats])
pits = pd.concat([apits, hpits])
bats['pCode'] = pd.to_numeric(bats['pCode'])
pits['pCode'] = pd.to_numeric(pits['pCode'])

end_time4 = time.time()
print(f'make df time: {(end_time4 - start_time4):.2f} sec')

make df time: 0.05 sec


In [423]:
from new_pbp_parse import game_status
start_time5 = time.time()

# Load the frames built above into a game_status object, parse the game and
# save the result, passing './' as the save location.
gs = game_status()
gs.load(gid, pits, bats, merge_df)
gs.parse_game()
gs.save_game('./')

end_time5 = time.time()
# This label used to read 'make df time', copied from the previous cell, which
# made the two timing lines indistinguishable in the output.
print(f'parse game time: {(end_time5 - start_time5):.2f} sec')

make df time: 0.69 sec
