In [None]:
%matplotlib inline
%load_ext autoreload

%autoreload 2

# pandas 모듈은 흔히 pd라는 약칭으로 사용된다. Global 표준에 가깝다.
import pandas as pd
import time
import json
from IPython.display import HTML
from IPython.display import display

# 커스텀 라이브러리를 import한다.
from pbp_plot import *
from misc import *

# Display-only settings: neither option changes the values stored in a frame.
# The option key is spelled out in full because the bare pattern 'precision'
# can match more than one key on newer pandas (e.g. `styler.format.precision`),
# which makes set_option raise OptionError.
pd.set_option('display.precision', 5)
# Render every float with one decimal place when a frame is displayed.
pd.set_option('display.float_format', lambda x: '%.1f' % x)

# Apply the project's font setup. set_fonts is not defined in this notebook;
# presumably it is provided by one of the star imports above — confirm.
set_fonts()

In [None]:
import requests
import regex
from bs4 import BeautifulSoup
import sys

In [3]:
from utils import *

In [4]:
from pbp_download import *

`JSON` 파일은 모바일 페이지에서 긁어온 정보를 취사선택, 취합한 것임.

# pbp_download(new)

In [5]:
# Naver Sports endpoints read by download_relay.
_RELAY_URL = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/relayText.nhn'
_RECORD_URL = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/record.nhn'
_GAMECENTER_URL = 'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn'
_LINEUP_URL = 'https://sports.news.naver.com/gameCenter/gameRecord.nhn?category=kbo&gameId='

# One User-Agent for every AJAX request. The relay request used to send
# "...x64)AppleWebKit..." because a space was lost between two adjacent
# string literals.
_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')

# Keys of a text entry that are not written to the relay CSV.
_DROPPED_TEXT_KEYS = ('currentGameState', 'batterRecord', 'pitcherResult',
                      'pitchResult', 'pitchNum')
# Sub-records of `playerChange`; only their `playerId` is kept.
_PLAYER_CHANGE_ROLES = ('outPlayer', 'inPlayer', 'shiftPlayer')
# Keys of a PTS entry that are not written to the relay CSV.
_DROPPED_PTS_KEYS = ('crossPlateY', 'y0', 'inn', 'ballcount')

# Columns kept from the mobile line-up records.
_HIT_COLUMNS = ['name', 'pCode', 'posName', 'pos',
                'hitType', 'seqno', 'batOrder',
                'ab', 'hit', 'run', 'rbi',
                'hr', 'bb', 'so']
_PIT_COLUMNS = ['name', 'pCode', 'hitType', 'seqno',
                'inn', 'run', 'er', 'hit', 'hr',
                'bb', 'kk', 'hbp', 'wp', 'ballCount']

# First character of the box-score `pos` field -> value written to the
# `posName` / `pos` columns of the batting CSV.
_POS_NAMES = {'중': '중견수', '좌': '좌익수', '우': '우익수', '유': '유격수',
              '포': '포수', '지': '지명타자',
              '一': '1루수', '二': '2루수', '三': '3루수'}
_POS_NUMBERS = {'중': 8, '좌': 7, '우': 9, '유': 6, '포': 2, '지': 0,
                '一': 3, '二': 4, '三': 5}


class _BoxscoreParseError(Exception):
    """Raised when the box-score JSON embedded in a record page cannot be decoded."""


def _log(lm, message):
    """Forward *message* to the optional log manager; do nothing when *lm* is None."""
    if lm is not None:
        lm.log(message)


def _report_empty_ids(args, lm):
    """Print and log the diagnostics emitted when a year or month holds no ids."""
    print('month id is empty')
    print('args: {}'.format(args))
    _log(lm, 'month id is empty')
    _log(lm, 'args : {}'.format(args))


def _is_wanted_game(game_id, today_year, today_date):
    """Return True when *game_id* passes every download filter.

    The id is sliced as year ``[:4]``, month-day ``[4:8]`` and a team code
    ``[8:10]``. `regular_start`, `playoff_start` and `teams` are not defined in
    this cell; they are expected in the notebook namespace.
    """
    year_key = game_id[:4]
    game_year = int(year_key)
    game_date = int(game_id[4:8])
    if game_year < 2008 or game_year > 7777:
        return False
    # Games dated after today have not been played yet.
    if game_year == today_year and game_date > today_date:
        return False
    if game_date < int(regular_start[year_key]):
        return False
    if game_date >= int(playoff_start[year_key]):
        return False
    return game_id[8:10] in teams


def _ajax_headers(referer):
    """Return the request headers sent to the mobile AJAX endpoints."""
    return {
        'User-Agent': _USER_AGENT,
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'm.sports.naver.com',
        'Referer': referer,
    }


def _get_json(url, params, headers):
    """GET *url* and return its decoded JSON body, or None on an HTTP error status.

    The connection is released even when decoding the body raises.
    """
    response = requests.get(url, params=params, headers=headers)
    try:
        # Only a response below 400 carries relay data.
        if response.status_code >= 400:
            return None
        payload = response.json()
    finally:
        response.close()
    # The body is occasionally a JSON document wrapped in a JSON string.
    if isinstance(payload, str):
        payload = json.loads(payload)
    return payload


def _store_relay_entries(relay_list, entries, strip_for_cp949):
    """Copy *entries* into *relay_list*, keyed by each entry's `no` field.

    With *strip_for_cp949* set, characters that cp949 cannot represent are
    removed from every text so the cp949 CSV writer cannot fail on them.
    """
    for entry in entries:
        relay_list[entry['no']] = entry
        if not strip_for_cp949:
            continue
        for option in entry.get('textOptionList') or []:
            option['text'] = option['text'].encode('cp949', 'ignore').decode('cp949')


def _fetch_relay(game_id, lm):
    """Download the relay text of every inning of *game_id*.

    Returns a dict with the keys `relayList`, `homeTeamLineUp`,
    `awayTeamLineUp` and `stadium`, or None (after logging the reason) when a
    request fails or the response reports no current inning.
    """
    headers = _ajax_headers(_GAMECENTER_URL + '?&gameId=' + game_id + '&tab=relay')
    first = _get_json(_RELAY_URL, {'gameId': game_id, 'half': '1'}, headers)
    if first is None:
        _log(lm, 'Cannot get response : {}'.format(game_id))
        return None

    last_inning = first['currentInning']
    if last_inning is None:
        _log(lm, 'Gameday not found : {}'.format(game_id))
        return None

    # Text is only stripped where the output is written as cp949 (see _download_game).
    strip_for_cp949 = sys.platform == 'win32'
    relay_list = {}
    _store_relay_entries(relay_list, first['relayList'], strip_for_cp949)
    txt = {
        'relayList': relay_list,
        'homeTeamLineUp': first['homeTeamLineUp'],
        'awayTeamLineUp': first['awayTeamLineUp'],
        'stadium': first['schedule']['stadium'],
    }

    for inn in range(2, last_inning + 1):
        page = _get_json(_RELAY_URL, {'gameId': game_id, 'half': str(inn)}, headers)
        if page is None:
            # A partial relay file would be mistaken for a finished download on
            # the next run, so the whole game is skipped instead.
            _log(lm, 'Cannot get response : {}'.format(game_id))
            return None
        _store_relay_entries(relay_list, page['relayList'], strip_for_cp949)
    return txt


def _fetch_referee(game_id):
    """Return the referee name scraped from the record endpoint, or '' if absent."""
    headers = _ajax_headers(_GAMECENTER_URL + '?gameId=' + game_id + '&tab=record')
    response = requests.get(_RECORD_URL, params={'gameId': game_id}, headers=headers)
    try:
        body = response.text
    finally:
        response.close()

    # NOTE(review): the pattern is kept byte-for-byte, including its non-raw
    # escapes; converting it to a raw string needs a separate equivalence check.
    p = regex.compile('(?<=\"etcRecords\":\[)[\\\.\{\}\"0-9:\s\(\)\,\ba-z가-힣\{\}]+')
    result = p.findall(body)
    if not result:
        return ''
    return result[0].split('{')[-1].split('":"')[1].split(' ')[0]


def _first_seen_keys(records):
    """Return every key used by *records*, in order of first appearance."""
    seen = {}
    for record in records:
        for key in record:
            seen.setdefault(key, None)
    return list(seen)


def _build_relay_frame(txt):
    """Flatten the relay entries of *txt* into one DataFrame.

    Produces one row per text entry, outer-merged with the PTS entries on
    their shared columns and sorted by `textOrder` then `seqno`.
    """
    relay = txt['relayList']
    referee = txt['referee']
    stadium = txt['stadium']

    text_keys = [
        key
        for key in _first_seen_keys(
            option
            for entry in relay.values()
            for option in entry.get('textOptionList') or [])
        if key not in _DROPPED_TEXT_KEYS
    ]
    pts_keys = _first_seen_keys(
        pitch
        for entry in relay.values()
        for pitch in entry.get('ptsOptionList') or [])

    text_rows = []
    for order, entry in relay.items():
        for option in entry.get('textOptionList') or []:
            row = {'textOrder': int(order)}
            for key in text_keys:
                if key != 'playerChange':
                    row[key] = option.get(key)
                    continue
                # A substitution is reduced to the ids of the players involved.
                change = option.get(key)
                if change is not None:
                    for role in _PLAYER_CHANGE_ROLES:
                        if role in change:
                            row[role] = change.get(role).get('playerId')
            row['referee'] = referee
            row['stadium'] = stadium
            text_rows.append(row)
    text_frame = pd.DataFrame(text_rows).rename(columns={'ptsPitchId': 'pitchId'})

    if not pts_keys:
        return text_frame.sort_values(['textOrder', 'seqno'])

    # Filtering up front replaces a pop() per row, which raised KeyError
    # whenever one of the dropped keys never occurred in the game.
    kept_pts_keys = [key for key in pts_keys if key not in _DROPPED_PTS_KEYS]
    pts_rows = []
    for order, entry in relay.items():
        for pitch in entry.get('ptsOptionList') or []:
            row = {'textOrder': int(order)}
            for key in kept_pts_keys:
                row[key] = pitch.get(key)
            pts_rows.append(row)

    merged = pd.merge(text_frame, pd.DataFrame(pts_rows), how='outer')
    return merged.sort_values(['textOrder', 'seqno'])


def _fetch_boxscore(game_id):
    """Read the PC record page of *game_id*.

    Returns ``(boxscore, away_team_name, home_team_name)``; *boxscore* is the
    decoded object embedded in the page's script, or None when no script
    contains the ``DataClass = `` marker.

    Raises:
        _BoxscoreParseError: the marker was found but the text after it could
            not be decoded as JSON.
    """
    page = requests.get(_LINEUP_URL + game_id)
    try:
        soup = BeautifulSoup(page.text, 'lxml')
    finally:
        page.close()

    team_names = soup.find_all('span', attrs={'class': 't_name_txt'})
    away_team_name = team_names[0].contents[0].split(' ')[0]
    home_team_name = team_names[1].contents[0].split(' ')[0]

    boxscore = None
    for tag in soup.find_all('script'):
        if len(tag.contents) == 0:
            continue
        script = tag.contents[0]
        marker_at = script.find('DataClass = ')
        if marker_at <= 0:
            continue
        # The object literal lies between a fixed offset after the marker and
        # the `_homeTeam` token; surrounding characters are trimmed down to the
        # outermost braces. (The offset 36 is taken over unchanged —
        # presumably the length of the text preceding the literal; confirm.)
        snippet = script[marker_at + 36:script.find('_homeTeam')]
        first_brace = snippet.find('{')
        last_brace = snippet.rfind('}')
        try:
            if first_brace < 0 or last_brace < first_brace:
                raise ValueError('no JSON object found after the marker')
            boxscore = json.loads(snippet[first_brace:last_brace + 1])
        except ValueError as err:
            raise _BoxscoreParseError(
                '{}\n{}'.format(snippet.strip(), err)) from err
        break
    return boxscore, away_team_name, home_team_name


def _team_lineup_frames(lineup, box_batters, side, team_name):
    """Build the (batters, pitchers) frames of one team.

    *lineup* is the mobile line-up record and *box_batters* the team's list
    from the PC box score. A batter's `posName` / `pos` are filled from the
    first character of the box-score `pos`, matched by player name; entries
    whose first character is '교' are left untouched.
    """
    batters = pd.DataFrame(lineup.get('batter'), columns=_HIT_COLUMNS)
    batters = batters.sort_values(['batOrder', 'seqno'])
    pitchers = pd.DataFrame(lineup.get('pitcher'), columns=_PIT_COLUMNS)
    pitchers = pitchers.sort_values('seqno')

    for player in box_batters:
        pos = player.get('pos')[0]
        if pos == '교':
            continue
        same_name = batters['name'] == player.get('name')
        batters.loc[same_name, 'posName'] = _POS_NAMES.get(pos)
        batters.loc[same_name, 'pos'] = _POS_NUMBERS.get(pos)

    for frame in (batters, pitchers):
        frame['homeaway'] = side
        frame['team_name'] = team_name
    return batters, pitchers


def _download_game(game_id, out_dir, lm):
    """Download one game and write its three CSV files into *out_dir*.

    Returns True when the files were written and False when the game was
    skipped (the reason is logged). _BoxscoreParseError propagates to the
    caller.
    """
    txt = _fetch_relay(game_id, lm)
    if txt is None:
        return False
    txt['referee'] = _fetch_referee(game_id)
    relay_frame = _build_relay_frame(txt)

    boxscore, away_team_name, home_team_name = _fetch_boxscore(game_id)
    if boxscore is None:
        # Without this check the box score of the previously processed game
        # would be reused (or a NameError raised on the first game).
        _log(lm, 'Lineup not found : {}'.format(game_id))
        return False

    box_batters = boxscore.get('battersBoxscore')
    abats, apits = _team_lineup_frames(
        txt.get('awayTeamLineUp'), box_batters.get('away'), 'a', away_team_name)
    hbats, hpits = _team_lineup_frames(
        txt.get('homeTeamLineUp'), box_batters.get('home'), 'h', home_team_name)
    bats = pd.concat([abats, hbats])
    pits = pd.concat([apits, hpits])

    csv_options = {'index': False}
    if sys.platform == 'win32':
        csv_options['encoding'] = 'cp949'
    # The relay file is written last: its presence is what marks a game as
    # already downloaded on later runs.
    bats.to_csv(os.path.join(out_dir, game_id + '_batting.csv'), **csv_options)
    pits.to_csv(os.path.join(out_dir, game_id + '_pitching.csv'), **csv_options)
    relay_frame.to_csv(os.path.join(out_dir, game_id + '_relay.csv'), **csv_options)
    return True


def download_relay(args, lm=None):
    """Download relay text, PTS data and line-ups for the games selected by *args*.

    For every selected game three files are written below the current working
    directory: ``pbp_data/<year>/<month>/<game_id>_relay.csv``,
    ``..._batting.csv`` and ``..._pitching.csv``. A game whose relay file
    already exists and is non-empty is not downloaded again, unless the game is
    dated today.

    Args:
        args: passed unchanged to `get_game_ids`, which is expected to return
            ``{year: {month: [game_id, ...]}}``.
        lm: optional log manager. It must provide `resetLogHandler`,
            `setLogPath`, `setLogFileName`, `cleanLog`, `createLogHandler`
            and `log`. With None nothing is logged.

    Returns:
        True when every year and month was processed. False when no game ids
        were found, when a year or month has no ids, or when a game's
        embedded box score could not be decoded.
    """
    now = datetime.datetime.now()
    today_year = now.year
    today_date = int(now.date().strftime('%m%d'))

    game_ids = get_game_ids(args)
    if not game_ids:
        print('no game ids')
        print('args: {}'.format(args))
        _log(lm, 'no game ids')
        _log(lm, 'args: {}'.format(args))
        return False

    # Every path is built from the starting directory instead of os.chdir, so
    # an exception or early return cannot leave the process stranded in a
    # nested (or parent) directory.
    root_dir = os.getcwd()

    if lm is not None:
        lm.resetLogHandler()
        lm.setLogPath(root_dir)
        lm.setLogFileName('relay_download_log.txt')
        lm.cleanLog()
        lm.createLogHandler()
        lm.log('---- Relay Text Download Log ----')

    data_dir = os.path.join(root_dir, 'pbp_data')
    os.makedirs(data_dir, exist_ok=True)

    print("##################################################")
    print("######        DOWNLOAD RELAY DATA          #######")
    print("##################################################")

    for year, months in game_ids.items():
        year_started = time.time()
        print(" Year {}".format(year))
        if len(months) == 0:
            _report_empty_ids(args, lm)
            return False

        year_dir = os.path.join(data_dir, str(year))
        os.makedirs(year_dir, exist_ok=True)

        for month, month_ids in months.items():
            month_started = time.time()
            print("  Month {}".format(month))
            if len(month_ids) == 0:
                _report_empty_ids(args, lm)
                return False

            month_dir = os.path.join(year_dir, str(month))
            os.makedirs(month_dir, exist_ok=True)

            done = 0
            skipped = 0
            for game_id in month_ids:
                if not _is_wanted_game(game_id, today_year, today_date):
                    skipped += 1
                    continue

                if not check_url2(_RELAY_URL):
                    skipped += 1
                    _log(lm, 'URL error : {}'.format(_RELAY_URL))
                    continue

                relay_file = os.path.join(month_dir, game_id + '_relay.csv')
                # Today's game may still be in progress, so it is always
                # downloaded again.
                played_today = (int(game_id[:4]) == today_year
                                and int(game_id[4:8]) == today_date)
                if (not played_today
                        and os.path.isfile(relay_file)
                        and os.path.getsize(relay_file) > 0):
                    done += 1
                    _log(lm, 'File Duplicate : {}'.format(game_id))
                    continue

                try:
                    saved = _download_game(game_id, month_dir, lm)
                except _BoxscoreParseError as err:
                    # An undecodable page aborts the whole run.
                    print()
                    print(game_id)
                    print(err)
                    return False

                if saved:
                    done += 1
                else:
                    skipped += 1
                print_progress('    Downloading: ', len(month_ids), done, skipped)

            # download done
            print_progress('    Downloading: ', len(month_ids), done, skipped)
            print('\n        Downloaded {} files'.format(done))
            print('        (Skipped {} files)'.format(skipped))
            print('            -- elapsed {:.3f} sec for month {}'.format(
                time.time() - month_started, month))

        print('   -- elapsed {:.3f} sec for year {}'.format(
            time.time() - year_started, year))

    return True

In [6]:
!pwd

/Users/kpark/work/KBO_pbp_text_crawler


In [29]:
# Move the working directory up three levels.
# NOTE(review): presumably a manual recovery after an interrupted
# download_relay run left the kernel inside pbp_data/<year>/<month> — confirm
# the current directory (e.g. with !pwd) before running this.
os.chdir('../../..')

In [33]:
# Delete the relay, batting and pitching CSV files under pbp_data/2019/3 and
# pbp_data/2019/4 (paths are relative to the current working directory).
!rm pbp_data/2019/3/*relay.csv pbp_data/2019/4/*relay.csv
!rm pbp_data/2019/3/*batting.csv pbp_data/2019/4/*batting.csv
!rm pbp_data/2019/3/*pitching.csv pbp_data/2019/4/*pitching.csv

rm: pbp_data/2019/3/*relay.csv: No such file or directory
rm: pbp_data/2019/4/*relay.csv: No such file or directory


In [6]:
!ls -altr pbp_data/2019/3

total 94200
-rw-r--r--    1 kpark  staff   835551  2 24 17:06 20190323HHOB02019_relay.json
-rw-r--r--    1 kpark  staff    22224  2 24 17:06 20190323HHOB02019_textset.csv
-rw-r--r--    1 kpark  staff    35863  2 24 17:06 20190323HHOB02019_ptsset.csv
-rw-r--r--    1 kpark  staff   847841  2 24 17:06 20190323KTSK02019_relay.json
-rw-r--r--    1 kpark  staff    22201  2 24 17:06 20190323KTSK02019_textset.csv
-rw-r--r--    1 kpark  staff    35996  2 24 17:06 20190323KTSK02019_ptsset.csv
-rw-r--r--    1 kpark  staff   771075  2 24 17:06 20190323LGHT02019_relay.json
-rw-r--r--    1 kpark  staff    20444  2 24 17:06 20190323LGHT02019_textset.csv
-rw-r--r--    1 kpark  staff    33435  2 24 17:06 20190323LGHT02019_ptsset.csv
-rw-r--r--    1 kpark  staff   764633  2 24 17:06 20190323SSNC02019_relay.json
-rw-r--r--    1 kpark  staff    20274  2 24 17:06 20190323SSNC02019_textset.csv
-rw-r--r--    1 kpark  staff    32664  2 24 17:06 20190323SSNC02019_ptsset.csv
-rw-r--r--    1 kpark  staff   89020

In [10]:
# Unpacked by the next cell as [first month, last month, first year, last year].
args = [3, 10, 2019, 2019]

In [11]:
# Builds `game_ids` as {year: {month: [game_id, ...]}} by scraping the monthly
# schedule page for every month/year combination described by `args`.
timetable_url = "https://sports.news.naver.com/kbaseball/schedule/index.nhn?month="

# parse arguments
mon_start = args[0]
mon_end = args[1]
year_start = args[2]
year_end = args[3]

# get game ids
game_ids = {}

for year in range(year_start, year_end + 1):
    year_ids = {}

    for month in range(mon_start, mon_end + 1):
        month_ids = []
        timetable = timetable_url + '{}&year={}'.format(str(month), str(year))

        response = requests.get(timetable)
        try:
            table_page = response.text
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        soup = BeautifulSoup(table_page, 'lxml')
        # find_all is the current BeautifulSoup spelling and the one used by
        # the rest of this notebook (findAll is a deprecated alias).
        buttons = soup.find_all('span', attrs={'class': 'td_btn'})

        for btn in buttons:
            link = btn.a
            if link is None:
                # A button without an anchor carries no game link.
                continue
            address = link.get('href', '')
            if 'gameId=' not in address:
                continue
            game_id = address.split('gameId=')[1]
            month_ids.append(game_id)

        year_ids[month] = month_ids

    game_ids[year] = year_ids

In [7]:
# Run the downloader with the argument list [3, 10, 2019, 2019]; no log manager
# is passed, so `lm` keeps its default of None.
download_relay([3, 10, 2019, 2019])

##################################################
######        DOWNLOAD RELAY DATA          #######
##################################################
 Year 2019
  Month 3
    Downloading: [++++++++++++++++++++++++++++++] 77 / 77, 100.0 %
        Downloaded 40 files
        (Skipped 37 files)
            -- elapsed 56.556 sec for month 3
  Month 4
    Downloading: [++++++++++++++++++++++++++++++] 113 / 113, 100.0 %
        Downloaded 113 files
        (Skipped 0 files)
            -- elapsed 146.795 sec for month 4
  Month 5
    Downloading: [++++++++++++++++++++++++++++++] 132 / 132, 100.0 %
        Downloaded 132 files
        (Skipped 0 files)
            -- elapsed 163.439 sec for month 5
  Month 6
    Downloading: [++++++++++++++++++++++++++++++] 123 / 123, 100.0 %
        Downloaded 123 files
        (Skipped 0 files)
            -- elapsed 144.212 sec for month 6
  Month 7
    Downloading: [++++++++++++++++++++++++++++++] 92 / 92, 100.0 %
        Downloaded 91 files
        (S

True

# 버그 있던거

In [9]:
# Rebuild the merged text/PTS frame of one saved relay JSON file.
fname = 'pbp_data/2019/4/20190423HTLG02019_relay.json'
# The context manager closes the handle even if json.load raises.
with open(fname, 'r') as f:
    js = json.load(f)
rl = js.get('relayList')

stadium = js.get('stadium')
referee = js.get('referee')

# Collect, in order of first appearance, the keys used by the relay entries
# (rl_keys), by their text entries (tl_keys) and by their PTS entries (pts_keys).
tl_keys = []
rl_keys = []
pts_keys = []
for k, entry in rl.items():
    for key in entry:
        if key not in rl_keys:
            rl_keys.append(key)

    for ts in entry.get('textOptionList'):
        for key in ts:
            if key not in tl_keys:
                tl_keys.append(key)
    for pdata in entry.get('ptsOptionList'):
        for key in pdata:
            if key not in pts_keys:
                pts_keys.append(key)

# Keys left out of the text frame. Filtering (instead of list.remove) does not
# raise ValueError when a game never used one of them.
tl_keys_copy = [
    key for key in tl_keys
    if key not in ('currentGameState', 'batterRecord', 'playerChange', 'pitcherResult')
]

# One row per text entry, tagged with the order number of its relay entry.
ts_set = []
for k, entry in rl.items():
    for ts in entry.get('textOptionList'):
        ts_dict = {'textOrder': int(k)}
        for key in tl_keys_copy:
            ts_dict[key] = ts.get(key)
        ts_dict['referee'] = referee
        ts_dict['stadium'] = stadium
        ts_set.append(ts_dict)
ts_df = pd.DataFrame(ts_set)
# Use the same column name as the PTS records so the merge can match on it.
ts_df = ts_df.rename(index=str, columns={'ptsPitchId': 'pitchId'})

# One row per PTS entry; pdata_df stays None when the game has no PTS data.
pdata_set = []
if len(pts_keys) > 0:
    for k, entry in rl.items():
        for pdata in entry.get('ptsOptionList'):
            pdata_dict = {'textOrder': int(k)}
            for key in pts_keys:
                pdata_dict[key] = pdata.get(key)
            pdata_set.append(pdata_dict)

    pdata_df = pd.DataFrame(pdata_set)
else:
    pdata_df = None

if pdata_df is not None:
    # Outer merge on the columns both frames share.
    merge_df = pd.merge(ts_df, pdata_df, how='outer')
    # merge_df.to_csv('test.csv', index=False)
else:
    merge_df = ts_df

# download

- 지금은 모바일 페이지 들어가서 이것저것 긁어오는 중.
- `relayText.nhn`은 문자중계, 홈라인업, 어웨이라인업, 구장이름(`stadium`)
- `record.nhn`은 심판 이름(`referee`)
- `request`를 보낼 때 `params`를 파라미터로 같이 보내고, 이 중 `half` 파라미터에 이닝이 들어간다.
    - 이 이닝에 나온 문자중계만 `response`로 받음
- 리스폰스 객체의 `.json()` 함수로 `JSON` 형태의 데이터를 추출 가능
- 안에 `relayList`, `currentInning`, `homeTeamLineUp`, `awayTeamLineUp`, `schedule` 등의 키를 활용
    - `relayList`: 문자중계 내역
    - `currentInning`: 실시간 현재 이닝 -> 경기 끝난 후에는 경기 마지막 이닝
    - `homeTeamLineUp`: 홈팀 라인업
    - `awayTeamLineUp`: 어웨이 라인업
    - `schedule`: 진행중인 중계 관련 내용이 있는데 여기서 `stadium` 키를 쓰면 구장이 나온다

- `relayList`로 얻어온 value는 리스트 형태, 별도의 의미있는 인덱스는 없음(0, 1, ...), 리스트 엘리먼트 각각은 dictionary
- `relayList` 딕셔너리 내부에 `no`라는 키가 있는데 경기 문자중계 전체 단위 인덱스(순서)
    - 0번이 시작(__1회초 XX 공격__), 마지막 N번이 경기종료 메시지(__승리투수 등 표시__)
- 문자중계 내용은 `relayList` 딕셔너리의 `textOptionList` 키로 얻을 수 있는 리스트에 있음
- PTS 데이터는 `relayList` 딕셔너리의 `ptsOptionList` 키로 얻을 수 있는 리스트에 있음
    - 경우에 따라서 누락된 데이터도 있음
    - 순서가 꼭 문자중계와 맞지는 않음
    - ptsPitchId 키 값으로 매치해야함
- `textOptionList` 리스트 안의 엘리먼트 하나는 또 딕셔너리 object
    - 1구1구 단위 메타데이터 포함
    - 꼭 필요한(항상 나오는) 요소는...
        - `seqno`
        - `text`
        - `type`
        - `stuff` 등등
- 지금까지는 json으로 얻어온 object를 JSON 포맷으로 통째로 저장
- 앞으로는 object를 dataframe으로 바꾼 다음에 csv 형태로 보기 편하게 바꿔서 저장하는 쪽으로

- 다운로드 과정에서 Unicode 변환 안되는 텍스트는 누락 처리(blank text, '')했음
    - `try`/`except`를 써서 `UnicodeEncodeError`가 나올 때 별도 처리
- `.encode` 함수의 `errors` 파라미터를 `ignore`로 설정하면 에러 없이 해결 가능
    - 그렇게 바꿈

# modify download

In [11]:
# Only referenced by the commented-out get_game_ids call below.
args = [4, 4, 2019, 2019]

# Mobile AJAX endpoints requested by the following cell.
relay_url = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/relayText.nhn'
record_url = 'http://m.sports.naver.com/ajax/baseball/gamecenter/kbo/record.nhn'

# game_ids = get_game_ids(args)
# game_id = '20180717LGWO02018' ## game that has the Unicode bug
# Game id used by the following cells.
game_id = '20190423HTLG02019'

In [12]:
# Download the relay text of every inning of `game_id`, plus stadium and
# referee, into `txt`.
params = {
    'gameId': game_id,
    'half': '1'
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/59.0.3071.115 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'm.sports.naver.com',
    'Referer': 'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?&gameId='
               + game_id
               + '&tab=relay'
}

# The first request also carries the line-ups, the stadium and the number of
# the last inning to fetch.
response = requests.get(relay_url, params=params, headers=headers)
try:
    js = response.json()
finally:
    # Release the connection even when the body is not valid JSON.
    response.close()
if isinstance(js, str):
    js = json.loads(js)
last_inning = js['currentInning']

txt = {}
txt['relayList'] = {}
txt['homeTeamLineUp'] = js['homeTeamLineUp']
txt['awayTeamLineUp'] = js['awayTeamLineUp']

txt['stadium'] = js['schedule']['stadium']

# Fetch the remaining innings; all pages are processed together afterwards.
pages = [js]
for inn in range(2, last_inning + 1):
    params = {
        'gameId': game_id,
        'half': str(inn)
    }

    response = requests.get(relay_url, params=params, headers=headers)
    try:
        js = response.json()
    finally:
        response.close()
    if isinstance(js, str):
        js = json.loads(js)
    pages.append(js)

# Characters cp949 cannot represent are only removed where cp949 output is
# used (Windows). The encode() call used to discard its result, so the texts
# were never actually changed.
strip_for_cp949 = sys.platform == 'win32'
for page in pages:
    for entry in page['relayList']:
        txt['relayList'][entry['no']] = entry
        if strip_for_cp949:
            for option in entry['textOptionList']:
                option['text'] = option['text'].encode('cp949', 'ignore').decode('cp949')

# Referee: taken from the record endpoint, which expects its own Referer.
params = {
    'gameId': game_id
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                  'like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'm.sports.naver.com',
    'Referer': 'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?gameId='
               + game_id
               + '&tab=record'
}

response = requests.get(record_url, params=params, headers=headers)
try:
    record_text = response.text
finally:
    response.close()

p = regex.compile('(?<=\"etcRecords\":\[)[\\\.\{\}\"0-9:\s\(\)\,\ba-z가-힣\{\}]+')
result = p.findall(record_text)
if len(result) == 0:
    txt['referee'] = ''
else:
    txt['referee'] = result[0].split('{')[-1].split('":"')[1].split(' ')[0]

# team lineup

- 모바일 아닌 PC 페이지에서 가져와야 한다.
- 라인업 내용은 페이지 html에 하드코딩되어있지 않다.
- jquery? 사용한 자바스크립트 형태(아마도) 스크립트로 경기 메타를 object로 저장하고 있다.
- 이걸 불러와서 페이지에 뿌리는 식이다.
- 그래서 다음 순서로 진행한다.
    1. PC 페이지 html에서 BS4를 써서 `<script>` 있는 부분만 싹 긁어온다.
    2. 원하는 object 코드가 있는 스크립트를 찾는다. (object를 define하는 코드 텍스트를 검색)
    3. object 부분만 긁어온다. JSON 호환가능한 코드
    4. JSON 객체로 바꾼다.
    5. 라인업 부분만 가져온다.

- 가져온 라인업 내용은 batting order 순서대로 나열되어있다.
- key를 `'pos'`로 입력해서 포지션 내용만 가져올 수 있다.
- 맨앞의 글자가 선발 당시 라인업이다.
    - 도중 교체 출전은 '교', 지명타자는 '지'
    - 1, 2, 3루수는 한자로 一, 二, 三
    - 나머지는 포, 유, 좌, 중, 우

- 결과 기록 페이지라서 게임단위 메타 데이터만 있다.
- 선수 별로는 pCode 정도가 전부
- 모바일 페이지에서 긁어온 것과 합쳐야 한다.

# 버그있던 경기 체크

In [13]:
# Columns kept from the batter and pitcher line-up records.
hit_columns = [
    'name', 'pCode', 'posName', 'pos', 'hitType', 'seqno', 'batOrder',
    'ab', 'hit', 'run', 'rbi', 'hr', 'bb', 'so',
]
pit_columns = [
    'name', 'pCode', 'hitType', 'seqno', 'inn', 'run', 'er',
    'hit', 'hr', 'bb', 'kk', 'hbp', 'wp', 'ballCount',
]

# Away-team line-up downloaded into `txt` by the earlier cell.
atl = txt.get('awayTeamLineUp')
abat, apit = atl.get('batter'), atl.get('pitcher')

# Batters ordered by batting order, then by seqno; pitchers by seqno.
bats = pd.DataFrame(abat, columns=hit_columns)
bats = bats.sort_values(['batOrder', 'seqno'])
pits = pd.DataFrame(apit, columns=pit_columns)
pits = pits.sort_values('seqno')

In [14]:
# Merge the positions from the PC box score into the line-up frames built
# from `txt`.
lineup_url = 'https://sports.news.naver.com/gameCenter/gameRecord.nhn?category=kbo&gameId='
lurl = lineup_url + '20190423HTLG02019'
lreq = requests.get(lurl)
try:
    lsoup = BeautifulSoup(lreq.text, 'lxml')
finally:
    # Release the connection even if parsing fails.
    lreq.close()

scripts = lsoup.find_all('script')
contents = None

# The box score is an object literal inside the script containing
# 'DataClass = '; it is cut from a fixed offset after the marker up to and
# including the first '}}}'.
for tag in scripts:
    if len(tag.contents) == 0:
        continue
    script = tag.contents[0]
    start = script.find('DataClass = ')
    if start <= 0:
        continue
    start += 36
    end = script.find('}}}') + 3
    oldjs = script[start:end]
    contents = json.loads(oldjs)
    break

if contents is None:
    # Fail with a clear message instead of a NameError (or stale data from an
    # earlier run of this cell).
    raise ValueError('no script with "DataClass = " found in ' + lurl)

bbs = contents.get('battersBoxscore')
al = bbs.get('away')
hl = bbs.get('home')

# First character of the box-score `pos` field -> position name / number.
pos_dict = {'중': '중견수', '좌': '좌익수', '우': '우익수', '유': '유격수', '포': '포수', '지': '지명타자',
            '一': '1루수', '二': '2루수', '三': '3루수'}

posnum_dict = {'중': 8, '좌': 7, '우': 9, '유': 6, '포': 2, '지': 0,
            '一': 3, '二': 4, '三': 5}

# Name and first `pos` character of every box-score batter.
homes = [{'name': player.get('name'), 'pos': player.get('pos')[0]} for player in hl]
aways = [{'name': player.get('name'), 'pos': player.get('pos')[0]} for player in al]

hit_columns = ['name', 'pCode', 'posName', 'pos',
               'hitType', 'seqno', 'batOrder',
               'ab', 'hit', 'run', 'rbi',
               'hr', 'bb', 'so']
pit_columns = ['name', 'pCode', 'hitType', 'seqno',
               'inn', 'run', 'er', 'hit', 'hr',
               'bb', 'kk', 'hbp', 'wp', 'ballCount']

atl = txt.get('awayTeamLineUp')
abat = atl.get('batter')
apit = atl.get('pitcher')
abats = pd.DataFrame(abat, columns=hit_columns).sort_values(['batOrder', 'seqno'])
apits = pd.DataFrame(apit, columns=pit_columns).sort_values('seqno')

htl = txt.get('homeTeamLineUp')
hbat = htl.get('batter')
hpit = htl.get('pitcher')
hbats = pd.DataFrame(hbat, columns=hit_columns).sort_values(['batOrder', 'seqno'])
hpits = pd.DataFrame(hpit, columns=pit_columns).sort_values('seqno')

abats2 = abats.copy()
hbats2 = hbats.copy()
for a in aways:
    # Entries whose first `pos` character is '교' keep their line-up values.
    if a.get('pos') == '교':
        continue
    away_match = abats2['name'] == a.get('name')
    abats2.loc[away_match, 'posName'] = pos_dict.get(a.get('pos'))
    abats2.loc[away_match, 'pos'] = posnum_dict.get(a.get('pos'))
for h in homes:
    if h.get('pos') == '교':
        continue
    home_match = hbats2['name'] == h.get('name')
    # Look the position up through `h`: using the leftover away-loop variable
    # here gave every home batter the last away player's position.
    hbats2.loc[home_match, 'posName'] = pos_dict.get(h.get('pos'))
    hbats2.loc[home_match, 'pos'] = posnum_dict.get(h.get('pos'))
hbats2['homeaway'] = 'h'
abats2['homeaway'] = 'a'