## https://baseball.sports.smt.docomo.ne.jp/result/games/ からデータをスクレイピング

In [1]:
from selenium import webdriver
import re
import pandas as pd
import time
from tqdm import tqdm_notebook as tq
import sys , os

sys.path.append("..")
from utils.util_functions import *

In [2]:
def get_img_list(driver):
    """Return the ``src`` URLs of the runner images on the current page.

    Args:
        driver: Selenium WebDriver with a page already loaded.

    Returns:
        list[str]: ``src`` of every ``<img>`` element whose URL contains
        ``"result/img_runner"``, in the order the driver returns them.
    """
    srcs = []
    for img in driver.find_elements_by_tag_name("img"):
        # Read the attribute once per element; an <img> without a src yields
        # None, which must be skipped before the substring test.
        src = img.get_attribute("src")
        if src and "result/img_runner" in src:
            srcs.append(src)
    return srcs

In [3]:
def get_play_list(driver):
    """Return the text of every table row that looks like a play record.

    A row is kept when its text consists of exactly three lines: a single
    digit, one non-empty line, and a line ending in
    ``<digit> <digits>-<digits>``. make_csv_mlb later splits these into
    batting order, batter, result, outs and the two scores.

    Args:
        driver: Selenium WebDriver with a game page already loaded.

    Returns:
        list[str]: matching row texts, in the order the driver returns them.
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\d, \-), which newer Python versions warn about.
    play_row = re.compile(r"^\d\n.+\n.+ \d \d+\-\d+$")
    row_texts = (row.text for row in driver.find_elements_by_tag_name("tr"))
    return [text for text in row_texts if play_row.match(text)]

In [4]:
def get_top_team(driver):
    """Return the text of the first cell in row 2 of the table under ``#game``.

    make_csv_mlb uses this value as the team batting in the top half of
    each inning.
    """
    cell = driver.find_element_by_xpath(
        '//*[@id="game"]/div/div[1]/table/tbody/tr[2]/td[1]'
    )
    return cell.text

def get_bot_team(driver):
    """Return the text of the first cell in row 3 of the table under ``#game``.

    make_csv_mlb uses this value as the team batting in the bottom half of
    each inning.
    """
    cell = driver.find_element_by_xpath(
        '//*[@id="game"]/div/div[1]/table/tbody/tr[3]/td[1]'
    )
    return cell.text

In [5]:
def img_to_base(img_text):
    """Encode the digits found in *img_text* as a 4-character 0/1 string.

    Every digit ``d`` anywhere in *img_text* sets position ``d`` of the
    result to ``"1"``; all other positions stay ``"0"``. make_csv_mlb
    overwrites position 0 with the out count, so only positions 1-3 survive
    into the CSV.

    Args:
        img_text: Text to scan; callers pass the ``src`` URL of a runner image.

    Returns:
        str: four characters, each ``"0"`` or ``"1"``.

    Raises:
        IndexError: if *img_text* contains a digit greater than 3.
    """
    flags = ["0"] * 4
    # Raw string avoids the invalid "\d" escape of the original literal.
    for digit in re.findall(r"\d", img_text):
        flags[int(digit)] = "1"
    return "".join(flags)

In [6]:
def text_to_batter_data(text):
    """Split one play row's text into a flat list of fields.

    The text is split at every single whitespace character (newlines
    included), so consecutive whitespace produces empty strings. When that
    yields more than one field, the last field is further split on ``"-"``
    (the ``<score>-<score>`` pair of a play row).

    Args:
        text: Row text as returned by get_play_list.

    Returns:
        list[str]: the fields, in order.
    """
    # r"\s" already covers "\n"; the raw string also removes the invalid
    # "\s" escape the original non-raw pattern relied on.
    fields = re.split(r"\s", text)
    if len(fields) > 1:
        fields = fields[:-1] + fields[-1].split("-")
    return fields

In [7]:
def get_url_list(date, year):
    """Return the game-detail URLs linked from one day's MLB results page.

    Opens ``.../mlb/result/<year>/<date>.html`` in a headless Chrome and
    collects the ``href`` of every link that looks like a game page.

    Args:
        date: Month and day as one string, e.g. ``"0810"``.
        year: Year as it appears in the URL, e.g. ``"2018"``.

    Returns:
        list[str]: hrefs ending in ``2017/<7 digits>/`` or
        ``2018/<7 digits>/``, in page order.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    url = "https://baseball.sports.smt.docomo.ne.jp/mlb/result/{}/{}.html".format(year, date)
    # NOTE(review): the filter is hard-coded to 2017/2018 regardless of
    # `year`; pages for other seasons would yield an empty list.
    game_url = re.compile(r"^.+201(7|8)/\d{7}/$")

    try:
        driver.get(url)
        hrefs = [link.get_attribute("href")
                 for link in driver.find_elements_by_tag_name("a")]
    finally:
        # Close the browser even if loading or scanning the page fails, so a
        # failed date does not leave a Chrome process behind.
        driver.quit()

    # Anchors without an href yield a non-string value and are skipped.
    return [href for href in hrefs
            if isinstance(href, str) and game_url.match(href)]

In [8]:
def make_csv_mlb(url, save_dir=DETAIL_DATA_DIR_MLB):
    """Scrape one game page and save its play-by-play table as a CSV file.

    The file is written to ``<save_dir>/<game id>.csv`` (cp932-encoded),
    where the game id is the second-to-last ``/``-separated part of *url*.
    Each CSV row is one play, with these columns:

    * ``状況`` (situation): outs before the play followed by three 0/1 flags
      taken from the runner image (positions 1-3 of img_to_base).
    * ``打順``, ``打者``, ``結果``, ``アウト``, ``裏得点``, ``表得点``: the
      fields parsed from the row text by text_to_batter_data.
    * ``イニング`` (inning), ``表裏`` (top/bottom), ``攻撃チーム`` (batting
      team): derived by switching halves each time ``アウト`` equals "3".

    Args:
        url: Game page URL ending in ``.../<game id>/``.
        save_dir: Output directory. The default constant is not defined in
            this file; presumably it comes from utils.util_functions.

    Returns:
        None. Returns without writing anything when the team names cannot be
        read from the page (treated as "page does not exist").
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        try:
            top_team = get_top_team(driver)
            bot_team = get_bot_team(driver)
        except Exception:
            # Team cells not found: the page does not exist.
            return
        img_list = get_img_list(driver)
        play_list = get_play_list(driver)
    finally:
        # Always release the browser. Previously the early return above and
        # any later exception left a headless Chrome process running.
        driver.quit()

    columns = ["状況", "打順", "打者", "結果", "アウト", "裏得点", "表得点"]
    rows = [[img_to_base(img)] + text_to_batter_data(play)
            for img, play in zip(img_list, play_list)]
    # Reverse the scraped order (presumably the page lists the latest play
    # first — confirm), so the inning tracking below runs first play to last.
    df = pd.DataFrame(rows, columns=columns).iloc[::-1]

    innings = []
    halves = []
    curr_inning = 1
    half = 1  # 1 = top of the inning, -1 = bottom
    for out in df["アウト"]:
        innings.append(curr_inning)
        halves.append(half)
        if out == "3":
            # Third out ends the half-inning; a new inning starts whenever
            # we come back around to the top half.
            half *= -1
            if half == 1:
                curr_inning += 1

    df["イニング"] = innings
    df["表裏"] = ["表" if h == 1 else "裏" for h in halves]
    df["攻撃チーム"] = [top_team if h == 1 else bot_team for h in halves]

    # Outs *before* each play = outs after the previous play, where "3"
    # (end of a half-inning) and the very first play both mean "0".
    outs_before = df["アウト"].shift(1).fillna("0").replace("3", "0")
    # Position 0 of the base flags is replaced by that out count.
    df["状況"] = [out + case[1:] for out, case in zip(outs_before, df["状況"])]

    df = df.reset_index(drop=True).astype(str)
    df.to_csv(os.path.join(save_dir, url.split("/")[-2] + ".csv"), encoding="cp932")

In [9]:
#make_csv_mlb("https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/2071783/")#.to_csv("aaa.csv")

In [10]:
#pd.read_csv("aaa.csv", dtype=str)

In [11]:
#notyet_list = set(range(1703959, 2093884)) - set(pd.Series(os.listdir(DETAIL_DATA_DIR_MLB)[1:]).apply(lambda x: int(x.split(".")[0])).values)

In [12]:
"""notyet_url_list = ["https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/{}/".format(num) for num in notyet_list]
for url in notyet_url_list:
    try:
        make_csv_mlb(url)
        time.sleep(0.1)
    except:
        print(url)"""

'notyet_url_list = ["https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/{}/".format(num) for num in notyet_list]\nfor url in notyet_url_list:\n    try:\n        make_csv_mlb(url)\n        time.sleep(0.1)\n    except:\n        print(url)'

In [13]:
# Dates to scrape, one "YYYY-MM-DD 00:00:00" entry per line (85 entries).
# The loop in the next cell splits each line into year / month / day.
s = """2018-08-10 00:00:00
2018-08-09 00:00:00
2018-08-05 00:00:00
2018-07-25 00:00:00
2018-07-23 00:00:00
2018-07-22 00:00:00
2018-07-21 00:00:00
2018-07-16 00:00:00
2018-07-15 00:00:00
2018-07-14 00:00:00
2018-07-12 00:00:00
2018-07-10 00:00:00
2018-07-09 00:00:00
2018-07-08 00:00:00
2018-07-06 00:00:00
2018-06-28 00:00:00
2018-06-26 00:00:00
2018-06-21 00:00:00
2018-06-18 00:00:00
2018-06-17 00:00:00
2018-06-16 00:00:00
2018-06-15 00:00:00
2018-06-11 00:00:00
2018-06-10 00:00:00
2018-06-09 00:00:00
2018-06-08 00:00:00
2018-06-07 00:00:00
2018-06-06 00:00:00
2018-06-04 00:00:00
2018-06-02 00:00:00
2018-06-01 00:00:00
2018-05-31 00:00:00
2018-05-30 00:00:00
2018-05-29 00:00:00
2018-05-28 00:00:00
2018-05-27 00:00:00
2018-05-26 00:00:00
2018-05-25 00:00:00
2018-05-23 00:00:00
2018-05-20 00:00:00
2018-05-18 00:00:00
2018-05-17 00:00:00
2018-05-16 00:00:00
2018-05-14 00:00:00
2018-05-10 00:00:00
2018-05-09 00:00:00
2018-05-08 00:00:00
2018-05-07 00:00:00
2018-05-06 00:00:00
2018-05-05 00:00:00
2018-05-04 00:00:00
2018-05-03 00:00:00
2018-04-30 00:00:00
2018-04-29 00:00:00
2018-04-26 00:00:00
2018-04-25 00:00:00
2018-04-24 00:00:00
2018-04-22 00:00:00
2018-04-21 00:00:00
2018-04-19 00:00:00
2018-04-18 00:00:00
2018-04-11 00:00:00
2018-04-08 00:00:00
2017-10-02 00:00:00
2017-09-23 00:00:00
2017-09-20 00:00:00
2017-09-10 00:00:00
2017-09-06 00:00:00
2017-08-23 00:00:00
2017-08-10 00:00:00
2017-07-30 00:00:00
2017-07-24 00:00:00
2017-07-17 00:00:00
2017-07-06 00:00:00
2017-07-02 00:00:00
2017-06-27 00:00:00
2017-06-20 00:00:00
2017-06-15 00:00:00
2017-06-07 00:00:00
2017-06-04 00:00:00
2017-05-27 00:00:00
2017-05-22 00:00:00
2017-05-11 00:00:00
2017-04-27 00:00:00
2017-04-20 00:00:00"""

In [14]:
# Starts from 2017/4/3.
# start_date / days are only used by the disabled variant of this loop that
# walked back `days` days from `start_date`; the active loop below visits
# just the dates listed in `s`.
start_date = "20180912"
days = 700

# Each line of `s` is "YYYY-MM-DD 00:00:00"; keep the first three parts.
date_parts = pd.Series(s.split("\n")).apply(lambda line: re.split(r"-|\s", line)[:3])

for (year, month, date) in tq(date_parts):
    year_string = year
    date_string = month + date  # "MMDD"; `date` is the day of the month

    for url in get_url_list(date_string, year_string):
        try:
            make_csv_mlb(url)
            time.sleep(0.1)
        except Exception:
            # Best effort: report the failing game and carry on. Catching
            # Exception rather than using a bare except lets Ctrl-C
            # (KeyboardInterrupt) stop a long run.
            print(url)

HBox(children=(IntProgress(value=0, max=85), HTML(value='')))

https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/1996952/
https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/1996135/
https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/1996134/



In [None]:
#pd.Series(s.split("\n")).apply(lambda x: re.split("\-|\s", x)[:3])

In [None]:
# Scratch check: list the game URLs found on the 2018 "0810" results page.
get_url_list("0810", "2018")

In [None]:
# Scratch check: scrape one game page and write its CSV to the default directory.
make_csv_mlb( 'https://baseball.sports.smt.docomo.ne.jp/mlb/result/games/2018/1996687/')

In [None]:
# Scratch note — XPath used by get_top_team: //*[@id="game"]/div/div[1]/table/tbody/tr[2]/td[1]

In [None]:
# Scratch check: needs a global `img_list`, which no cell in this file defines
# (it only exists as a local inside make_csv_mlb).
img_to_base(img_list[0])

In [None]:
# NOTE(review): stale scratch call — make_csv_mlb is defined as
# (url, save_dir), so this passes lists where a URL and a directory are expected.
make_csv_mlb(img_list, play_list)

In [None]:
# Scratch check: put the images and plays side by side (one row per pair after
# the transpose) and apply img_to_base to the image column (column 0).
# Needs global `img_list` / `play_list`, which no cell in this file defines.
pd.DataFrame([img_list, play_list]).T[0].apply(img_to_base)

In [None]:
# Scratch check: test the play-row pattern from get_play_list against the text
# of the 11th <tr>. Needs a global `driver`, which no cell in this file defines.
re.match("^\d\n.+\n.+ \d \d+\-\d+$", driver.find_elements_by_tag_name("tr")[10].text)

In [None]:
# Scratch note — a variant of the play-row regex (not the one get_play_list uses): ^\d\n.+\n.+\n\s\d\s\d+\-\d$

In [None]:
# Scratch check: probe the 8th <tr>; the attribute name was left empty.
# Needs a global `driver`, which no cell in this file defines.
driver.find_elements_by_tag_name("tr")[7].get_attribute("")

In [None]:
# Scratch: group the lines of the #inningDetail element by half-inning.
# Each group starts with its "<n>回表" / "<n>回裏" header line; bare "<n>回"
# lines are dropped. Needs a global `driver` with a game page loaded.
res = []
curr = []
for elem in driver.find_elements_by_xpath('//*[@id="inningDetail"]')[0].text.split("\n"):
    if re.match(r"\d+回$", elem):
        continue
    if re.match(r"\d+回(表|裏)", elem):
        # Close the previous group, but do not emit the empty group that
        # exists before the first header.
        if curr:
            res.append(curr)
        curr = [elem]
        continue
    curr.append(elem)
# Flush the last half-inning; without this it was silently lost.
if curr:
    res.append(curr)

In [None]:
# Display the `res` list built in the previous cell.
res