## https://baseball.sports.smt.docomo.ne.jp/result/games/ からデータをスクレイピング

In [1]:
from selenium import webdriver
import re
import pandas as pd
import time
from tqdm import tqdm_notebook as tq

In [2]:
def encode_count(out, base):
    """Return the game-state code: the out count followed by one 0/1 digit per base.

    Args:
        out: Number of outs; converted with ``str``.
        base: Mapping with keys ``"1"``, ``"2"``, ``"3"`` whose values are
            converted with ``int`` (``True`` -> ``1``, ``False`` -> ``0``).

    Returns:
        A string such as ``"2101"`` (two outs, runners on first and third).
    """
    occupied_flags = (str(int(base[key])) for key in ("1", "2", "3"))
    return str(out) + "".join(occupied_flags)

In [3]:
def trial_and_error(func, arg):
    """Call ``func(arg)`` and return its result, or ``None`` if the call fails.

    Only ``Exception`` subclasses are suppressed.  ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so that a long-running scrape can still be
    stopped by the user.

    Args:
        func: One-argument callable to attempt.
        arg: Value passed to ``func``.

    Returns:
        Whatever ``func(arg)`` returns, or ``None`` when it raises.
    """
    try:
        return func(arg)
    except Exception:
        return None

In [4]:
# dmenu
# Runners are read from the current page; the out count in a row is the one
# read on the previous page.
def find_info(driver, name):
    """Scrape one game's plate-appearance pages and save them as a CSV file.

    Starting from the page currently loaded in ``driver``, one row is recorded
    per page and the "次打者" link is clicked until it is missing or carries a
    different label.  The rows are then written to
    ``./game_num_data/<name>.csv`` (cp932-encoded).

    The state code stored in each row combines the runners read from the
    current page with the out count read on the previous page (0 for the first
    page).  Whenever three outs are read, an extra single-cell row ``"3000"``
    is appended and the out count is reset to 0.

    Args:
        driver: Selenium WebDriver that already has a game's live page open.
        name: File stem of the CSV to write.

    Returns:
        None.  If the runner diagram cannot be located on any page, the
        function returns immediately and writes no file.
        NOTE(review): this also discards rows already collected from earlier
        pages -- confirm that is intended.
    """
    # find_element(By..., ...) works on both old and new Selenium releases,
    # unlike the find_element_by_* helpers, which newer releases removed.
    from selenium.webdriver.common.by import By

    def text_at(xpath, skip=0):
        # Best-effort read: the element's text minus its first `skip`
        # characters (presumably the field label), or None when unavailable.
        return trial_and_error(
            lambda path: driver.find_element(By.XPATH, path).text[skip:], xpath
        )

    res_list = []
    out = 0

    while True:
        try:
            runners = driver.find_element(
                By.XPATH, '//*[@id="liveArea"]/div/div[2]/div[3]/div[1]'
            )
        except Exception:
            # Treated as "the page does not exist".
            return

        base = {"1": False, "2": False, "3": False}
        for elem in runners.find_elements(By.CLASS_NAME, "on"):
            css_classes = elem.get_attribute("class")
            for marker, key in (("first", "1"), ("second", "2"), ("third", "3")):
                if marker in css_classes:
                    base[key] = True

        inning = text_at('//*[@id="liveArea"]/div/div[2]/div[1]')

        # Pitcher-side fields.  The other pitcher stats (li[2]..li[4]) were
        # previously fetched but never stored, so those lookups are omitted.
        fielding_team = text_at('//*[@id="liveArea"]/ul[1]/li[1]')
        pitcher = text_at('//*[@id="liveArea"]/ul[1]/li[4]/a')
        pitcher_dominant_hand = text_at('//*[@id="liveArea"]/ul[1]/li[5]')
        era = text_at('//*[@id="liveArea"]/ul[1]/li[6]/ul/li[1]', skip=3)

        # Batter-side fields.
        batting_team = text_at('//*[@id="liveArea"]/ul[2]/li[1]')
        batter = text_at('//*[@id="liveArea"]/ul[2]/li[4]/a')
        batter_dominant_bat = text_at('//*[@id="liveArea"]/ul[2]/li[5]')
        batting_average = text_at('//*[@id="liveArea"]/ul[2]/li[6]/ul/li[1]', skip=2)
        num_homerun = text_at('//*[@id="liveArea"]/ul[2]/li[6]/ul/li[2]', skip=3)
        points = text_at('//*[@id="liveArea"]/ul[2]/li[6]/ul/li[3]', skip=2)
        num_fourball = text_at('//*[@id="liveArea"]/ul[2]/li[6]/ul/li[4]', skip=2)
        num_steal = text_at('//*[@id="liveArea"]/ul[2]/li[6]/ul/li[5]', skip=2)

        result = text_at('//*[@id="liveArea"]/div/div[1]/table/tbody/tr/td[2]')

        res_list.append([
            encode_count(out, base),
            inning,
            fielding_team, pitcher, pitcher_dominant_hand, era,
            batting_team, batter, batter_dominant_bat, batting_average,
            num_homerun, points, num_fourball, num_steal, result,
        ])

        # The out count is refreshed only after the row has been stored.
        out_row = driver.find_element(
            By.XPATH, '//*[@id="liveArea"]/div/div[2]/div[2]/table/tbody/tr[3]'
        )
        out = len(out_row.find_elements(By.CLASS_NAME, "on"))
        if out == 3:
            # Single-cell marker row; pandas fills the remaining columns with
            # missing values when the frame is built.
            res_list.append([encode_count(3, {"1": False, "2": False, "3": False})])
            out = 0

        try:
            next_link = driver.find_element(
                By.XPATH, '//*[@id="liveArea"]/table/tbody/tr/td[3]/p'
            )
            if next_link.text != "次打者":
                # There is no next batter.
                break
            next_link.click()
        except Exception:
            # Game set: the link is missing or cannot be clicked.
            break

    columns = ["状況", "回", "守備チーム", "投手", "利き手", "防御率", "攻撃チーム", "打者", "打席", "打率", "本塁打数", "打点", "四球数", "盗塁数", "結果"]
    pd.DataFrame(res_list, dtype="object", columns=columns).to_csv("./game_num_data/"+name+".csv", encoding="cp932")

In [6]:
# dmenu
# Walk backwards one day at a time from `today` and scrape game numbers 01-06
# of each day, using a fresh Chrome session per game.
today = "20180930"
start_date = pd.to_datetime(today)
for i in tq(range(2)):
    date = start_date - pd.Timedelta(days=i)
    # Game id = YYYYMMDD followed by a two-digit game number ("0" + j).
    date_string = date.strftime("%Y%m%d") + "0"
    for j in range(1, 7):
        new_date_string = date_string + str(j)
        url = "https://baseball.sports.smt.docomo.ne.jp/result/games/live_{}_01101.html".format(new_date_string)
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            time.sleep(1)
            try:
                find_info(driver, new_date_string)
            except Exception as exc:
                # Best effort: one failing game must not stop the whole run,
                # but report it instead of hiding it.
                print("skipped {}: {!r}".format(new_date_string, exc))
        finally:
            # Always close the browser, even when the run is interrupted.
            driver.quit()

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/konoharuki/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-b0bf55899e1f>", line 13, in <module>
    driver.get(url)
  File "/Users/konoharuki/anaconda/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 332, in get
    self.execute(Command.GET, {'url': url})
  File "/Users/konoharuki/anaconda/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 318, in execute
    response = self.command_executor.execute(driver_command, params)
  File "/Users/konoharuki/anaconda/lib/python3.5/site-packages/selenium/webdriver/remote/remote_connection.py", line 472, in execute
    return self._request(command_info[0], url, body=data)
  File "/Users/konoharuki/anaconda/lib/python3.5/site-packages/selenium/webdriver/remote/remote_connection.py", line 496, in _request
    resp = self._conn.ge

KeyboardInterrupt: 

In [12]:
# Debug cell: open the live page for the most recent `new_date_string` and
# keep the browser open so the page can be inspected interactively.
live_url_template = "https://baseball.sports.smt.docomo.ne.jp/result/games/live_{}_01101.html"
url = live_url_template.format(new_date_string)
driver = webdriver.Chrome()
driver.get(url)

KeyboardInterrupt: 

In [None]:
# Debug cell: show the text of the element at the XPath that find_info reads
# as the inning, for the page currently open in `driver`.
driver.find_element_by_xpath('//*[@id="liveArea"]/div/div[2]/div[1]').text

In [None]:
# Two rows of different length, nested into one list of rows.
a, b = [1, 2, 3], [4, 5]
c = [a, b]

In [41]:
# Build a frame from the ragged rows in `c` (defined in another cell); per the
# output below, the shorter row gets a missing value in column "c".
pd.DataFrame(c, columns=["a", "b", "c"])

Unnamed: 0,a,b,c
0,1,2,3.0
1,4,5,
