In [8]:
import requests
import lxml.html
import numpy as np
import re
import datetime
import time
import random
import json
import sys
from itertools import product

# code table for cources
kaijoList = [
  ["桐生",   "01"],                                                                                                                                                                                                                           
  ["戸田",   "02"],
  ["江戸川", "03"],
  ["平和島", "04"],
  ["多摩川", "05"],
  ["浜名湖", "06"],
  ["蒲郡",   "07"],
  ["常滑",   "08"],
  ["津",     "09"],
  ["三国",   "10"],
  ["びわこ", "11"],
  ["住之江", "12"],
  ["尼崎",   "13"],
  ["鳴門",   "14"],
  ["丸亀",   "15"],
  ["児島",   "16"],
  ["宮島",   "17"],
  ["徳山",   "18"],
  ["下関",   "19"],
  ["若松",   "20"],
  ["芦屋",   "21"],
  ["福岡",   "22"],
  ["唐津",   "23"],
  ["大村",   "24"],
]

def getDetailHtml(ymd, kaijo, raceNo):
    payload = {'ymd': ymd, 'k': kaijo, 'r': raceNo}
    r = requests.post("http://kyotei.sakura.ne.jp/bangumi2.php", data=payload)
    if r.status_code != 200:
        print("Http Error:", r.status_code)
        return None
    return r.content

def getResultHtml(ymd, kaijo, raceNo):
    url = "http://boatrace.jp/owpc/pc/race/raceresult?rno=1&jcd=01&hd=20180906"
    r = requests.get(url)
    if r.status_code != 200:
        print("Http Error:", r.status_code)
        return None
    return r.content

def selRaceResult(ymd, kaijo, raceNo):
    """Return Race Result"""
    html = getResultHtml(ymd, kaijo, raceNo)
    if html == None:
        return None
    
    dom = lxml.html.fromstring(html)
    racerOrder = dom.xpath("//span[@class='is-fs12']/text()")
    frameOrder = dom.xpath("//td[contains(@class, 'is-fs14 is-fBold')]/text()")
    if racerOrder == None or len(racerOrder) != 6:
        print("Invalid racerOrder data")
        return None
    if frameOrder == None or len(frameOrder) != 6:
        print("Invalid frameOrder data")
        return None
    return (racerOrder, frameOrder)

def selRacerData(ymd, kaijo, raceNo):
    """Return Racer's data"""
    html = getDetailHtml(ymd, kaijo, raceNo)
    if html == None:
        return None
    
    removeSpaces = lambda s: re.sub(r'\s', '', s)
    removeBracketsAndPercent = lambda s: re.sub(r'[()%]', '', s)
    
    dom = lxml.html.fromstring(html)
    
    tmp = dom.xpath("//table[@cellspacing=1]//td//a/text()")
    tmp = list(map(removeSpaces, tmp))
    racerRegNos  = tmp[1:]
    
    ranks = []
    capabilities = []
    globalWinRates = []
    localWinRates = []
    morter2Rates = []
    winCounts = []
    winRates = []
    secondCounts = []
    secondRates = []
    
    lines = dom.xpath("//table[@cellspacing=1]//tr")
    for l in lines:
        dats = l.xpath(".//font/text()")
        if len(dats) == 0:
            continue
        if re.match(r'.*級.*', dats[0]):
            ranks = list(map(removeSpaces, l.xpath(".//td[@colspan=3]//font/text()")))
            
        if re.match(r'.*能力.*', dats[0]):
            capabilities = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
        
        if re.match(r'.*全国勝率.*', dats[0]):
            globalWinRates = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
            
        if re.match(r'.*当地勝率.*', dats[0]):
            localWinRates = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
            
        if re.match(r'.*モーター2率.*', dats[0]):
            morter2Rates = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
            
        if re.match(r'.*1着回数.*', dats[0]):
            winCounts = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
            winRates = list(map(removeSpaces, l.xpath(".//td/font[@color='#666666']/text()")))
            winRates = list(map(removeBracketsAndPercent, winRates))
            
        if re.match(r'.*2着回数.*', dats[0]):
            secondCounts = list(map(removeSpaces, l.xpath(".//td/strong/font/text()")))
            secondRates = list(map(removeSpaces, l.xpath(".//td/font[@color='#666666']/text()")))
            secondRates = list(map(removeBracketsAndPercent, secondRates))
    
    racerData = []
    for i in range(len(racerRegNos)):
        try:
            d = { 
                "racerRegNo":    racerRegNos[i],
                "rank":          ranks[i],
                "capability":   capabilities[i],
                "globalWinRate": globalWinRates[i],
                "localWinRate":  localWinRates[i],
                "morter2Rate":   morter2Rates[i],
                "winCount":      winCounts[i],
                "winRate":       winRates[i],
                "secondCount":   secondCounts[i],
                "secondRate":    secondRates[i],
            }
            racerData.append(d)
        except:
            print("Unexpected error:", sys.exc_info()[0])
            return None
        
    return racerData

def scrapeRaceData(ymd, kaijoJp, kaijo, raceNo):
    racerData = selRacerData(ymd, kaijo, raceNo)
    racerOrder, frameOrder = selRaceResult(ymd, kaijo, raceNo)
    if racerData == None or racerOrder == None or frameOrder == None:
        print("Invalid data")
        return None
    
    return {
        "ymd": ymd,
        "kaijoJp": kaijoJp,
        "kaijo": kaijo,
        "raceNo": raceNo,
        "racerData": racerData,
        "racerOrder": racerOrder,
        "frameOrder": frameOrder,
    }

if __name__ == '__main__':
    dt = datetime.datetime(2018, 7, 1)
    dtEnd = datetime.datetime(2018, 8, 31)
    results = []
    while dt <= dtEnd:
        dt = dt + datetime.timedelta(days=1)
        ymd = dt.strftime("%Y%m%d")
        for kj, k, r in [(k[0], k[1], r) for k in kaijoList for r in range(1, 13)]:
            print("fetching ymd:", ymd, "kaijoJp:" kj, "kaijo:", k, "raceNo:", r)
            dat = scrapeRaceData(ymd, kj, k, r)
            time.sleep(random.random() * 7 + 3)
            if dat == None:
                print("No results")
                break
            results.append(dat)
            
    with open('boatrace_results.json', mode='w') as f:
        json.dump(results, f, ensure_ascii=False)
    
    print("done!")

fetching ymd: 20180906 kaijo: 01 raceNo: 1
curr:  1
fetching ymd: 20180906 kaijo: 01 raceNo: 2
curr:  2
fetching ymd: 20180906 kaijo: 01 raceNo: 3
curr:  3
fetching ymd: 20180907 kaijo: 01 raceNo: 1
curr:  4
done!
