In [110]:
import requests
import lxml.html
import numpy as np
import re

# Code table for courses: each entry is [course name, two-digit code string].
# The codes run consecutively from "01", so they are derived from the
# position of each name in the ordered tuple below.
whereCds = [
    [name, "%02d" % code]
    for code, name in enumerate(
        (
            "桐生", "戸田", "江戸川", "平和島", "多摩川", "浜名湖",
            "蒲郡", "常滑", "津", "三国", "びわこ", "住之江",
            "尼崎", "鳴門", "丸亀", "児島", "宮島", "徳山",
            "下関", "若松", "芦屋", "福岡", "唐津", "大村",
        ),
        start=1,
    )
]

def selRaceResult(html):
    """Extract the race result text lists from a result page.

    Args:
        html: Markup of the result page, as accepted by
            ``lxml.html.fromstring``.

    Returns:
        A tuple ``(racerOrder, frameOrder)``. ``racerOrder`` holds the text
        nodes of every ``<span class="is-fs12">``; ``frameOrder`` holds the
        text nodes of every ``<td>`` whose class attribute contains
        ``'is-fs14 is-fBold'``. Both are in document order.
    """
    tree = lxml.html.fromstring(html)
    queries = (
        "//span[@class='is-fs12']/text()",
        "//td[contains(@class, 'is-fs14 is-fBold')]/text()",
    )
    racerOrder, frameOrder = [tree.xpath(query) for query in queries]
    return racerOrder, frameOrder

def selRacerData(html):
    """Return detail data of racers parsed from a race detail page.

    The page is expected to contain a ``<table cellspacing=1>`` whose link
    texts list the race number followed by one registration number per
    racer, and whose rows are labelled (in their first ``<font>`` text) with
    the statistic they carry.

    Args:
        html: Markup of the detail page, as accepted by
            ``lxml.html.fromstring``.

    Returns:
        A list with one dict per racer, in page order, with the keys
        ``racerRegNo``, ``rank``, ``capability``, ``globalWinRate``,
        ``localWinRate``, ``morter2Rate``, ``winCount``, ``winRate``,
        ``secondCount`` and ``secondRate``. All values are strings with
        whitespace removed; the two rate fields additionally have
        parentheses and percent signs removed. An empty list is returned
        when no racer registration numbers are found.

    Raises:
        IndexError: If a statistic row is missing or holds fewer values
            than there are racers.
    """

    def removeSpaces(s):
        return re.sub(r'\s', '', s)

    def removeBracketsAndPercent(s):
        return re.sub(r'[()%]', '', s)

    valueXpath = ".//td/strong/font/text()"
    rateXpath = ".//td/font[@color='#666666']/text()"

    # (row label pattern, [(output key, xpath inside the row, is a rate?)]).
    # The order of the keys here is the order of the keys in each result dict.
    rowSpecs = [
        (r'.*級.*', [("rank", ".//td[@colspan=3]//font/text()", False)]),
        (r'.*能力.*', [("capability", valueXpath, False)]),
        (r'.*全国勝率.*', [("globalWinRate", valueXpath, False)]),
        (r'.*当地勝率.*', [("localWinRate", valueXpath, False)]),
        (r'.*モーター2率.*', [("morter2Rate", valueXpath, False)]),
        (r'.*1着回数.*', [("winCount", valueXpath, False),
                        ("winRate", rateXpath, True)]),
        (r'.*2着回数.*', [("secondCount", valueXpath, False),
                        ("secondRate", rateXpath, True)]),
    ]
    fieldKeys = [key for _, fields in rowSpecs for key, _, _ in fields]

    dom = lxml.html.fromstring(html)
    data = [removeSpaces(s)
            for s in dom.xpath("//table[@cellspacing=1]//td//a/text()")]
    # The first link text is the race number; the remaining ones are the
    # racers' registration numbers.
    racerRegNos = data[1:]
    if not racerRegNos:
        return []

    columns = {key: [] for key in fieldKeys}
    for row in dom.xpath("//table[@cellspacing=1]//tr"):
        texts = row.xpath(".//font/text()")
        if not texts:
            continue
        label = texts[0]
        # Every pattern is tested (no early exit) and a later matching row
        # overwrites an earlier one, as in the original chain of `if`s.
        for pattern, fields in rowSpecs:
            if not re.match(pattern, label):
                continue
            for key, xpath, isRate in fields:
                values = [removeSpaces(v) for v in row.xpath(xpath)]
                if isRate:
                    values = [removeBracketsAndPercent(v) for v in values]
                columns[key] = values

    racerData = []
    for i, regNo in enumerate(racerRegNos):
        d = {"racerRegNo": regNo}
        for key in fieldKeys:
            values = columns[key]
            if i >= len(values):
                raise IndexError(
                    "no '%s' value for racer #%d (registration no. %s)"
                    % (key, i + 1, regNo)
                )
            d[key] = values[i]
        racerData.append(d)

    return racerData

def getDetailHtml(ymd, kaijo, raceNo, timeout=30):
    """Fetch the race detail page and return its raw body.

    Sends a POST to ``bangumi2.php`` with the form fields ``ymd``, ``k``
    and ``r``.

    Args:
        ymd: Race date, sent as the ``ymd`` form field.
        kaijo: Course code, sent as the ``k`` form field.
        raceNo: Race number, sent as the ``r`` form field.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The response body as bytes.
    """
    payload = {'ymd': ymd, 'k': kaijo, 'r': raceNo}
    r = requests.post(
        "http://kyotei.sakura.ne.jp/bangumi2.php",
        data=payload,
        timeout=timeout,
    )
    return r.content

def getResultHtml(ymd, kaijo, raceNo, timeout=30):
    """Fetch the race result page for the given race and return its raw body.

    The query string is built from the arguments (``rno``, ``jcd``, ``hd``)
    instead of always requesting one fixed race.

    Args:
        ymd: Race date, sent as the ``hd`` query parameter.
        kaijo: Course code, sent as the ``jcd`` query parameter.
        raceNo: Race number, sent as the ``rno`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The response body as bytes.
    """
    params = {'rno': raceNo, 'jcd': kaijo, 'hd': ymd}
    r = requests.get(
        "http://boatrace.jp/owpc/pc/race/raceresult",
        params=params,
        timeout=timeout,
    )
    return r.content

def scrapeRaceData(ymd, kaijo, raceNo):
    """Download and parse both pages for one race.

    Fetches the detail page first and parses the racer data from it, then
    fetches the result page and parses the finishing lists from it.

    Args:
        ymd: Race date, passed through to the fetch helpers.
        kaijo: Course code, passed through to the fetch helpers.
        raceNo: Race number, passed through to the fetch helpers.

    Returns:
        A dict with the three arguments under ``ymd``, ``kaijo`` and
        ``raceNo``, plus ``racerData``, ``racerOrder`` and ``frameOrder``.
    """
    scraped = {"ymd": ymd, "kaijo": kaijo, "raceNo": raceNo}

    scraped["racerData"] = selRacerData(getDetailHtml(ymd, kaijo, raceNo))

    orders = selRaceResult(getResultHtml(ymd, kaijo, raceNo))
    scraped["racerOrder"] = orders[0]
    scraped["frameOrder"] = orders[1]

    return scraped

if __name__ == '__main__':
    # Sample run: scrape a single race and print the collected dict.
    ymd, kaijo, raceNo = '20180921', '06', 1
    dat = scrapeRaceData(ymd=ymd, kaijo=kaijo, raceNo=raceNo)
    print(dat)


{'ymd': '20180921', 'kaijo': '06', 'raceNo': 1, 'racerData': [{'racerRegNos': '4757', 'ranks': 'A1', 'capabilities': '55', 'globalWinRates': '6.73', 'localWinRates': '6.98', 'morter2Rates': '30.09', 'winCounts': '44', 'winRates': '30.6', 'secondCounts': '27', 'secondRates': '18.8'}, {'racerRegNos': '4633', 'ranks': 'A2', 'capabilities': '52', 'globalWinRates': '5.20', 'localWinRates': '5.93', 'morter2Rates': '43.16', 'winCounts': '27', 'winRates': '20.3', 'secondCounts': '20', 'secondRates': '15'}, {'racerRegNos': '4579', 'ranks': 'A1', 'capabilities': '57', 'globalWinRates': '6.68', 'localWinRates': '6.77', 'morter2Rates': '33.93', 'winCounts': '41', 'winRates': '30.4', 'secondCounts': '27', 'secondRates': '20'}, {'racerRegNos': '4825', 'ranks': 'A2', 'capabilities': '53', 'globalWinRates': '5.70', 'localWinRates': '3.56', 'morter2Rates': '46.39', 'winCounts': '24', 'winRates': '20.5', 'secondCounts': '18', 'secondRates': '15.4'}, {'racerRegNos': '4812', 'ranks': 'A2', 'capabilities':