In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pandas as pd

from team_names import seria_a

In [2]:
# Launch a visible (non-headless) Chrome session driven by the local chromedriver binary
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# Url for the match's historical odds movements from Bet365.
# The query string carries "id" (87215245), "r1" / "r2" (two names written as
# %uXXXX escapes of Chinese characters) and "Company" (Bet 365).
# NOTE(review): "id" presumably identifies the match and "r1" / "r2" the two
# teams — confirm against the site before changing any of them.
url= 'http://www.310win.com/info/oddshistory.aspx?id=87215245&r1=%u5F17%u6D1B%u897F%u8BFA%u5C3C&r2=%u5207%u6C83&Company=Bet%20365'

# Navigate to url in the Chrome session opened above
browser.visit(url)

In [4]:
# Match metadata: "week" and "gc" end up in the output file name, "year" is
# prepended to every scraped "mm-dd HH:MM" time, "match_time" is the match start.
week, year = 38, 2019
match_time = "05-25 10:59"
gc = "Bet365"

In [5]:
# One empty list per table column of interest, keyed by the column's position in a row:
# 0 = win odds, 1 = draw odds, 2 = loss odds (for the home team), -1 = update time (last cell)
dict_im = {column: [] for column in (0, 1, 2, -1)}

# View the keys for "dict_im"
list(dict_im)

[0, 1, 2, -1]

In [6]:
# Parse the HTML currently rendered by the browser with BeautifulSoup
soup = BeautifulSoup(browser.html, 'html.parser')

# Find the first table on the page and fail early, with a clear message, if it is missing
table = soup.find('table')
if table is None or table.tbody is None:
    raise ValueError("No table with a <tbody> was found in the page HTML")

# Retrieve all rows from tbody
rows = table.tbody.find_all('tr')
if not rows:
    raise ValueError("The odds table contains no rows")

# Note that there is no "th" in the table: the first row acts as the header
# and holds the names of the home team (first cell) and away team (third cell)
header_row, *odds_rows = rows
header_cells = header_row.find_all('td')
home = header_cells[0].text
away = header_cells[2].text

# Empty the columns first so that re-running this cell does not duplicate records
for column in dict_im.values():
    column.clear()

# Scrape odds as well as time features and append to corresponding keys of "dict_im".
# Each key is the position of the wanted cell inside the row (-1 = last cell).
for row in odds_rows:
    tds = row.find_all('td')
    for key, column in dict_im.items():
        column.append(tds[key].text)

# View "dict_im"
dict_im

{0: ['2.00',
  '2.05',
  '2.10',
  '2.20',
  '2.30',
  '2.20',
  '2.37',
  '2.30',
  '2.40',
  '2.40',
  '2.30',
  '2.45',
  '2.30',
  '2.25',
  '2.30'],
 1: ['3.75',
  '3.75',
  '3.75',
  '3.80',
  '3.75',
  '3.60',
  '3.60',
  '3.75',
  '3.75',
  '3.75',
  '3.75',
  '3.75',
  '3.75',
  '3.75',
  '3.75'],
 2: ['3.40',
  '3.30',
  '3.25',
  '3.00',
  '2.80',
  '3.00',
  '2.80',
  '2.87',
  '2.70',
  '2.62',
  '2.80',
  '2.62',
  '2.80',
  '2.87',
  '2.75'],
 -1: ['\xa0 05-25 23:35',
  '\xa0 05-25 22:16',
  '\xa0 05-25 19:25',
  '\xa0 05-24 17:53',
  '\xa0 05-23 23:10',
  '\xa0 05-23 22:17',
  '\xa0 05-23 18:01',
  '\xa0 05-23 16:06',
  '\xa0 05-23 12:02',
  '\xa0 05-23 04:45',
  '\xa0 05-23 01:37',
  '\xa0 05-21 22:33',
  '\xa0 05-21 21:37',
  '\xa0 05-21 19:19',
  '\xa0 05-21 08:26(初盘)']}

In [7]:
# Each scraped time looks like "\xa0 mm-dd HH:MM", and the last one (date of the
# original odds) carries an extra trailing marker in Chinese characters.
# Keep only the "mm-dd HH:MM" part: strip() removes the surrounding whitespace
# (the non-breaking space "\xa0" included) and the slice drops anything after the time.
# Unlike slicing off a fixed number of characters, this is safe to re-run on
# values that have already been cleaned.
odds_time_width = len("mm-dd HH:MM")
dict_im[-1][:] = [raw_time.strip()[:odds_time_width] for raw_time in dict_im[-1]]

# View "dict_im[-1]"
dict_im[-1]

['05-25 23:35',
 '05-25 22:16',
 '05-25 19:25',
 '05-24 17:53',
 '05-23 23:10',
 '05-23 22:17',
 '05-23 18:01',
 '05-23 16:06',
 '05-23 12:02',
 '05-23 04:45',
 '05-23 01:37',
 '05-21 22:33',
 '05-21 21:37',
 '05-21 19:19',
 '05-21 08:26']

In [8]:
# Convert "match_time" ("mm-dd HH:MM") to a pandas Timestamp in the year given by "year".
# The isinstance guard makes the cell safe to re-run once "match_time" is already a Timestamp.
# Parsing errors are raised here: the deprecated errors="ignore" would silently hand back
# the raw string, which only fails later when the time difference is computed.
if isinstance(match_time, str):
    match_time = pd.to_datetime(f"{year}-{match_time}", format="%Y-%m-%d %H:%M")

In [9]:
# Container for the time remaining until the start of the match, one entry per odds record
odds_time_list = list()

In [10]:
# Hours to add when subtracting the two scraped clocks: "match_time" is scraped as
# Central Daylight Time (GMT-5) and the odds times as China Standard Time (GMT+8),
# so the two clocks are 8 - (-5) = 13 hours apart.
# NOTE(review): under Central Standard Time (GMT-6) the gap would be 14 hours — confirm
# before reusing this notebook for a match played outside daylight saving time.
tz_offset = pd.Timedelta(hours=13)

# Minutes between each odds record and the start of the match.
# The list is rebuilt from scratch (rather than appended to) so that re-running this
# cell does not duplicate records. A malformed time raises here instead of being
# passed on as a string, because errors="ignore" is no longer used.
# NOTE: every odds record is assumed to fall in the same calendar year as "year".
odds_time_list = [
    (match_time - pd.to_datetime(f"{year}-{dt}", format="%Y-%m-%d %H:%M") + tz_offset).total_seconds() / 60
    for dt in dict_im[-1]
]

# View "odds_time_list"
odds_time_list

[24.0,
 103.0,
 274.0,
 1806.0,
 2929.0,
 2982.0,
 3238.0,
 3353.0,
 3597.0,
 4034.0,
 4222.0,
 5846.0,
 5902.0,
 6040.0,
 6693.0]

In [11]:
# Assemble the scraped odds columns and the minutes-to-match values into one DataFrame
match_columns = {
    label: dict_im[key]
    for label, key in (("win", 0), ("draw", 1), ("lose", 2))
}
match_columns["odds_time_minutes"] = odds_time_list
match = pd.DataFrame(match_columns)

# Preview "match"
match.head()

Unnamed: 0,win,draw,lose,odds_time_minutes
0,2.0,3.75,3.4,24.0
1,2.05,3.75,3.3,103.0
2,2.1,3.75,3.25,274.0
3,2.2,3.8,3.0,1806.0
4,2.3,3.75,2.8,2929.0


In [12]:
# Close the browser: the page HTML was already read from "browser.html" when the
# table was parsed, and no later cell uses the browser session.
browser.quit()

In [13]:
# Write "match" to the data folder; the file name combines the week, the two scraped
# team names looked up in "seria_a", and the bookmaker label
csv_path = f"../data/wk{week}_{seria_a[home]}_{seria_a[away]}_{gc}.csv"
match.to_csv(csv_path, index=False, header=True)