In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import json
import requests
import objectpath

from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta, date

## Beautiful Soup
### Get Match & Team Details (Upcoming Only)

In [2]:
# Download the OddsPortal Super Rugby page that lists the upcoming matches.
url = "https://www.oddsportal.com/rugby-union/world/super-rugby/"
# Read the body inside a context manager so the HTTP response is closed as
# soon as the download finishes. `html` holds the raw bytes of the page, which
# BeautifulSoup can parse directly.
with urlopen(url, timeout=5) as response:
    html = response.read()

In [3]:
# Parse the downloaded page using the lxml parser.
soup = BeautifulSoup(html, features='lxml')

In [4]:
# Show every <tr> element that carries an `xeid` attribute.
soup.find_all('tr', attrs={'xeid': True})

[<tr class="odd" xeid="6JWQy8Gd"><td class="table-time datet t1557473700-1-1-0-0"></td><td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/blues-hurricanes-6JWQy8Gd/">Blues - Hurricanes</a></td><td class="odds-nowrp"><span>-</span></td><td class="odds-nowrp"><span>-</span></td><td class="odds-nowrp"><span>-</span></td><td class="center info-value"></td></tr>,
 <tr xeid="Aic7p0Kd"><td class="table-time datet t1557481500-1-1-0-0"></td><td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/rebels-queensland-reds-Aic7p0Kd/">Rebels - Reds</a></td><td class="odds-nowrp"><span>-</span></td><td class="odds-nowrp"><span>-</span></td><td class="odds-nowrp"><span>-</span></td><td class="center info-value"></td></tr>,
 <tr class="odd" xeid="Of1BqKZ2"><td class="table-time datet t1557508200-1-1-0-0"></td><td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/blue-bulls-crusaders-Of1BqKZ2/">Bulls - Cru

In [6]:
# Select the cells whose class attribute is "name table-participant"; each one
# wraps a link labelled "<home team> - <away team>".
participant_filter = {'class': 'name table-participant'}
games = soup.find_all(attrs=participant_filter)
games

[<td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/blues-hurricanes-6JWQy8Gd/">Blues - Hurricanes</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/rebels-queensland-reds-Aic7p0Kd/">Rebels - Reds</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/blue-bulls-crusaders-Of1BqKZ2/">Bulls - Crusaders</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/highlanders-jaguares-IH0Frvk9/">Highlanders - Jaguares</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/waikato-chiefs-sharks-d8aJsb4F/">Chiefs - Sharks</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/golden-lions-waratahs-EqfOtIJL/">Lions - Waratahs</a></td>,
 <td class="name table-participant" colspan="2"><a href="/rugby-union/world/super-rugby/act-brumbies-sunwolves-hz

In [7]:
# Accumulators for the team names parsed out of each fixture label.
home_team, away_team = [], []

In [8]:
# Start from empty lists so re-running this cell cannot append duplicates.
home_team = []
away_team = []
for game in games:
    # Read the cell's text through BeautifulSoup instead of splitting the raw
    # markup on '<' and '>': this survives extra nested tags and returns
    # entity-decoded names. The label has the form "<home> - <away>".
    teams = game.get_text(strip=True).split(' - ')
    home_team.append(teams[0])
    away_team.append(teams[1])

### Get dates of games

In [None]:
# Per-match dates are too hard to scrape from this page, so every upcoming game is given the date of the next Friday (today, if today is a Friday).

In [9]:
def onDay(date, day):
    """Return `date` shifted forward to the next occurrence of weekday `day`.

    `day` follows the `weekday()` numbering (Monday is 0, Sunday is 6). When
    `date` already falls on that weekday the shift is zero days.
    """
    days_ahead = (day - date.weekday() + 7) % 7
    return date + timedelta(days=days_ahead)

In [10]:
# Every upcoming fixture shares one date: the next Friday (weekday 4), or
# today when today is already a Friday.
friday = onDay(datetime.today(), 4)
# Assemble the string by hand so the day is not zero-padded ("3-May-19", not
# "03-May-19"), matching the Date values of the historical CSV that this frame
# is concatenated with later in the notebook.
# NOTE(review): %B emits the full month name; confirm the CSV does not use
# abbreviated month names for months longer than "May".
gameday = '{}-{}'.format(friday.day, friday.strftime('%B-%y'))
gameday

'10-May-19'

## Use REST Service for Odds

In [11]:
# Session used for the odds request.
s = requests.session()

# Headers sent with that request: the Super Rugby page as the Referer and a
# desktop Chrome User-Agent string.
request_headers = {}
request_headers['Referer'] = 'https://www.oddsportal.com/rugby-union/world/super-rugby/'
request_headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

In [12]:
# Request the tournament odds feed. A timeout is set (the same 5 s used for
# the page download) because requests otherwise waits indefinitely on a
# stalled connection.
get_odds = s.get('https://fb.oddsportal.com/ajax-sport-country-tournament/8/z36c0gcb/X0/1',
                 headers=request_headers,
                 timeout=5)
# Stop here with a clear HTTP error instead of failing later while parsing an
# error page as JSON.
get_odds.raise_for_status()

In [13]:
# The response body is not bare JSON: the JSON object is preceded by 72
# characters and followed by 2 (presumably a JSONP-style callback wrapper --
# confirm against a live response). Cut from the first '{' to the last '}'
# rather than at fixed offsets, so the extraction does not depend on the exact
# length of that wrapper. Raises ValueError when the body holds no JSON object.
raw_odds = get_odds.text
text_odds = raw_odds[raw_odds.index('{'):raw_odds.rindex('}') + 1]

In [14]:
# Decode the extracted JSON text into Python objects.
json_odds = json.loads(text_odds)

In [15]:
# Wrap the payload's 'd' entry in an ObjectPath tree so it can be queried with
# path expressions.
json_tree = objectpath.Tree(json_odds['d'])

In [16]:
# Collect every value stored under an 'avg' key anywhere in the tree.
odds = tuple(json_tree.execute('$..avg'))
# odds all appear twice for some reason, so we take just half of them.
# Plain integer division replaces np.int, which was removed in NumPy 1.24.
odds = odds[:len(odds) // 2]

In [17]:
# Each game contributes three odds values, so the game count is a third of the
# number of odds collected.
num_games = len(odds) // 3

In [18]:
# Display the number of games derived from the odds feed.
num_games

7

In [19]:
# One accumulator per outcome column: "1", "X" and "2".
one_list, x_list, two_list = [], [], []

In [20]:
# take every 3 rows and put them into 1 row
for i in range(len(odds)):
    if i%3==0:
        one_list.append(odds[i])
    if i%3==1:
        x_list.append(odds[i])
    if i%3==2:
        two_list.append(odds[i])

In [21]:
# Display the first `num_games` entries of the "2" list.
two_list[:num_games]

[1.43, 3.91, 1.27, 5.25, 2.46, 3.22, 5.28]

In [34]:
# concatenate info into dataframe
df = pd.DataFrame({'Date':[gameday]*num_games,
                   'Home Team':home_team,
                   'Away Team':away_team,
                   'Home Score':0,
                   'Away Score':0,
                   'Home Odds':one_list[:num_games],
                   'Draw Odds':x_list[:num_games],
                   'Away Odds':two_list[:num_games]})

In [35]:
# Display the first `num_games` rows of the fixtures frame.
df.head(num_games)

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Odds,Draw Odds,Away Odds
0,10-May-19,Blues,Hurricanes,0,0,2.87,22.18,1.43
1,10-May-19,Rebels,Reds,0,0,1.26,23.76,3.91
2,10-May-19,Bulls,Crusaders,0,0,3.81,24.48,1.27
3,10-May-19,Highlanders,Jaguares,0,0,1.16,30.18,5.25
4,10-May-19,Chiefs,Sharks,0,0,1.56,21.62,2.46
5,10-May-19,Lions,Waratahs,0,0,1.35,22.92,3.22
6,10-May-19,Brumbies,Sunwolves,0,0,1.15,30.38,5.28


In [32]:
# Load the existing odds CSV from the project's GitHub repository, discard the
# 'Play-off Game?' column, then drop every row that still has a missing value.
dataset_url = 'https://raw.githubusercontent.com/kieranbd/superrugby-predictor/web-scraper/super_rugby_oddsportal.csv'
dataset = (
    pd.read_csv(dataset_url)
    .drop(columns='Play-off Game?')
    .dropna()
)

In [33]:
# Preview the first rows of the historical dataset.
dataset.head()

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Odds,Draw Odds,Away Odds
0,3-May-19,Crusaders,Sharks,0.0,0.0,1.05,40.4,9.2
1,3-May-19,Reds,Sunwolves,0.0,0.0,1.3,24.04,3.59
2,4-May-19,Hurricanes,Rebels,0.0,0.0,1.22,27.73,4.29
3,4-May-19,Highlanders,Chiefs,0.0,0.0,1.23,26.59,4.17
4,4-May-19,Brumbies,Blues,0.0,0.0,2.03,20.48,1.82


### Get scores from previous week
**TODO:** find the best site to get scores from, update the scores in `df`, and push the result to GitHub.

In [41]:
# Separate session for the results request.
r = requests.session()

# Rebinds `request_headers`: same User-Agent as before, but the Referer now
# points at the results page.
request_headers = {}
request_headers['Referer'] = 'https://www.oddsportal.com/rugby-union/world/super-rugby/results/'
request_headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

In [42]:
# Request the tournament archive (results) feed. A timeout is set (the same
# 5 s used for the page downloads) because requests otherwise waits
# indefinitely on a stalled connection.
get_results = r.get('https://fb.oddsportal.com/ajax-sport-country-tournament-archive/8/z36c0gcb/X0/1/2/1/',
                    headers=request_headers,
                    timeout=5)
# Surface HTTP errors immediately rather than carrying an error page forward.
get_results.raise_for_status()

In [68]:
# Download the superrugby.co.nz "Grandstand" page (its title reads
# "Fixtures and Results"). Note that this rebinds `url`.
url = "http://www.superrugby.co.nz/Grandstand"
# Read the body inside a context manager so the HTTP response is closed as
# soon as the download finishes. `html2` holds the raw bytes of the page,
# which BeautifulSoup can parse directly.
with urlopen(url, timeout=5) as response:
    html2 = response.read()

In [69]:
# Parse the second page with the lxml parser. This rebinds `soup`, so the
# OddsPortal parse tree is no longer reachable under that name.
soup = BeautifulSoup(html2, features='lxml')

In [70]:
# Display every tag in the parsed document (no filter is applied), to explore
# the page structure.
soup.find_all()

[<html class="no-js" lang="en" xmlns:fb="http://www.facebook.com/2008/fbml">
 <!--<![endif]-->
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
 <title>Fixtures and Results | superrugby.co.nz | Official home of the Investec Super Rugby competition</title>
 <meta name="description"/>
 <meta content="keywords" name="keywords"/>
 <meta content="false" http-equiv="imagetoolbar"/>
 <meta content="AllBlacks" name="apple-mobile-web-app-title"/>
 <!-- apple home screen title -->
 <meta content="AllBlacks" name="application-name"/>
 <meta content="A description of what this site does." name="msapplication-tooltip"/>
 <meta content="width=device-width, initial-scale=1, minimum-scale=1, user-scalable=no" name="viewport"/>
 <!-- controls page sizing and zooming on mobile devices -->
 <!-- For third-generation iPad with high-resolution Retina display: -->
 <!-- For iPhone with high-resolution Retina display: -->
 <link href="/apple-touch-icon-iphone4

### Output concatenated df to put on github

In [38]:
# Stack the new fixtures on top of the historical rows. ignore_index=True
# renumbers the result 0..n-1; without it both frames keep their own labels
# and the combined frame has duplicate index values.
updated_df = pd.concat([df, dataset], axis=0, ignore_index=True)

In [66]:
# Write the combined table to a local CSV file, leaving out the index column.
updated_df.to_csv("super-rugby-oddsportal-new.csv",index=False)

In [None]:
# TODO: push the updated CSV to GitHub