# Building a web scraper for ESPNscrum using BeautifulSoup4

In [1]:
from bs4 import BeautifulSoup
import requests

Thankfully the website is built so that the url corresponds to data queries

In [6]:
# TODO: write function that returns url 
page = 1
url = ('http://stats.espnscrum.com/statsguru/'
        +'rugby'
        +'/stats/index.html?'
        +'class=1;'
        +'filter=advanced;'
        +'orderby=date;'
        +'page='+page
        +'size=200;'
        +'spanmax2=1+Apr+2020;'
        +'spanmin2=22+May+1987;'
        +'spanval2=span;'
        +'team=8;'
        +'template=results;'
        +'type=team;'
        +'view=match'
      )
print(url)

response = requests.get(url)
print('.get() status: {response.status_code}')
page = response.text
del response

http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=date;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match
.get() status: {response.status_code}


In [8]:
soup = BeautifulSoup(page, "html5lib")

In [30]:
# search for <table class='engineTable'> [1]

match_list = (soup.find_all('table', class_='engineTable'))[1]

In [33]:
match_list

<table class="engineTable">
<caption>Match list</caption>
<thead>
 <tr class="headlinks">
  <th class="left" nowrap=""><a class="black-link" href="/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=team;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match" title="sort by team name">Team</a></th>
  <th class="left" nowrap=""><a class="black-link" href="/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=result;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match" title="sort by result of match">Result</a></th>
  <th nowrap=""><a class="black-link" href="/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=for;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match" title="sort by total points scored">For</a></th>
  <th nowrap=""><a class="black-link" href="/statsguru/rugby/stats/inde

## Get headers

In [90]:
# TODO: Write function that pulls headers

headers = []
headers = tuple(elem.text for elem in match_list.find('thead').find('tr').find_all('th'))
headers

('Team',
 'Result',
 'For',
 'Aga',
 'Diff',
 'Tries',
 'Conv',
 'Pens',
 'Drop',
 '',
 'Opposition',
 'Ground',
 'Match Date',
 '')

## Get data for one row

In [99]:
# TODO: write function that gets data from table as list of tuples

row = []
row = tuple(match.findNext().text for match in match_list.find('tbody').findNext('tr').find_all('td'))
row


('New Zealand',
 '70',
 '6',
 '+64',
 '12',
 '8',
 '2',
 '0',
 '',
 'v Italy',
 'Auckland',
 'Auckland',
 '22 May 1987',
 '')

# Let's put it all together
Thankfully the website is built in such a way that html queries get us to the page we need.

In [112]:
# URL creator

def getESPNScrumUrl(team, page, start_date = '22+May+1987', end_date = '1+Apr+2020', 
                    sport = 'rugby', sort_filter = 'advanced', order_by='date',
                    template = 'results', type_ = 'team', view = 'match'):
    '''
    Short 
    '''
    
    team = str(team)
    page = str(page)

    url = ('http://stats.espnscrum.com/statsguru/'
            +sport
            +'/stats/index.html?'
            +'class=1;'
            +'filter='+sort_filter+';'
            +'orderby='+order_by+';'
            +'page='+page+';'
            +'size=200;'
            +'spanmax2='+end_date+';'
            +'spanmin2='+start_date+';'
            +'spanval2=span;'
            +'team='+team+';'
            +'template='+template+';'
            +'type='+type_+';'
            +'view='+view
          )
    print(' '+url)
    return url

In [113]:
getESPNScrumUrl(team=8,page=1)

 http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=date;page=1;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match


'http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;filter=advanced;orderby=date;page=1;size=200;spanmax2=1+Apr+2020;spanmin2=22+May+1987;spanval2=span;team=8;template=results;type=team;view=match'

In [105]:
# TODO: create dictionary of teams to team_number
teams = {
    '1' : 'England',
    '2' : 'Scotland',
    '3' : 'Ireland',
    '4' : 'Wales',
    '5' : 'South Africa',
    '6' : 'Australia',
    '7' : 'Germany',
    '8' : 'New Zealand',
    '9' : 'France',
    '10': 'Argentina',
    '11': 'United States of America',
    '12': 'Romania',
    '13': 'Poland',
    '14': 'Fiji',
    '15': 'Samoa',
    '16': 'Tonga',
    #'17': 'Sri Lanka',
    '18': 'Spain',
    #'19': None,
    '20': 'Italy',
    '21': 'Belgium',
    '22': 'Netherlands',
    '23': 'Japan',
    '24': 'Morocco',
    '25': 'Canada',
    '26': 'Hong Kong',
    '27': 'Portugal',
    '28': 'Chile',
    '29': 'Uruguay',
    '30': 'Denmark',
    '31': 'Sweden',
    '56': 'Switzerland',
    '81': 'Georgia',
    '82': 'Namibia',
    '32': 'British and Irish Lions',
}
len(teams)

33

In [77]:
response = requests.get(url)
print('.get() status: {response.status_code}')
page = response.text
del response

# match_list = BeautifulSoup(page, "html5lib").find_all('table', class_='engineTable')[1]

# TODO: pull headers function

# TODO: write function to get rows


In [78]:
rows = [row for row in match_list.find('tbody').find_all('tr')]  # tr tag is for rows

In [None]:
for row in rows[1:6]:
    items = row.find_all('td')
    link = items[0].find('a')
    title, url = link.text, link['href']
    movies[title] = [url] + [i.text for i in items]