# Scraping Rugby Data (All Blacks & Ireland matches Oct 2003-Nov 2018)

 We'll be scraping data for All Black matches and Ireland matches from early October 2003 to the present (December 2018). We chose October 2003 as the starting point for all our data because World Rugby rankings weren't introduced before then. The data we'll be scraping will include Opposition Name, Date of Match, Result of Match, Number of Debutants in the Opposition, Number of Debutants in the All Blacks/Ireland, Opposition Tries in the Last 5 Games leading up to the match, All Black/Ireland Tries in the Last 5 Games leading up to the match, Opposition Rating on the day of the match, All Black/Ireland Rating on the day of the match, and Number of Games Since Last Loss.

In [63]:
import requests
import csv
import re
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from numpy import arange

# Result-page URLs for team=8 (the pages scraped into ABMatchData below).
# Page 1 has no "page=" field; pages 2-5 insert "page=N;" right after "orderby=date;".
_AB_URL_HEAD = "http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;"
# NOTE(review): "spanmax1=1st+Dec+201" looks truncated; it is kept exactly as it was - confirm against the site.
_AB_URL_TAIL = "spanmax1=1st+Dec+201;spanmax2=1+Dec+2018;spanmin1=13+Oct+2003;spanmin2=13+Oct+2003;spanval1=span;spanval2=span;team=8;template=results;type=team;view=match"

page1 = _AB_URL_HEAD + _AB_URL_TAIL
page2, page3, page4, page5 = (
    _AB_URL_HEAD + "page=" + str(page_number) + ";" + _AB_URL_TAIL
    for page_number in range(2, 6)
)

# Index of the next DataFrame row to fill; each populate call returns the new value.
startpos = 0

In [18]:
# Column layout of the All Blacks match table.
# The last column is spelled 'Games since last loss' because that is the label the
# later cells write to and cast; the former spelling ('... last lost') left this
# column permanently empty while a second, stray column was created on first write.
columns = ['Opposition Name','Date','Result','Location','Opposition Debutants','All Black Debutants','Opposition tries in last 5 games','All Black tries in last 5 games','Opposition Rating','All Black Rating','Games since last loss']
# 205 rows (labels 0-204): the populate calls fill rows up to, but excluding, 205
# and the cleaning loop iterates range(0,205).
ABMatchData = pd.DataFrame(columns=columns,index=range(0,205))

In [21]:
#Populate First 4 Columns
def populate_first4col(url, endoftable, startpos, dateendindx, resultendpos, TeamMatchData):
    """Scrape one results page and write four columns into ``TeamMatchData``.

    The page at ``url`` is downloaded and parsed with BeautifulSoup. Four lists
    are cut out of it by fixed positions, then copied row by row into the
    'Opposition Name', 'Date', 'Result' and 'Location' columns.

    Args:
        url: Address of the results page to download.
        endoftable: Row label one past the last row to fill.
        startpos: Row label of the first row to fill.
        dateendindx: Exclusive end of the slice taken from the page's ``<b>``
            tags (the slice always starts at position 4).
        resultendpos: Exclusive upper bound on the ``<td>`` positions that are
            read as results (positions 13, 27, 41, ... below this bound).
        TeamMatchData: DataFrame that is modified in place through ``.loc``.

    Returns:
        ``endoftable``, i.e. the row label at which the next page should start.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the page yields fewer values than the requested row
            range needs. Raised before any row is written.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever;
    # raise_for_status stops an error page from being parsed as match data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    parser = BeautifulSoup(response.content, 'html.parser')

    # Each list below is a positional cut of the page's tags. The offsets and
    # strides are those the page layout was observed to have when this was written.
    scraped = {
        # every 4th left-aligned cell, starting at position 2
        'Opposition Name': [cell.text for cell in parser.find_all('td', class_="left")[2::4]],
        # <b> tags from position 4 up to dateendindx
        'Date': [tag.text for tag in parser.find_all('b')[4:dateendindx]],
        # every 14th <td>, starting at position 13, below resultendpos
        'Result': [cell.text for cell in parser.find_all('td')[13:max(resultendpos, 0):14]],
        # every second 'data-link' anchor, starting at position 1
        'Location': [link.text for link in parser.find_all('a', class_='data-link')[1::2]],
    }

    # Fail early and clearly instead of half-filling the table.
    rows_needed = endoftable - startpos
    for column, values in scraped.items():
        if len(values) < rows_needed:
            raise ValueError(
                "expected at least %d values for %r from %s, found %d"
                % (rows_needed, column, url, len(values))
            )

    for offset, row in enumerate(range(startpos, endoftable)):
        for column, values in scraped.items():
            TeamMatchData.loc[row, column] = values[offset]

    return endoftable




In [None]:
#populate
# One tuple per results page: (url, exclusive end row, <b>-tag slice end, <td> position bound).
# startpos is threaded through: each call returns the row where the next page starts.
for _page_url, _end_row, _date_stop, _result_stop in (
    (page1, 50, 54, 700),
    (page2, 100, 54, 700),
    (page3, 150, 54, 700),
    (page4, 200, 54, 700),
    (page5, 205, 9, 70),
):
    startpos = populate_first4col(_page_url, _end_row, startpos, _date_stop, _result_stop, ABMatchData)

#### Our first 4 columns for our All Black DF should now be filled

In [299]:
ABMatchData

 #### We need to clean the Opposition name column in order to use it to scrape for opposition tries in the last 5 games

In [25]:
#Clean Opposition Name column to make URL to scrape tries in last 5 games
# Every 'v ' occurrence is removed from the text of each of the 205 rows.
for row in range(205):
    scraped_name = str(ABMatchData.loc[row, 'Opposition Name'])
    ABMatchData.loc[row, 'Opposition Name'] = re.sub('v ', '', scraped_name)

#Get list of Unique Opponents (sorted in place)
OppList = ABMatchData['Opposition Name'].unique()
OppList.sort()

#Make series with custom index for URL construction
# Each label is paired by position with the sorted name at the same position, and the
# scraping loop puts that label into the URL's "team=" field.
# NOTE(review): presumably these are the site's team ids; the list has to stay in step
# with the sorted opponent names - confirm whenever the opponent set changes.
index = [10, 6, 25, 1, 14, 9, 81, 3, 20, 23, 32, 82, 121, 27, 12, 15, 2, 5, 16, 11, 4]
OppSeries = pd.Series(OppList, index=index)

#Tries in Last 5 games URL parts
urlpart1 = 'http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;orderbyad=reverse;spanmax2='
urlpart2 = ''  # filled per match; format is: 17+Oct+2003
urlpart3 = ';spanmin2=1+Jan+2000;spanval2=span;team='
urlpart4 = ''  # filled per match; ints from OppSeries index
urlpart5 = ';template=results;type=team;view=match'
accumtries = 0

#Make complete URL to scrape try data from previous 5 games
# <td> positions whose text is summed as tries.
# NOTE(review): presumably the tries cells of the five matches listed after the first
# row of the reverse-ordered results table - confirm against the page layout.
trylocations = [28, 42, 56, 70, 84]
for x in range(0,205):
    date = ABMatchData.loc[x,'Date']
    urlpart2 = str(re.sub(' ','+',date))
    oppname = ABMatchData.loc[x,'Opposition Name']
    # the label stored against this opponent's name is the URL's "team=" value
    urlpart4 = str(OppSeries[OppSeries==oppname].index[0])
    FullURL = urlpart1+urlpart2+urlpart3+urlpart4+urlpart5

    #start scraping again with new URL
    time.sleep(10)  # pause between requests
    # Timeout: a stalled connection must not hang the whole run.
    # raise_for_status: an HTTP error page must not be silently recorded as 0 tries.
    response = requests.get(FullURL, timeout=30)
    response.raise_for_status()
    parser = BeautifulSoup(response.content, 'html.parser')
    AllTagText = [tag.text for tag in parser.find_all('td')]

    accumtries = 0
    for loc in trylocations:
        try:
            accumtries = accumtries + int(AllTagText[loc])
        except (IndexError, ValueError):
            # The cell is missing (fewer matches listed) or is not a number:
            # it contributes nothing to the total.
            continue
    ABMatchData.loc[x,'Opposition tries in last 5 games'] = accumtries

accumtries = 0
    

#### Now we do the exact same thing, just for the All Blacks instead of their opposition 

In [81]:
#Insert tries in last 5 games for All Blacks

#Tries in Last 5 games URL parts for All Blacks
urlpart1 = 'http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;orderbyad=reverse;spanmax2='
urlpart2 = ''#format is: 17+Oct+2003
urlpart3 = ';spanmin2=1+Jan+2000;spanval2=span;team=8;template=results;type=team;view=match'
accumtries = 0

# <td> positions whose text is summed as tries.
# NOTE(review): presumably the tries cells of the five matches listed after the first
# row of the reverse-ordered results table - confirm against the page layout.
trylocations = [28, 42, 56, 70, 84]

#Make complete URL to scrape try data from previous 5 games for ABs
for x in ABMatchData.index:
    date = ABMatchData.loc[x,'Date']
    urlpart2 = str(re.sub(' ','+',date))
    FullURL = urlpart1+urlpart2+urlpart3

    #start scraping again with new URL
    time.sleep(10)  # pause between requests
    # Timeout: a stalled connection must not hang the whole run.
    # raise_for_status: an HTTP error page must not be silently recorded as 0 tries.
    response = requests.get(FullURL, timeout=30)
    response.raise_for_status()
    parser = BeautifulSoup(response.content, 'html.parser')
    AllTagText = [tag.text for tag in parser.find_all('td')]

    accumtries = 0
    for loc in trylocations:
        try:
            accumtries = accumtries + int(AllTagText[loc])
        except (IndexError, ValueError):
            # The cell is missing (fewer matches listed) or is not a number:
            # it contributes nothing to the total. A bare except here would
            # also have swallowed KeyboardInterrupt.
            continue
    ABMatchData.loc[x,'All Black tries in last 5 games'] = accumtries

accumtries = 0
    

#### Our first 4 columns should now be filled, along with Opposition and All Black tries in the last 5 games:

In [301]:
ABMatchData

#### Now we import debutant data and add it to our DF

In [41]:
#Import debutant data
# Loads ABDebutants.csv (relative path, so it is resolved against the working directory).
# The following cell reads its 'Opposition Debutants' and 'All Black Debutants' columns.
ABDebutantData = pd.read_csv('ABDebutants.csv')


In [43]:
#Add debutant data to MatchData DF
# Whole-column assignment from the imported frame, one column at a time.
# NOTE(review): assumes the CSV rows are in the same order as ABMatchData - confirm.
for debutant_column in ('Opposition Debutants', 'All Black Debutants'):
    ABMatchData[debutant_column] = ABDebutantData[debutant_column]

#### Our debutant data should now be filled 

In [248]:
ABMatchData

#### Prior to our first entry in the All Blacks match record, their last loss was on the 14th of June 2003. Let us account for this 8-game gap (keep in mind that this loss occurred during a period when the world rankings had not yet been introduced, which is why our data does not start from this loss):

In [231]:
#Account for loss that occurred before our first match date:
# Rows 0-4 receive the counts 8, 9, 10, 11, 12 in that order.
for x, lastloss in enumerate(range(8, 13)):
    ABMatchData.loc[x, 'Games since last loss'] = lastloss
# leave the counter one past the last value written
lastloss = 13
    


#### Currently our Result column is storing strings; let's convert them to ints in order to fill our Games since last loss column
    
    


In [232]:
#Convert str results to ints
# Cell-by-cell conversion: each 'Result' entry is replaced by int() of itself.
for row_label in ABMatchData.index:
    raw_result = ABMatchData.loc[row_label, 'Result']
    ABMatchData.loc[row_label, 'Result'] = int(raw_result)
        
   

In [233]:
#Now we fill in games since last loss based on the results column
# Row x is decided by the result of row x-1: a negative result restarts the count at 1,
# a non-negative one continues it.
lastloss = 1
for x in range(5, 205):
    previous_result = ABMatchData.loc[x - 1, 'Result']
    if previous_result < 0:
        lastloss = 1
    elif not previous_result >= 0:
        # neither comparison holds (e.g. a missing value): this row is left untouched
        continue
    ABMatchData.loc[x, 'Games since last loss'] = lastloss
    lastloss += 1
        

#### Let's just do some housekeeping: clear out the decimal points from our Games Since Last Loss column, then convert our date data from str to datetime

In [234]:
#remove decimal points in Games since last loss to ints        
# astype returns a new DataFrame, so the name is rebound to the converted copy.
# The cast raises if the column still holds missing values.
ABMatchData = ABMatchData.astype({'Games since last loss': int})

#Convert date column from str to datetime
# The earlier scraping cells pass 'Date' to re.sub, which needs the original strings,
# so those cells cannot be re-run after this conversion without rebuilding the column.
ABMatchData['Date'] = pd.to_datetime(ABMatchData['Date'])

In [225]:
ABMatchData

#### Now we import our rankings

In [330]:
# Loads ABRankingData.csv (relative path, so it is resolved against the working directory).
# The following cell reads its 'All Black Rating' and 'Opposition Rating' columns.
ABRankings = pd.read_csv('ABRankingData.csv')

In [331]:
# Copy both rating columns across from the imported rankings frame.
# NOTE(review): assumes the CSV rows are in the same order as ABMatchData - confirm.
for rating_column in ('All Black Rating', 'Opposition Rating'):
    ABMatchData[rating_column] = ABRankings[rating_column]

### Our ABMatchData is now complete

In [305]:
ABMatchData

# **Now we do the exact same thing, just for <font color = green>Ireland</font>**

In [268]:
# Result-page URLs for team=3 (the pages scraped into IREMatchData below).
# Page 1 has no "page=" field; pages 2-4 insert "page=N;" right after "orderby=date;".
_IRE_URL_HEAD = "http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;"
_IRE_URL_TAIL = "spanmax2=1+Dec+2018;spanmin2=13+Oct+2003;spanval2=span;team=3;template=results;type=team;view=match"

page1 = _IRE_URL_HEAD + _IRE_URL_TAIL
page2, page3, page4 = (
    _IRE_URL_HEAD + "page=" + str(page_number) + ";" + _IRE_URL_TAIL
    for page_number in range(2, 5)
)

# Index of the next DataFrame row to fill; each populate call returns the new value.
startpos = 0

In [269]:
# Column layout of the Ireland match table.
columns = ['Opposition Name','Date','Result','Location','Opposition Debutants','Ireland Debutants','Opposition tries in last 5 games','Ireland tries in last 5 games','Opposition Rating','Ireland Rating','Games since last loss']
# 172 rows (labels 0-171): the populate calls fill rows up to, but excluding, 172
# and the later loops iterate range(0,172) / range(3,172).
IREMatchData = pd.DataFrame(columns=columns,index=range(0,172))


In [95]:
# One tuple per results page: (url, exclusive end row, <b>-tag slice end, <td> position bound).
# startpos is threaded through: each call returns the row where the next page starts.
for _page_url, _end_row, _date_stop, _result_stop in (
    (page1, 50, 54, 700),
    (page2, 100, 54, 700),
    (page3, 150, 54, 700),
    (page4, 172, 26, 308),
):
    startpos = populate_first4col(_page_url, _end_row, startpos, _date_stop, _result_stop, IREMatchData)

#### Our first 4 columns for IREMatchData should now be filled

In [307]:
#IREMatchData

#### Now we insert tries in last 5 games, we start by cleaning our opposition name column and preparing our URLs 

In [100]:
#Clean Opposition Name column to make URL to scrape tries in last 5 games for opposition
# Every 'v ' occurrence is removed from the text of each row.
for row_label in IREMatchData.index:
    scraped_name = str(IREMatchData.loc[row_label, 'Opposition Name'])
    IREMatchData.loc[row_label, 'Opposition Name'] = re.sub('v ', '', scraped_name)

#Get list of Unique Opponents (sorted in place)
OppList = IREMatchData['Opposition Name'].unique()
OppList.sort()

#Make series with custom index for URL construction
# Each label is paired by position with the sorted name at the same position, and the
# scraping loop puts that label into the URL's "team=" field.
# NOTE(review): presumably these are the site's team ids; the list has to stay in step
# with the sorted opponent names - confirm whenever the opponent set changes.
index = [10, 6, 25, 1, 14, 9, 81, 20, 23, 82, 8, 121, 12, 57, 15, 2, 5, 11, 4]
OppSeries = pd.Series(OppList, index=index)

#Tries in Last 5 games URL parts
urlpart1 = 'http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;orderbyad=reverse;spanmax2='
urlpart2 = ''  # filled per match; format is: 17+Oct+2003
urlpart3 = ';spanmin2=1+Jan+2000;spanval2=span;team='
urlpart4 = ''  # filled per match; ints from OppSeries index
urlpart5 = ';template=results;type=team;view=match'
accumtries = 0

In [104]:
#Make complete URL to scrape try data from previous 5 games for opposition
# <td> positions whose text is summed as tries.
# NOTE(review): presumably the tries cells of the five matches listed after the first
# row of the reverse-ordered results table - confirm against the page layout.
trylocations = [28, 42, 56, 70, 84]
for x in IREMatchData.index:
    date = IREMatchData.loc[x,'Date']
    urlpart2 = str(re.sub(' ','+',date))
    oppname = IREMatchData.loc[x,'Opposition Name']
    # the label stored against this opponent's name is the URL's "team=" value
    urlpart4 = str(OppSeries[OppSeries==oppname].index[0])
    FullURL = urlpart1+urlpart2+urlpart3+urlpart4+urlpart5

    #start scraping again with new URL
    time.sleep(10)  # pause between requests
    # Timeout: a stalled connection must not hang the whole run.
    # raise_for_status: an HTTP error page must not be silently recorded as 0 tries.
    response = requests.get(FullURL, timeout=30)
    response.raise_for_status()
    parser = BeautifulSoup(response.content, 'html.parser')
    AllTagText = [tag.text for tag in parser.find_all('td')]

    accumtries = 0
    for loc in trylocations:
        try:
            accumtries = accumtries + int(AllTagText[loc])
        except (IndexError, ValueError):
            # The cell is missing (fewer matches listed) or is not a number:
            # it contributes nothing to the total.
            continue
    IREMatchData.loc[x,'Opposition tries in last 5 games'] = accumtries

accumtries = 0
        

#### Opposition tries in last 5 games should now be filled

In [250]:
IREMatchData

#### Now we fill in Tries in the Last 5 games for Ireland

In [291]:
#Insert tries in last 5 games for Ireland

#Tries in Last 5 games URL parts for Ireland
urlpart1 = 'http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;orderby=date;orderbyad=reverse;spanmax2='
urlpart2 = ''#format is: 17+Oct+2003
urlpart3 = ';spanmin2=1+Jan+2000;spanval2=span;team=3;template=results;type=team;view=match'
accumtries = 0

# <td> positions whose text is summed as tries.
# NOTE(review): presumably the tries cells of the five matches listed after the first
# row of the reverse-ordered results table - confirm against the page layout.
trylocations = [28, 42, 56, 70, 84]

#Make complete URL to scrape try data from previous 5 games for Ireland
for x in range(0,172):
    date = IREMatchData.loc[x,'Date']
    # NOTE(review): re.sub needs a str here. It raises
    # "TypeError: expected string or bytes-like object" when 'Date' holds anything else,
    # e.g. presumably an unfilled row or a re-run after the datetime conversion - confirm.
    urlpart2 = str(re.sub(' ','+',date))
    FullURL = urlpart1+urlpart2+urlpart3

    #start scraping again with new URL
    time.sleep(10)  # pause between requests
    # Timeout: a stalled connection must not hang the whole run.
    # raise_for_status: an HTTP error page must not be silently recorded as 0 tries.
    response = requests.get(FullURL, timeout=30)
    response.raise_for_status()
    parser = BeautifulSoup(response.content, 'html.parser')
    AllTagText = [tag.text for tag in parser.find_all('td')]

    accumtries = 0
    for loc in trylocations:
        try:
            accumtries = accumtries + int(AllTagText[loc])
        except (IndexError, ValueError):
            # The cell is missing (fewer matches listed) or is not a number:
            # it contributes nothing to the total. A bare except here would
            # also have swallowed KeyboardInterrupt.
            continue
    IREMatchData.loc[x,'Ireland tries in last 5 games'] = accumtries

accumtries = 0

TypeError: expected string or bytes-like object

#### Now, Ireland Tries in the Last 5 games should also be filled:

In [278]:
IREMatchData

#### Similar to the All Blacks, the Irish also suffered a loss before the first match in our data. Let us account for this

In [281]:
#Account for loss that occurred before our first match date:
# Rows 0-2 receive the counts 7, 8, 9 in that order.
for x, lastloss in enumerate(range(7, 10)):
    IREMatchData.loc[x, 'Games since last loss'] = lastloss
# leave the counter one past the last value written
lastloss = 10

In [282]:
#Convert str results to ints
# Cell-by-cell conversion: each 'Result' entry is replaced by int() of itself.
for row_label in IREMatchData.index:
    raw_result = IREMatchData.loc[row_label, 'Result']
    IREMatchData.loc[row_label, 'Result'] = int(raw_result)
        

In [283]:
#Fill in rest of last loss data
# Row x is decided by the result of row x-1: a negative result restarts the count at 1,
# a non-negative one continues it.
lastloss = 1
for x in range(3, 172):
    previous_result = IREMatchData.loc[x - 1, 'Result']
    if previous_result < 0:
        lastloss = 1
    elif not previous_result >= 0:
        # neither comparison holds (e.g. a missing value): this row is left untouched
        continue
    IREMatchData.loc[x, 'Games since last loss'] = lastloss
    lastloss += 1

#### Let's do some quick housekeeping before going any further

In [294]:
#remove decimal points         
# astype returns a new DataFrame each time, so the name is rebound to the converted copy.
# Either cast raises if its column still holds missing values.
IREMatchData = IREMatchData.astype({'Games since last loss': int})
IREMatchData = IREMatchData.astype({'Ireland tries in last 5 games': int})


#Convert date column from str to datetime
# The earlier scraping cells pass 'Date' to re.sub, which needs the original strings,
# so those cells cannot be re-run after this conversion without rebuilding the column.
IREMatchData['Date'] = pd.to_datetime(IREMatchData['Date'])

#### Now we import our Debutant data

In [285]:
#import debutant data and assign to columns
# Loads IREDebutants.csv (relative path, so it is resolved against the working directory)
# and copies its two debutant columns into IREMatchData by whole-column assignment.
# NOTE(review): assumes the CSV rows are in the same order as IREMatchData - confirm.
IREDebutantData = pd.read_csv('IREDebutants.csv')
IREMatchData['Opposition Debutants'] = IREDebutantData['Opposition Debutants']
IREMatchData['Ireland Debutants'] = IREDebutantData['Ireland Debutants']

In [297]:
IREMatchData

#### Now we import our rating data

In [296]:
# Loads IRERankingData.csv (relative path, so it is resolved against the working directory)
# and copies its two rating columns into IREMatchData by whole-column assignment.
# NOTE(review): assumes the CSV rows are in the same order as IREMatchData - confirm.
IRERankingData = pd.read_csv('IRERankingData.csv')
IREMatchData['Opposition Rating'] = IRERankingData['Opposition Rating']
IREMatchData['Ireland Rating'] = IRERankingData['Ireland Rating']

# Now our <font color = green>Irish</font> DF is complete