In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime
import re

In [2]:
def strip_tags(soup, tags):
    for tag in tags:
        for match in soup.findAll(tag):
            match.replaceWithChildren()

soup = BeautifulSoup("""
    <td>
    <div align="center">W. Force</div>
    </td>
    <td>
    <div align="center">5</div>
    </td>
    <td>
    <div align="center">18/2/06</div>
    </td>
    """, 'html.parser' )          
strip_tags(soup, ["div", "invalid"])
print(soup)


<td>
W. Force
</td>
<td>
5
</td>
<td>
18/2/06
</td>



In [3]:
 ##### Read results from www.superxv.com into DataFrame
teams_1 = []
teams_2 = []
scores_1 = []
scores_2 = []
dates = []
for year in [str(y) for y in range(2006, 2018)]:
    url = 'http://www.superxv.com/results/{year}-super-rugby-results/'.format(year=year)
    page = requests.get(url)
    if page.status_code != 200:
        print('Error downloading {url}'.format(url=url))
    else:
        print('Successfully downloaded {url}'.format(url=url))
        for table in BeautifulSoup(page.content, 'html.parser').find_all('table'):
            strip_tags(table, ["div"])
            for row in table.find_all('tr')[1:]:
                entries = row.find_all('td')
                if entries[0].text.strip() not in ['Bye', 'Not playing' ,'']: # and entries[1] = '':
                    teams_1.append(entries[0].text.strip().replace('\n',' '))
                    teams_2.append(entries[3].text.strip().replace('\n',' '))
                    scores_1.append(entries[1].text.strip())
                    scores_2.append(entries[4].text.strip())
                    dates.append(entries[5].text.strip())
df = pd.DataFrame({
        'date':dates,
        'team_1':teams_1,
        'team_2':teams_2,
        'score_1':scores_1,
        'score_2':scores_2})

Successfully downloaded http://www.superxv.com/results/2006-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2007-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2008-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2009-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2010-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2011-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2012-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2013-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2014-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2015-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2016-super-rugby-results/
Successfully downloaded http://www.superxv.com/results/2017-super-rugby-results/


In [4]:
########### Clean and format dates
df.date[df.date == "29/3/088"] = "29/03/2008"
df.date[df.date == "00/05/09"] = "09/05/09"
df.date[df.date == "0907/16"] = "09/07/16"
df.date = pd.to_datetime(df.date)
df = df.drop_duplicates() # 2007-05-05	10	36	Stormers	Sharks is duplicated
df['year'] = df.date.dt.year
df.groupby('year').count()
df

Unnamed: 0,date,score_1,score_2,team_1,team_2,year
0,2006-10-02,19,37,Blues,Hurricanes,2006
1,2006-10-02,10,25,Force,Brumbies,2006
2,2006-10-02,18,30,Cheetahs,Bulls,2006
3,2006-11-02,38,15,Crusaders,Highlanders,2006
4,2006-11-02,12,16,Reds,Waratahs,2006
5,2006-11-02,12,23,Cats,Stormers,2006
6,2006-11-02,30,21,Sharks,Chiefs,2006
7,2006-02-17,25,13,Highlanders,Blues,2006
8,2006-02-17,21,16,Cats,Chiefs,2006
9,2006-02-17,21,27,Bulls,Brumbies,2006


In [7]:
#### Clean and format team names
pattern = "([A-Z][a-z]*)\W*$"
df.team_1 = df.team_1.str.extract(pattern)
df.team_2 = df.team_2.str.extract(pattern)

teams = pd.concat([df.team_1, df.team_2])
teams.groupby(teams).count().sort_values(ascending=False)

  app.launch_new_instance()


Crusaders      178
Sharks         171
Chiefs         171
Hurricanes     170
Bulls          170
Brumbies       169
Waratahs       169
Highlanders    167
Stormers       167
Reds           165
Blues          164
Cheetahs       162
Force          161
Lions          135
Rebels          96
Kings           32
Jaguares        16
Sunwolves       16
Cats            13
dtype: int64