In [8]:
import numpy as np
import pandas as pd

import csv
import regex as re

from pathlib import Path

import requests
from bs4 import BeautifulSoup

In [9]:
directory = Path('./')

new_data_directory = directory / 'additional_data'

Path(new_data_directory).mkdir(exist_ok=True)

# Football Statistics

In [12]:
# url containing football ground data
url = 'https://www.doogal.co.uk/FootballStadiums.php'
r = requests.get(url)

# instantiate BeautifulSoup object
soup = BeautifulSoup(r.content, 'html.parser')

# find table
table = soup.find('table', 
                   attrs = {'class': 'sortable stadiumsTable table table-striped table-hover'})
data = []

body = table.find('tbody')
tds = body.find_all('td')
for td in tds:
    data.append(td.get_text())

# convert to numpy.ndarray and reshape
data = np.array(data).reshape(-1,5)
# # convert back to standard python list
data = data.tolist()

# add headers
data.insert(0, ['stadium_name', 'team',
                'capacity', 'latitude', 'longitude'])

with open(new_data_directory / 'football_stadiums.csv', 'w') as f:
    write = csv.writer(f)
    for row in data:
        write.writerow(row)
        
stadiums = pd.read_csv(new_data_directory / 'football_stadiums.csv')

stadiums = stadiums[['stadium_name', 'latitude', 'longitude']]

stadiums

Unnamed: 0,stadium_name,latitude,longitude
0,Abbey Stadium,52.2128,0.154298
1,Adams Park,51.6306,-0.800299
2,Alexandra Stadium,53.0875,-2.435690
3,Almondvale Stadium,55.8864,-3.522070
4,Amex Stadium,50.8609,-0.080140
...,...,...,...
160,Wembley,51.5559,-0.279543
161,Weston Homes Community Stadium,51.9234,0.897861
162,Whaddon Road,51.9062,-2.060210
163,Wilgar Park,54.5995,-5.878240


In [14]:
fixtures = pd.read_csv(new_data_directory / 'fixtures2019.csv')

fixtures.sort_values('attendance', ascending=False)

fixtures['datetime'] = pd.to_datetime(fixtures['date_GMT'])

football = fixtures[['datetime', 'attendance',
                'home_team_name', 'away_team_name',
                'stadium_name']]

In [15]:
print(f'fixtures dataframe stadium names: \n\n{fixtures["stadium_name"].unique()} \n\n\n')
print(f'stadiums dataframe stadium names: \n\n{stadiums["stadium_name"].unique()}')

fixtures dataframe stadium names: 

['Old Trafford (Manchester)' "St. James' Park (Newcastle upon Tyne)"
 'Vitality Stadium (Bournemouth- Dorset)' 'Craven Cottage (London)'
 "John Smith's Stadium (Huddersfield- West Yorkshire)"
 'Vicarage Road (Watford)'
 'Molineux Stadium (Wolverhampton- West Midlands)' 'Anfield (Liverpool)'
 "St. Mary's Stadium (Southampton- Hampshire)" 'Emirates Stadium (London)'
 'Cardiff City Stadium (Cardiff (Caerdydd))' 'Goodison Park (Liverpool)'
 'King Power Stadium (Leicester- Leicestershire)'
 'Wembley Stadium (London)' 'London Stadium (London)'
 'Stamford Bridge (London)' 'Turf Moor (Burnley)'
 'Etihad Stadium (Manchester)'
 'The American Express Community Stadium (Falmer- East Sussex)'
 'Selhurst Park (London)' 'Tottenham Hotspur Stadium (London)'] 



stadiums dataframe stadium names: 

['Abbey Stadium' 'Adams Park' 'Alexandra Stadium' 'Almondvale Stadium'
 'Amex Stadium' 'Anfield' 'Ashton Gate' 'B2net Stadium'
 'Ballymena Showgrounds' 'Balmoor' 'Bangor F

In [16]:
def regex_fn(x):
    return re.sub('\([^)]*\)*','', str(x))

football = football[football['datetime'].dt.year == 2019]

football.loc[:, 'stadium_name'] = football.loc[:, 'stadium_name'].apply(regex_fn)
football.loc[:, 'stadium_name'] = football.loc[:, 'stadium_name'].str.replace('Stadium', '')
football.loc[:, 'stadium_name'] = football.loc[:, 'stadium_name'].str.strip()

stadiums.loc[:, 'stadium_name'] = stadiums.loc[:, 'stadium_name'].str.replace('Stadium', '').str.strip()



football['stadium_name'].unique()

array(['Goodison Park', 'Emirates', 'Cardiff City', 'Stamford Bridge',
       "John Smith's", 'London', 'Molineux', 'Vitality',
       "St. James' Park", 'Etihad', 'The American Express Community',
       'Turf Moor', 'Selhurst Park', 'King Power', 'Wembley', 'Anfield',
       'Old Trafford', "St. Mary's", 'Vicarage Road', 'Craven Cottage',
       'Tottenham Hotspur'], dtype=object)

In [17]:
# determining most important stadiums with slightly different
# string values
fs = set(football['stadium_name'])
ss = set(stadiums['stadium_name'])

fs | ss - (fs & ss)

{'Abbey',
 'Adams Park',
 'Alexandra',
 'Almondvale',
 'Amex',
 'Anfield',
 'Ashton Gate',
 'B2net',
 'Ballymena Showgrounds',
 'Balmoor',
 'Bangor Fuels Arena',
 'Bayview',
 'Belfast Loughshore Hotel Arena',
 'Bescot',
 'Bloomfield Road',
 'Bootham Crescent',
 'Borough Briggs',
 'Boundary Park',
 'Bramall Lane',
 'Brandywell',
 'Breda Park',
 'Brisbane Road',
 'Britannia',
 'Broadfield',
 'Broadwood',
 'Brunton Park',
 'Cappielow Park',
 'Cardiff City',
 'Carrow Road',
 'Celtic Park',
 'Central Park',
 'Cliftonhill',
 'Coleraine Showgrounds',
 'Craven Cottage',
 'Crown Ground',
 'DW',
 'Darragh Park',
 'Deepdale',
 'Dens Park',
 'Dixon Park',
 'Dumbarton Football',
 'East End Park',
 'Easter Road',
 'Edgar Street',
 'Elland Road',
 'Emirates',
 'Etihad',
 'Ewood Park',
 'Falkirk',
 'Ferney Park',
 'Field Mill',
 'Fir Park',
 'Firhill',
 'Forthbank',
 'Fratton Park',
 'Galabank',
 'Gayfield Park',
 'Gigg Lane',
 'Glanford Park',
 'Glebe Park',
 'Globe Arena',
 'Goodison Park',
 'Griffi

In [36]:
football = football.sort_values('attendance', ascending=False)
football = football.sort_values('datetime')

football = football.merge(stadiums)

def to_day_of_year(datetime_val):
    return datetime_val.dayofyear

def day_of_week(datetime_val):
    """Convert day of week values from Monday=0, Sunday=6,
       to Sunday=1, Monday=6"""
    return (datetime_val.dayofweek + 1) % 7 + 1

football['day_of_year'] = football['datetime'].apply(to_day_of_year)
football['day_of_week'] = football['datetime'].apply(day_of_week)

football

Unnamed: 0,datetime,attendance,home_team_name,away_team_name,stadium_name,latitude,longitude,day_of_year,day_of_week
0,2019-01-01 12:30:00,39052,Everton,Leicester City,Goodison Park,53.4387,-2.966190,1,3
1,2019-01-13 14:15:00,38113,Everton,AFC Bournemouth,Goodison Park,53.4387,-2.966190,13,1
2,2019-02-02 15:00:00,39380,Everton,Wolverhampton Wanderers,Goodison Park,53.4387,-2.966190,33,7
3,2019-02-06 19:45:00,39322,Everton,Manchester City,Goodison Park,53.4387,-2.966190,37,4
4,2019-03-03 16:15:00,39335,Everton,Liverpool,Goodison Park,53.4387,-2.966190,62,1
...,...,...,...,...,...,...,...,...,...
121,2019-04-03 18:45:00,59215,Tottenham Hotspur,Crystal Palace,Tottenham Hotspur,51.6044,-0.066389,93,4
122,2019-04-13 11:30:00,58308,Tottenham Hotspur,Huddersfield Town,Tottenham Hotspur,51.6044,-0.066389,103,7
123,2019-04-23 18:45:00,56251,Tottenham Hotspur,Brighton & Hove Albion,Tottenham Hotspur,51.6044,-0.066389,113,3
124,2019-04-27 11:30:00,60043,Tottenham Hotspur,West Ham United,Tottenham Hotspur,51.6044,-0.066389,117,7


In [37]:
football.to_csv(new_data_directory / 'football_stats.csv',
               index=False)