# Hypothesis: 

## More accidents at football stadiums on days of football matches

In [16]:
from pathlib import Path
import numpy as np
import pandas as pd

import scipy.stats as stats

In [28]:
# creating Path object for current working directory
directory = Path('./')
# creating Path object for additional data directory
additional_directory = directory / 'additional_data'
# create new directory for additional data
Path(additional_directory).mkdir(exist_ok=True)

# defining the directory to original data
directory = Path('./data/')
additional_directory = Path('./additional_data')

# list the .csv files for the project
for file in directory.glob('*.csv'):
    print(file)
    
# reading in .csv files to dataframes
vehicles = pd.read_csv(directory / 'vehicles2019.csv', dtype={'Accident_Index': str})
casualties = pd.read_csv(directory / 'casualties2019.csv', dtype={'Accident_Index': str})
# cleaned accidents DataFrame
accidents = pd.read_pickle('accidents_cleaned.pkl')

# convert column names to lowercase for ease of indexing
def lower_columns(df):
    """
    Defintion:
        convert column names to lower case
    """
    df.columns = map(str.lower, df.columns)
    
# converting all column names to lower case
lower_columns(vehicles)
lower_columns(casualties)

accidents.head(5)

data/vehicles2019.csv
data/accidents2019.csv
data/casualties2019.csv


Unnamed: 0,accident_index,longitude,latitude,police_force,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,local_authority_(district),local_authority_(highway),...,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,lsoa_of_accident_location,district,converted_date,converted_time,datetime,decimal_time,day_of_year
0,2019010152270,-0.127949,51.436208,1,3,2,1,3,9,E09000022,...,0,1,3,E01003117,lambeth,2019-01-15,21:45:00,2019-01-15 21:45:00,21.75,15
1,2019010157567,-0.123427,51.44931,1,3,2,2,3,9,E09000022,...,0,1,1,E01003023,lambeth,2019-01-15,08:42:00,2019-01-15 08:42:00,8.7,15
2,2019010157732,-0.145106,51.461256,1,2,1,2,3,9,E09000022,...,0,1,1,E01003026,lambeth,2019-01-15,07:08:00,2019-01-15 07:08:00,7.133333,15
3,2019010157896,-0.240823,51.533125,1,3,2,1,3,28,E09000005,...,0,1,1,E01000535,brent,2019-01-15,21:05:00,2019-01-15 21:05:00,21.083333,15
4,2019010157795,0.161736,51.550272,1,3,1,1,3,16,E09000002,...,0,1,1,E01000035,barking and dagenham,2019-01-15,16:10:00,2019-01-15 16:10:00,16.166667,15


In [20]:
def sphere_distance(s_lat, s_lng, e_lat, e_lng):
    R = 6373.0
    
    s_lat = s_lat*np.pi/180
    s_lng = np.deg2rad(s_lng)
    e_lat = np.deg2rad(e_lat)
    e_lng = np.deg2rad(e_lng)
    
    d = np.sin((e_lat - s_lat)/2)**2 + np.cos(s_lat)*np.cos(e_lat) * np.sin((e_lng - s_lng)/2)**2
    
    return 2 * R * np.arcsin(np.sqrt(d))

### Test case:

First, a test case shall be run for a football match at Old Trafford football stadium on 24/02/2019

In [21]:
# coordinates for Old Trafford football stadium in Manchester
manc = [53.457831502, -2.288165514]

In [22]:
acc_manc = accidents

# create feature of the distance (in km) of an accident to Old Trafford
acc_manc['dist_from_manc'] = sphere_distance(manc[0], manc[1], acc_manc['latitude'], acc_manc['longitude'])

# filter for those accidents within 5km radius
distance_mask = acc_manc.dist_from_manc < 5
acc_manc = acc_manc[distance_mask]
# filter for Sunday
sunday_mask = acc_manc['day_of_week'] == 1
sunday_manc = acc_manc[sunday_mask]

# group by day and count number of accidents
sundays = sunday_manc.groupby('converted_date')['accident_index'].count()

zscores = stats.zscore(sundays)
zscores['2019-02-24']

sundays.mean()

1.7692307692307692

The value of accidents on the Sunday of the football match is 1.3 standard deviations away from the average. This deserves further investigation.

In [24]:
football = pd.read_csv('additional_data/football_stats.csv')
football['datetime'] = pd.to_datetime(football['datetime'])
football['converted_date'] = football['datetime'].dt.date

acc_i = accidents.copy()
football = football.sort_values('day_of_year')
acc_i = acc_i.drop('datetime', axis=1)
football = football.drop('datetime', axis=1)

In [25]:
zscores_list = []
for i in range(football.shape[0]):
    
    coordinates = [football.loc[i, 'latitude'], football.loc[i,'longitude']]
    football_day = football.loc[i, 'day_of_year']
    football_day_of_week = football.loc[i, 'day_of_week']
    football_stadium = football.loc[i, 'stadium_name']
    
    acc_i = accidents.copy()
    
    # add distance from stadium as a feature
    acc_i['dist_from_stadium'] = sphere_distance(coordinates[0], coordinates[1],
                                                 acc_i['latitude'], acc_i['longitude'])

    # filter for those accidents within 5km radius of the stadium
    distance_mask = acc_i['dist_from_stadium'] < 10
    # filter for that day of the week
    day_of_week_mask = acc_i['day_of_week'] == football_day_of_week

    final = acc_i[distance_mask & day_of_week_mask]
    

    final = final.groupby('day_of_year')['accident_index'].count()
    
    zscores = stats.zscore(final)
    mean = final.mean()
    
    if football_day in zscores.index:
        zscore = zscores[football_day]
        accidents_on_day = final[football_day]
    else:
        zscore = 0
        accidents_on_day = 0
    
    info = {
        'Day of match': football_day,
        'Stadium': football_stadium,
        'Accidents on day of match': accidents_on_day,
        'Mean # Accidents in area': mean,
        'z_score': zscore
    }
    
        
    zscores_list.append(info)

In [26]:
a = pd.DataFrame(zscores_list)
a

Unnamed: 0,Day of match,Stadium,Accidents on day of match,Mean # Accidents in area,z_score
0,1,Goodison Park,3,4.264151,-0.670671
1,13,Goodison Park,4,3.080000,0.665062
2,33,Goodison Park,1,3.230769,-1.274802
3,37,Goodison Park,3,4.211538,-0.676716
4,62,Goodison Park,3,3.080000,-0.057831
...,...,...,...,...,...
121,93,Tottenham Hotspur,18,17.538462,0.104265
122,103,Tottenham Hotspur,15,17.038462,-0.411199
123,113,Tottenham Hotspur,19,16.773585,0.514289
124,117,Tottenham Hotspur,14,17.038462,-0.612919


In [27]:
a.describe()

Unnamed: 0,Day of match,Accidents on day of match,Mean # Accidents in area,z_score
count,126.0,126.0,126.0,126.0
mean,70.31746,7.420635,7.98243,-0.057616
std,40.084241,7.609839,7.21575,0.817756
min,1.0,0.0,1.0,-2.123445
25%,33.25,2.0,2.069183,-0.658984
50%,68.0,4.0,4.239919,0.0
75%,108.75,14.0,14.548077,0.359493
max,132.0,29.0,24.653846,2.885372
