# Clustering

In [11]:
from pathlib import Path
import numpy as np
import pandas as pd
import datetime

import scipy.stats as stats

In [12]:
# folder for additional (derived) data, relative to the current working
# directory; created on first run and reused afterwards
additional_directory = Path('./additional_data')
additional_directory.mkdir(exist_ok=True)

# folder holding the original .csv data
directory = Path('./data/')

# list the .csv files available for the project
for file in directory.glob('*.csv'):
    print(file)

# read the raw .csv files into DataFrames; Accident_Index is read as str
# (presumably so it matches the accident index in `accidents` - TODO confirm)
vehicles = pd.read_csv(
    directory / 'vehicles2019.csv',
    dtype={'Accident_Index': str},
)
casualties = pd.read_csv(
    directory / 'casualties2019.csv',
    dtype={'Accident_Index': str},
)

# cleaned accidents DataFrame, loaded from a pickle in the working directory
# NOTE(review): unpickling executes code - only load a file you trust
accidents = pd.read_pickle('accidents_cleaned.pkl')

# convert column names to lowercase for ease of indexing
def lower_columns(df):
    """
    Definition:
        rename every column of `df` to its lower-case form.

    The DataFrame is modified in place and nothing is returned.
    Column names are lowered with `str.lower`, so they must be strings.
    """
    lowered = [str.lower(column) for column in df.columns]
    df.columns = lowered
    
# lower-case the column names of both raw tables (modified in place)
for raw_table in (vehicles, casualties):
    lower_columns(raw_table)

# preview the first five rows of the cleaned accidents table
accidents.head(5)

data/vehicles2019.csv
data/accidents2019.csv
data/casualties2019.csv


Unnamed: 0,accident_index,longitude,latitude,police_force,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,local_authority_(district),local_authority_(highway),...,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,lsoa_of_accident_location,district,converted_date,converted_time,datetime,decimal_time,day_of_year
0,2019010152270,-0.127949,51.436208,1,3,2,1,3,9,E09000022,...,0,1,3,E01003117,lambeth,2019-01-15,21:45:00,2019-01-15 21:45:00,21.75,15
1,2019010157567,-0.123427,51.44931,1,3,2,2,3,9,E09000022,...,0,1,1,E01003023,lambeth,2019-01-15,08:42:00,2019-01-15 08:42:00,8.7,15
2,2019010157732,-0.145106,51.461256,1,2,1,2,3,9,E09000022,...,0,1,1,E01003026,lambeth,2019-01-15,07:08:00,2019-01-15 07:08:00,7.133333,15
3,2019010157896,-0.240823,51.533125,1,3,2,1,3,28,E09000005,...,0,1,1,E01000535,brent,2019-01-15,21:05:00,2019-01-15 21:05:00,21.083333,15
4,2019010157795,0.161736,51.550272,1,3,1,1,3,16,E09000002,...,0,1,1,E01000035,barking and dagenham,2019-01-15,16:10:00,2019-01-15 16:10:00,16.166667,15


In [13]:
def group_by_day(df, feature='accident_index', is_categorical=False, function='sum'):
    """
    Aggregate one column of `df` for each value of `day_of_week`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `day_of_week` column and the column named by `feature`.
        It is not modified.
    feature : str
        Name of the column to aggregate.
    is_categorical : bool
        When True, `feature` is one-hot encoded with `pd.get_dummies` and the
        indicator columns (one per distinct value, labelled by that value) are
        appended before grouping. The original `feature` column is kept too.
    function : {'sum', 'count'}
        Aggregation applied to every group.

    Returns
    -------
    pandas.DataFrame
        Indexed by `day_of_week`, one row per group.

    Raises
    ------
    ValueError
        If `function` is neither 'sum' nor 'count'.
    """
    # fail loudly instead of silently returning None for an unknown aggregation
    if function not in ('sum', 'count'):
        raise ValueError(
            "function must be 'sum' or 'count', got {!r}".format(function)
        )

    new_df = df.loc[:, ['day_of_week', feature]]
    if is_categorical:
        one_hot = pd.get_dummies(new_df.loc[:, feature])
        new_df = pd.concat([new_df, one_hot], axis=1)

    grouped = new_df.groupby('day_of_week')
    if function == 'sum':
        return grouped.sum()
    return grouped.count()

def group_by_time(df, freq, feature='accident_index', is_categorical=False, function='sum'):
    """
    Aggregate one column of `df` in time-of-day bins of width `freq`.

    Every value of `converted_time` is converted to a string, prefixed with
    the fixed date 2018-01-01 and parsed with `pd.to_datetime`, so that all
    rows fall on the same day and can be binned with `pd.Grouper`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `converted_time` column and the column named by
        `feature`. It is not modified.
    freq : str
        Bin width passed to `pd.Grouper(freq=...)`.
    feature : str
        Name of the column to aggregate.
    is_categorical : bool
        When True, `feature` is one-hot encoded with `pd.get_dummies` and the
        indicator columns (one per distinct value, labelled by that value) are
        appended before grouping. The original `feature` column is kept too.
    function : {'sum', 'count'}
        Aggregation applied to every bin.

    Returns
    -------
    pandas.DataFrame
        Indexed by the bin timestamps on 2018-01-01.

    Raises
    ------
    ValueError
        If `function` is neither 'sum' nor 'count'.
    """
    # fail loudly instead of silently returning None for an unknown aggregation
    if function not in ('sum', 'count'):
        raise ValueError(
            "function must be 'sum' or 'count', got {!r}".format(function)
        )

    # explicit copy: the column assignment below must never write into, or
    # warn about, a slice of the caller's frame
    new_df = df.loc[:, ['converted_time', feature]].copy()

    # placeholder date that turns the time-of-day values into full timestamps
    date = '2018-01-01'
    new_df['converted_time'] = pd.to_datetime(date + " " + new_df.converted_time.astype(str))

    if is_categorical:
        one_hot = pd.get_dummies(new_df.loc[:, feature])
        new_df = pd.concat([new_df, one_hot], axis=1)

    grouped = new_df.groupby(pd.Grouper(key='converted_time', freq=freq))
    if function == 'sum':
        return grouped.sum()
    return grouped.count()

In [14]:
# number of accidents in each hourly bin, plus the z-score of every bin's
# count relative to the other bins
accidents_by_hour = (
    group_by_time(accidents, freq='H', function='count')
    .assign(zscore=lambda counts: stats.zscore(counts['accident_index']))
)

accidents_by_hour

Unnamed: 0_level_0,accident_index,zscore
converted_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,1211,-1.01482
2018-01-01 01:00:00,850,-1.213492
2018-01-01 02:00:00,641,-1.328512
2018-01-01 03:00:00,534,-1.387398
2018-01-01 04:00:00,514,-1.398405
2018-01-01 05:00:00,704,-1.293841
2018-01-01 06:00:00,1382,-0.920713
2018-01-01 07:00:00,3235,0.099061
2018-01-01 08:00:00,4966,1.051693
2018-01-01 09:00:00,3311,0.140886


In [15]:
# number of accidents for each day_of_week value, plus the z-score of every
# day's count relative to the other days
accidents_by_day = (
    group_by_day(accidents, function='count')
    .assign(zscore=lambda counts: stats.zscore(counts['accident_index']))
)

accidents_by_day

Unnamed: 0_level_0,accident_index,zscore
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8075,-2.070046
2,10326,-0.127937
3,10939,0.400944
4,11085,0.52691
5,11252,0.670993
6,11868,1.202463
7,9775,-0.603327


In [16]:
# join every vehicle record to its accident via the shared accident_index
by_vehicle = accidents.merge(vehicles, on='accident_index')

# motorcycle accidents by hour: one-hot encode vehicle_type, sum per hourly
# bin, keep the indicator columns labelled 2 through 5 (the vehicle_type codes
# this notebook treats as motorcycles) and add them up row-wise
motorcycle_by_hour = (
    group_by_time(by_vehicle, freq='H', feature='vehicle_type', is_categorical=True)
    .loc[:, 2:5]
    .sum(axis=1)
    .to_frame()
)
# the unnamed totals column is labelled 0
motorcycle_by_hour['zscore'] = stats.zscore(motorcycle_by_hour[0])

motorcycle_by_hour

Unnamed: 0_level_0,0,zscore
converted_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,137.0,-1.053351
2018-01-01 01:00:00,74.0,-1.267708
2018-01-01 02:00:00,62.0,-1.308538
2018-01-01 03:00:00,30.0,-1.417417
2018-01-01 04:00:00,44.0,-1.369782
2018-01-01 05:00:00,80.0,-1.247293
2018-01-01 06:00:00,204.0,-0.825385
2018-01-01 07:00:00,504.0,0.195359
2018-01-01 08:00:00,593.0,0.49818
2018-01-01 09:00:00,380.0,-0.226549


In [17]:
# motorcycle accidents by day of the week: one-hot encode vehicle_type, sum
# per day_of_week, keep the indicator columns labelled 2 through 5 (the
# vehicle_type codes this notebook treats as motorcycles) and add them up
motorcycle_by_day = (
    group_by_day(by_vehicle, feature='vehicle_type', is_categorical=True)
    .loc[:, 2:5]
    .sum(axis=1)
    .to_frame()
)
# the unnamed totals column is labelled 0
motorcycle_by_day['zscore'] = stats.zscore(motorcycle_by_day[0])

motorcycle_by_day

Unnamed: 0_level_0,0,zscore
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1316.0,-1.455754
2,1425.0,-0.718211
3,1563.0,0.21556
4,1592.0,0.411787
5,1620.0,0.601248
6,1792.0,1.765078
7,1410.0,-0.819708
