# Clustering

## Packages

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import datetime

import scipy.stats as stats
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

## Directory creation and file management

In [2]:
# --- project paths ----------------------------------------------------------
# the notebook's working directory; the project root is taken to be its parent
cwd = Path('./')
root = cwd.resolve().parent

# locations of the original data, the additional (derived) data and the plots
data_dir = root / 'data'
additional_data_dir = root / 'additional_data'
plots_dir = root / 'plots'
# second name bound to the same additional-data location
additional_directory = root / 'additional_data'

# create the additional-data and plots directories (no error if they already exist)
additional_data_dir.mkdir(exist_ok=True)
plots_dir.mkdir(exist_ok=True)

# --- inputs -----------------------------------------------------------------
# list the .csv files for the project
for file in data_dir.glob('*.csv'):
    print(file)

# read the .csv files into DataFrames; Accident_Index is read as str
vehicles = pd.read_csv(data_dir / 'vehicles2019.csv', dtype={'Accident_Index': str})
casualties = pd.read_csv(data_dir / 'casualties2019.csv', dtype={'Accident_Index': str})
# cleaned accidents DataFrame
# NOTE(review): unpickling can execute arbitrary code - only load .pkl files
# that were produced by this project.
accidents = pd.read_pickle(additional_data_dir / 'accidents_cleaned.pkl')

# lower-case column names make indexing easier
def lower_columns(df):
    """
    Rename every column of ``df`` to its lower-case form.

    The DataFrame is modified in place and nothing is returned.
    """
    df.columns = [str.lower(name) for name in df.columns]
    
# lower-case the column names of both raw tables (modifies them in place)
for raw_table in (vehicles, casualties):
    lower_columns(raw_table)

accidents.head(5)

/home/jake/Documents/TrafficAccidents/data/vehicles2019.csv
/home/jake/Documents/TrafficAccidents/data/accidents2019.csv
/home/jake/Documents/TrafficAccidents/data/casualties2019.csv


Unnamed: 0,accident_index,longitude,latitude,police_force,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,local_authority_(district),local_authority_(highway),...,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,lsoa_of_accident_location,district,converted_date,converted_time,datetime,decimal_time,day_of_year
0,2019010152270,-0.127949,51.436208,1,3,2,1,3,9,E09000022,...,0,1,3,E01003117,lambeth,2019-01-15,21:45:00,2019-01-15 21:45:00,21.75,15
1,2019010157567,-0.123427,51.44931,1,3,2,2,3,9,E09000022,...,0,1,1,E01003023,lambeth,2019-01-15,08:42:00,2019-01-15 08:42:00,8.7,15
2,2019010157732,-0.145106,51.461256,1,2,1,2,3,9,E09000022,...,0,1,1,E01003026,lambeth,2019-01-15,07:08:00,2019-01-15 07:08:00,7.133333,15
3,2019010157896,-0.240823,51.533125,1,3,2,1,3,28,E09000005,...,0,1,1,E01000535,brent,2019-01-15,21:05:00,2019-01-15 21:05:00,21.083333,15
4,2019010157795,0.161736,51.550272,1,3,1,1,3,16,E09000002,...,0,1,1,E01000035,barking and dagenham,2019-01-15,16:10:00,2019-01-15 16:10:00,16.166667,15


## Grouping by day of the week and time of day

In [3]:
def group_by_day(df, feature='accident_index', is_categorical=False, function='sum'):
    """
    Aggregate one feature of ``df`` per day of the week.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``day_of_week`` column and the ``feature`` column.
    feature : str
        Name of the column to aggregate.
    is_categorical : bool
        If True, ``feature`` is one-hot encoded with ``pd.get_dummies`` and the
        indicator columns (named after the category values) are aggregated
        too. The raw ``feature`` column is kept and aggregated alongside them.
    function : {'sum', 'count'}
        Aggregation applied to each day-of-week group.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``day_of_week``, one column per aggregated column.

    Raises
    ------
    ValueError
        If ``function`` is neither 'sum' nor 'count' (previously this case
        silently returned None).
    """
    # validate before doing any work so a typo fails loudly instead of yielding None
    if function not in ('sum', 'count'):
        raise ValueError(
            "function must be 'sum' or 'count', got {!r}".format(function)
        )

    new_df = df.loc[:, ['day_of_week', feature]]
    if is_categorical:
        one_hot = pd.get_dummies(new_df[feature])
        new_df = pd.concat([new_df, one_hot], axis=1)

    grouped = new_df.groupby('day_of_week')
    if function == 'sum':
        return grouped.sum()
    return grouped.count()

def group_by_time(df, freq, feature='accident_index', is_categorical=False, function='sum'):
    """
    Aggregate one feature of ``df`` into time-of-day bins.

    The ``converted_time`` values are turned into strings, prefixed with a
    fixed placeholder date and parsed into timestamps, so that all rows can be
    binned with ``pd.Grouper`` regardless of their real date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``converted_time`` column and the ``feature`` column.
        ``df`` itself is not modified.
    freq : str
        Bin width passed to ``pd.Grouper(freq=...)``.
    feature : str
        Name of the column to aggregate.
    is_categorical : bool
        If True, ``feature`` is one-hot encoded with ``pd.get_dummies`` and the
        indicator columns are aggregated too. The raw ``feature`` column is
        kept and aggregated alongside them.
    function : {'sum', 'count'}
        Aggregation applied to each time bin.

    Returns
    -------
    pandas.DataFrame
        Indexed by the start timestamp of each bin (on the placeholder date).

    Raises
    ------
    ValueError
        If ``function`` is neither 'sum' nor 'count' (previously this case
        silently returned None).
    """
    # validate before doing any work so a typo fails loudly instead of yielding None
    if function not in ('sum', 'count'):
        raise ValueError(
            "function must be 'sum' or 'count', got {!r}".format(function)
        )

    # explicit copy: a column is overwritten below and must not touch the caller's frame
    new_df = df.loc[:, ['converted_time', feature]].copy()

    # placeholder date; only the time-of-day part matters for the binning
    anchor_date = '2018-01-01'
    new_df['converted_time'] = pd.to_datetime(
        anchor_date + " " + new_df['converted_time'].astype(str)
    )

    if is_categorical:
        one_hot = pd.get_dummies(new_df[feature])
        new_df = pd.concat([new_df, one_hot], axis=1)

    grouped = new_df.groupby(pd.Grouper(key='converted_time', freq=freq))
    if function == 'sum':
        return grouped.sum()
    return grouped.count()

In [6]:
# join the accidents table with the vehicles table on their shared accident_index key
accidents_vehicles = accidents.merge(vehicles, on='accident_index')
accidents_vehicles.columns

Index(['accident_index', 'longitude', 'latitude', 'police_force',
       'accident_severity', 'number_of_vehicles', 'number_of_casualties',
       'day_of_week', 'local_authority_(district)',
       'local_authority_(highway)', '1st_road_class', '1st_road_number',
       'road_type', 'speed_limit', 'junction_detail', 'junction_control',
       '2nd_road_class', '2nd_road_number',
       'pedestrian_crossing-human_control',
       'pedestrian_crossing-physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards',
       'urban_or_rural_area', 'did_police_officer_attend_scene_of_accident',
       'lsoa_of_accident_location', 'district', 'converted_date',
       'converted_time', 'datetime', 'decimal_time', 'day_of_year',
       'vehicle_reference', 'vehicle_type', 'towing_and_articulation',
       'vehicle_manoeuvre', 'vehicle_location-restricted_lane',
       'junction_location', 'skidding_and

### Grouping the aggregated accidents by day of the week

In [7]:
# count the rows of the merged frame that fall on each day of the week
# NOTE(review): accidents_vehicles comes from merging accidents with vehicles,
# so this presumably counts vehicle records rather than distinct accidents -
# confirm that is the intended denominator.
day_counts = group_by_day(accidents_vehicles, feature='accident_index', function='count')
accidents_by_day = day_counts.rename(columns={'accident_index': 'accident_count'})
# standard score of each day's count relative to the seven daily counts
accidents_by_day['z-score'] = stats.zscore(accidents_by_day['accident_count'])

accidents_by_day

Unnamed: 0_level_0,accident_count,z-score
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
1,14859,-2.026469
2,18830,-0.160009
3,20025,0.401668
4,20392,0.574166
5,20633,0.687442
6,21727,1.201646
7,17727,-0.678444


## Grouping the accidents by day of the week by vehicle type

In [None]:
# lookup table for the vehicle_type codes, turned into a {code: label} dictionary
vehicle_type_ref = pd.read_csv(additional_data_dir / 'vehicle_type.csv')
labels = vehicle_type_ref.set_index('code')['label'].to_dict()

labels

In [9]:
# `vehicle_by_day` is consumed by the cells below, so it has to be defined here
# for the notebook to run top to bottom.
# One-hot encode vehicle_type on the merged frame and total each type per day of week.
vehicle_by_day = group_by_day(accidents_vehicles, 'vehicle_type', True)
# group_by_day also aggregates the raw vehicle_type codes; that column is not a
# count, so drop it and keep only the per-type indicator totals
vehicle_by_day = vehicle_by_day.drop('vehicle_type', axis=1)

# The columns are deliberately left as the numeric vehicle_type codes: later
# cells index the per-type results by code (e.g. `by_vehicle[9]`). For a
# human-readable view use a separate frame, e.g. vehicle_by_day.rename(columns=labels).

vehicle_by_day

### Create a dictionary of form { vehicle_type: df(accident_stats) }

In [17]:
# filled by type_by_day: { vehicle_type column label: DataFrame of day-of-week stats }
by_vehicle = {}

def type_by_day(vehicles_by_day, accidents_by_day):
    """
    Build day-of-week statistics for every column of ``vehicles_by_day``.

    For each column a DataFrame with the columns ``accident_count`` (the
    column's values), ``z-score`` (standard score of those values) and
    ``ratio`` (``accident_count`` divided by ``accidents_by_day.accident_count``,
    aligned on the index) is stored in the module-level ``by_vehicle``
    dictionary under the column's label.

    Parameters
    ----------
    vehicles_by_day : pandas.DataFrame
        One column per vehicle type, indexed like ``accidents_by_day``.
    accidents_by_day : pandas.DataFrame
        Must contain an ``accident_count`` column.

    Returns
    -------
    dict
        The module-level ``by_vehicle`` dictionary, which is also updated in
        place so existing callers that ignore the return value keep working.
    """
    for col in vehicles_by_day.columns:
        counts = vehicles_by_day.loc[:, col]
        vehicle_df = pd.DataFrame({'accident_count': counts})
        # score the 1-D counts directly rather than a one-column DataFrame
        vehicle_df['z-score'] = stats.zscore(counts)
        vehicle_df['ratio'] = vehicle_df['accident_count'] / accidents_by_day['accident_count']
        by_vehicle[col] = vehicle_df
    return by_vehicle
        
# fill the module-level by_vehicle dictionary, one stats DataFrame per column of vehicle_by_day
type_by_day(vehicle_by_day, accidents_by_day)

# print the keys that were stored, then display the table stored under key 9
print(by_vehicle.keys())
by_vehicle[9] # index 9 == Car

dict_keys([-1, 1, 2, 3, 4, 5, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23, 90, 97, 98])


Unnamed: 0_level_0,accident_count,z-score,ratio
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11066.0,-2.00128,0.744734
2,12942.0,-0.405215,0.687307
3,13678.0,0.22096,0.683046
4,13959.0,0.460029,0.684533
5,14058.0,0.544257,0.681336
6,15156.0,1.478414,0.697565
7,13069.0,-0.297166,0.737237


In [None]:
def type_by_day(by_vehicle, accidents_by_day, vehicle):
    """
    Day-of-week statistics for one vehicle type (or a group of types).

    NOTE(review): this re-uses the name of the two-argument ``type_by_day``
    defined earlier in the notebook and replaces it once this cell runs -
    consider giving one of the two a distinct name.

    Parameters
    ----------
    by_vehicle : pandas.DataFrame
        Row-level frame with ``day_of_week`` and ``vehicle_type`` columns
        (it is passed straight to ``group_by_day``).
    accidents_by_day : pandas.DataFrame
        Must contain an ``accident_count`` column; used as the denominator of
        ``ratio`` (aligned on the index).
    vehicle : scalar or list-like
        A ``vehicle_type`` value, or several values whose per-day totals are
        added together.

    Returns
    -------
    pandas.DataFrame
        Indexed by day of week with the columns ``accident_count``,
        ``z-score`` and ``ratio``.

    Raises
    ------
    KeyError
        If a requested ``vehicle`` value does not occur in ``vehicle_type``.
    """
    vehicle_by_day = group_by_day(by_vehicle, 'vehicle_type', True)
    # keep only the requested type column(s); the previous version ignored
    # `vehicle` and totalled every column, including the summed raw codes
    codes = list(vehicle) if pd.api.types.is_list_like(vehicle) else [vehicle]
    counts = vehicle_by_day.loc[:, codes].sum(axis=1)

    vehicle_by_day = counts.to_frame('accident_count')
    vehicle_by_day['z-score'] = stats.zscore(vehicle_by_day['accident_count'])
    vehicle_by_day['ratio'] = vehicle_by_day.accident_count / accidents_by_day.accident_count
    return vehicle_by_day

# pass the row-level merged frame: the already-grouped vehicle_by_day has no
# day_of_week / vehicle_type columns left for group_by_day to work on
motorcycle_by_day = type_by_day(accidents_vehicles, accidents_by_day, 5)
motorcycle_by_day

In [None]:
# motorcycle accidents by day of the week
# Use the merged accidents/vehicles frame: at this point in a top-to-bottom run
# `by_vehicle` is the per-type dictionary, which cannot be grouped.
# .loc[:, 2:5] is a label slice (inclusive) over the one-hot vehicle_type columns
# NOTE(review): codes 2-5 are presumably the motorcycle classes - confirm
# against vehicle_type.csv.
motorcycle_by_day = group_by_day(accidents_vehicles, 'vehicle_type', True).loc[:, 2:5]
# add the selected type columns together to get one total per day
motorcycle_by_day = pd.DataFrame(motorcycle_by_day.sum(axis=1))
motorcycle_by_day['z-score'] = stats.zscore(motorcycle_by_day[0])
motorcycle_by_day.columns = ['accident_count', 'z-score']
motorcycle_by_day['ratio'] = motorcycle_by_day.accident_count / accidents_by_day.accident_count

motorcycle_by_day

In [None]:
# motorcycle count for each day divided by that day's overall count
motorcycle_by_day['accident_count'].div(accidents_by_day['accident_count'])

### Total accidents by hour of the day

In [None]:
# count the rows of `accidents` that fall into each hourly bin
hour_counts = group_by_time(accidents, freq='H', function='count')
accidents_by_hour = hour_counts.rename(columns={'accident_index': 'accident_count'})
# standard score of each hour's count relative to all hourly counts
accidents_by_hour['z-score'] = stats.zscore(accidents_by_hour['accident_count'])
accidents_by_hour

### Motorcycle accidents by hour of the day

In [None]:
# merge the accidents and vehicle datasets
# NOTE(review): this rebinds `by_vehicle`, which an earlier cell uses as a
# dictionary of per-type tables - consider a distinct name for the merged frame.
by_vehicle = accidents.merge(vehicles, on='accident_index')

# hourly totals of the one-hot vehicle_type columns, restricted to the
# columns labelled 2 through 5 (inclusive label slice)
hourly_type_counts = group_by_time(by_vehicle, 'H', 'vehicle_type', True)
motorcycle_by_hour = hourly_type_counts.loc[:, 2:5].sum(axis=1).to_frame('accident_count')
motorcycle_by_hour['z-score'] = stats.zscore(motorcycle_by_hour['accident_count'])

motorcycle_by_hour

In [None]:
# hourly motorcycle count divided by the hourly overall count
# NOTE(review): motorcycle_by_hour is built from the accidents/vehicles merge
# while accidents_by_hour is built from `accidents` alone, so numerator and
# denominator count different kinds of rows - confirm this is intended.
motorcycle_by_hour['accident_count'].div(accidents_by_hour['accident_count'])

In [None]:
# display the per-day table built earlier once more
accidents_by_day

## Clustering on location

In [None]:
# longitude/latitude pairs used as the clustering features
# NOTE(review): k-means measures plain Euclidean distance in degrees here and
# rejects missing values - confirm both are acceptable for this data.
coords = accidents.filter(['longitude', 'latitude'])

# k-means starts from random centres; a fixed random_state makes the centres
# and inertias below reproducible from run to run
km = KMeans(n_clusters=25, random_state=42)

centers = km.fit(coords).cluster_centers_

# analysing kmeans clustering for different number of clusters
inertias = {}

for i in range(2, 26):
    km = KMeans(n_clusters=i, random_state=42)
    fit = km.fit(coords)
    inertias[i] = fit.inertia_

# elbow plot: inertia against the number of clusters
fig, ax = plt.subplots(figsize=(6, 6))

ax.plot(list(inertias.keys()), list(inertias.values()))
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_title('Inertia of Clusters')

In [None]:
# fixed random_state so the cluster centres, and therefore the saved figure,
# are the same on every run
km = KMeans(n_clusters=8, random_state=42)
centers = km.fit(coords).cluster_centers_
# transpose the (n_clusters, 2) centres into (longitudes, latitudes),
# following the column order of `coords`
centers = list(zip(*centers))


fig = plt.figure(figsize=(8, 8))
# transverse-Mercator map bounded by the given lower-left / upper-right corners
m = Basemap(llcrnrlon=-10.5,llcrnrlat=49.5,urcrnrlon=3.5,urcrnrlat=59.5,
            resolution='i',projection='tmerc',lon_0=-4.36,lat_0=54.7)
m.shadedrelief()
m.drawcoastlines(color='grey')
m.drawcountries(color='gray')

# every accident as a small point (latlon=True: inputs are degrees, not map coordinates)
m.scatter(accidents.longitude, accidents.latitude,
          latlon=True,
          alpha=1,
          s=1,
          marker='o',
          label='Accidents')

# cluster centres drawn on top as larger cyan markers
m.scatter(centers[0], centers[1],
          latlon=True,
          alpha=1,
          s=20,
          color='cyan',
          marker='o',
          label='Cluster Centres')

plt.legend(loc='upper left')

# no file extension given, so matplotlib's default output format is used
fig.savefig(plots_dir / 'accident_clusters')

## Clustering on weather conditions and speed limit

In [None]:
# clustering features: weather_conditions and speed_limit, in that column order
speed_weather = accidents.filter(['weather_conditions', 'speed_limit'])

# fixed random_state so the centres are reproducible from run to run
# NOTE(review): weather_conditions looks like a categorical code, while k-means
# treats it as a number - confirm that averaging codes is meaningful here.
km = KMeans(n_clusters=12, random_state=42)

# transpose the centres into (weather_conditions values, speed_limit values)
sw_centers = list(zip(*km.fit(speed_weather).cluster_centers_))

# matplotlib.pyplot is already imported as plt at the top of the notebook
figure, ax = plt.subplots(figsize=(4, 4))

# raw observations as tiny points, cluster centres on top with default markers
ax.scatter(accidents['weather_conditions'], accidents['speed_limit'], s=0.2)
ax.scatter(sw_centers[0], sw_centers[1])
ax.set_ylabel('speed limit (mph)')
ax.set_xlabel('weather conditions')
ax.set_xticks(range(1, 10))

plt.show()

# author's reading: cluster centres sit around weather_conditions = 2, taken to mean raining
# NOTE(review): verify the meaning of code 2 against the dataset's code book.