<h5>Importing libraries</h5>

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import warnings
warnings.filterwarnings('ignore')

<h3>Importing the dataset into a DataFrame</h3>

In [3]:
# Location of the UK road accident CSV on the mounted Google Drive.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/datasets/uk_road_accident.csv'
uk = pd.read_csv(DATASET_PATH)

In [4]:
# Preview the loaded frame (the display is truncated to the first and last rows).
uk

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


In [5]:
# Summarise column names, non-null counts and dtypes to spot columns with missing values.
uk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Index                    660679 non-null  object 
 1   Accident_Severity        660679 non-null  object 
 2   Accident Date            660679 non-null  object 
 3   Latitude                 660654 non-null  float64
 4   Light_Conditions         660679 non-null  object 
 5   District Area            660679 non-null  object 
 6   Longitude                660653 non-null  float64
 7   Number_of_Casualties     660679 non-null  int64  
 8   Number_of_Vehicles       660679 non-null  int64  
 9   Road_Surface_Conditions  659953 non-null  object 
 10  Road_Type                656159 non-null  object 
 11  Urban_or_Rural_Area      660664 non-null  object 
 12  Weather_Conditions       646551 non-null  object 
 13  Vehicle_Type             660679 non-null  object 
dtypes: f

<h3>Changing the data types of the columns</h3>

In [6]:
# Inspect the dtypes as loaded, before any conversion.
uk.dtypes

Unnamed: 0,0
Index,object
Accident_Severity,object
Accident Date,object
Latitude,float64
Light_Conditions,object
District Area,object
Longitude,float64
Number_of_Casualties,int64
Number_of_Vehicles,int64
Road_Surface_Conditions,object


In [7]:
# Text columns that are stored as pandas categoricals.
# NOTE(review): 'Accident Date' holds dates and 'Index' looks like a per-record
# identifier — confirm that 'category' is really the intended dtype for both.
categorical_columns = [
    'Index',
    'Accident_Severity',
    'Accident Date',
    'Light_Conditions',
    'District Area',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Vehicle_Type',
]
for column_name in categorical_columns:
    uk[column_name] = uk[column_name].astype('category')

In [8]:
# Confirm that the text columns are now categorical.
uk.dtypes

Unnamed: 0,0
Index,category
Accident_Severity,category
Accident Date,category
Latitude,float64
Light_Conditions,category
District Area,category
Longitude,float64
Number_of_Casualties,int64
Number_of_Vehicles,int64
Road_Surface_Conditions,category


<h3>Cleaning missing data</h3>

In [9]:
# Count missing values per column before any imputation.
uk.isna().sum()

Unnamed: 0,0
Index,0
Accident_Severity,0
Accident Date,0
Latitude,25
Light_Conditions,0
District Area,0
Longitude,26
Number_of_Casualties,0
Number_of_Vehicles,0
Road_Surface_Conditions,726


<h5>Filling missing numerical data</h5>

In [10]:
# Replace missing coordinates with the mean of the respective column.
# NOTE(review): a mean latitude/longitude is a synthetic location — confirm this
# is acceptable for the handful of affected rows.
for coordinate in ('Latitude', 'Longitude'):
    uk[coordinate] = uk[coordinate].fillna(uk[coordinate].mean())

In [11]:
# Re-check missing values: Latitude and Longitude should now be complete.
uk.isna().sum()

Unnamed: 0,0
Index,0
Accident_Severity,0
Accident Date,0
Latitude,0
Light_Conditions,0
District Area,0
Longitude,0
Number_of_Casualties,0
Number_of_Vehicles,0
Road_Surface_Conditions,726


<h5>Filling missing non-numerical data</h5>

In [12]:
# Fill gaps in each categorical column with that column's most frequent value (mode).
columns_with_gaps = (
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
)
for column_name in columns_with_gaps:
    most_frequent = uk[column_name].mode()[0]
    uk[column_name] = uk[column_name].fillna(most_frequent)

In [13]:
# Final check: no column should report missing values any more.
uk.isna().sum()

Unnamed: 0,0
Index,0
Accident_Severity,0
Accident Date,0
Latitude,0
Light_Conditions,0
District Area,0
Longitude,0
Number_of_Casualties,0
Number_of_Vehicles,0
Road_Surface_Conditions,0


<h1>Exploratory Data Analysis</h1>

<h1>1. What percentage of the accident reports does each accident severity represent, and which severity has the highest percentage?</h1>

In [14]:
# List the distinct severity labels present in the data.
uk['Accident_Severity'].unique()

['Serious', 'Slight', 'Fatal']
Categories (3, object): ['Fatal', 'Serious', 'Slight']

In [15]:
# Severity label of every accident record.
severity = uk['Accident_Severity']

In [16]:
# Look each count up by its label instead of unpacking by position: positional
# unpacking silently assigns the wrong numbers if the frequency order of the
# three classes ever differs from Slight > Serious > Fatal.
severity_counts = severity.value_counts()
# int() keeps plain Python integers, as positional unpacking produced.
Slight = int(severity_counts['Slight'])
Serious = int(severity_counts['Serious'])
Fatal = int(severity_counts['Fatal'])

<p>Percentage of Slight category</p>

In [17]:
# Share of all accident records classed as Slight, in percent.
(Slight / len(uk)) * 100

85.33660067899842

<p>Slight severity consumes <strong>85%</strong> of overall reports</p>

<p>Percentage of serious category</p>

In [18]:
# Share of all accident records classed as Serious, in percent.
(Serious / len(uk)) * 100

13.352475256516403

<p>Serious severity consumes <strong>13%</strong> of overall reports</p>

In [19]:
# Share of all accident records classed as Fatal, in percent.
(Fatal / len(uk)) * 100

1.3109240644851736

<p>Fatal severity consumes <strong>1%</strong> of overall reports</p>

<h1>Insight #1: From the calculation above, Slight accident severity accounts for 85.33% of the overall reports, while Serious and Fatal account for 13.35% and 1.31% respectively</h1>

<h1>2. Do the weather condition and vehicle type affect the number of casualties? If so, which weather condition and vehicle type recorded the highest number of casualties?</h1>

In [20]:
 # Allow up to 500 characters per cell so long text values are not cut off when displayed.
 pd.set_option('display.max_colwidth', 500)

In [21]:
# Number of accident records for every (weather condition, vehicle type) pair.
# NOTE(review): .size() counts rows, so these are accident counts rather than a
# sum of Number_of_Casualties — confirm which one the question intends.
casualties_by_weather_vehicle = uk.groupby(['Weather_Conditions', 'Vehicle_Type'])['Number_of_Casualties']
weatherandvehicle = casualties_by_weather_vehicle.size()

In [22]:
# Number of accident records per weather condition.
uk['Weather_Conditions'].value_counts()

Unnamed: 0_level_0,count
Weather_Conditions,Unnamed: 1_level_1
Fine no high winds,535013
Raining no high winds,79696
Other,17150
Raining + high winds,9615
Fine + high winds,8554
Snowing no high winds,6238
Fog or mist,3528
Snowing + high winds,885


<h3>Breakdown</h3>

In [23]:
# Vehicle-type counts in 'Fine no high winds' weather, largest first
weatherandvehicle.loc['Fine no high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,403324
Van / Goods 3.5 tonnes mgw or under,27603
Bus or coach (17 or more pass seats),20963
Motorcycle over 500cc,20793
Goods 7.5 tonnes mgw and over,13938
Motorcycle 125cc and under,12407
Taxi/Private hire car,10733
Motorcycle over 125cc and up to 500cc,6228
Motorcycle 50cc and under,6165
Goods over 3.5t. and under 7.5t,4950


In [24]:
# Vehicle-type counts in 'Raining no high winds' weather, largest first
weatherandvehicle.loc['Raining no high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,59940
Van / Goods 3.5 tonnes mgw or under,4203
Bus or coach (17 or more pass seats),3182
Motorcycle over 500cc,3135
Goods 7.5 tonnes mgw and over,2114
Motorcycle 125cc and under,1830
Taxi/Private hire car,1622
Motorcycle 50cc and under,897
Motorcycle over 125cc and up to 500cc,893
Goods over 3.5t. and under 7.5t,729


In [25]:
# Vehicle-type counts in 'Raining + high winds' weather, largest first
weatherandvehicle.loc['Raining + high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,7206
Van / Goods 3.5 tonnes mgw or under,508
Bus or coach (17 or more pass seats),389
Motorcycle over 500cc,354
Goods 7.5 tonnes mgw and over,275
Motorcycle 125cc and under,220
Taxi/Private hire car,190
Motorcycle over 125cc and up to 500cc,117
Motorcycle 50cc and under,107
Other vehicle,99


In [26]:
# Vehicle-type counts in 'Fine + high winds' weather, largest first
weatherandvehicle.loc['Fine + high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,6463
Van / Goods 3.5 tonnes mgw or under,408
Bus or coach (17 or more pass seats),350
Motorcycle over 500cc,315
Goods 7.5 tonnes mgw and over,251
Taxi/Private hire car,187
Motorcycle 125cc and under,180
Motorcycle 50cc and under,106
Goods over 3.5t. and under 7.5t,94
Motorcycle over 125cc and up to 500cc,87


In [27]:
# Vehicle-type counts in 'Snowing no high winds' weather, largest first
weatherandvehicle.loc['Snowing no high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,4748
Van / Goods 3.5 tonnes mgw or under,296
Motorcycle over 500cc,254
Bus or coach (17 or more pass seats),219
Goods 7.5 tonnes mgw and over,149
Motorcycle 125cc and under,146
Taxi/Private hire car,136
Motorcycle 50cc and under,75
Motorcycle over 125cc and up to 500cc,72
Goods over 3.5t. and under 7.5t,52


In [28]:
# Vehicle-type counts in 'Fog or mist' weather, largest first
weatherandvehicle.loc['Fog or mist'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,2641
Van / Goods 3.5 tonnes mgw or under,192
Bus or coach (17 or more pass seats),134
Motorcycle over 500cc,118
Goods 7.5 tonnes mgw and over,93
Motorcycle 125cc and under,82
Taxi/Private hire car,78
Motorcycle 50cc and under,50
Motorcycle over 125cc and up to 500cc,45
Other vehicle,38


In [29]:
# Vehicle-type counts in 'Snowing + high winds' weather, largest first
weatherandvehicle.loc['Snowing + high winds'].sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
Vehicle_Type,Unnamed: 1_level_1
Car,677
Van / Goods 3.5 tonnes mgw or under,48
Motorcycle over 500cc,33
Bus or coach (17 or more pass seats),32
Goods 7.5 tonnes mgw and over,28
Taxi/Private hire car,20
Motorcycle 125cc and under,16
Motorcycle over 125cc and up to 500cc,10
Motorcycle 50cc and under,6
Goods over 3.5t. and under 7.5t,6


<h1>Insight #2:</h1>
<h5>Cars consistently dominated the vehicle-type counts. They remained the most frequently involved vehicle type regardless of the weather condition. Surprisingly, contrary to the popular belief that wet weather causes more accidents, the data shows that more than 500,000 of the reports (535,013) occurred in fine weather with no high winds.</h5>

<h1>Question #3: Which is the safest? Urban or Rural area. Determine if the Light condition and Road type of an area determine the number of casualties it  will record. </h1>

In [30]:
# Re-inspect the frame after cleaning: every column should be fully non-null.
uk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   Index                    660679 non-null  category
 1   Accident_Severity        660679 non-null  category
 2   Accident Date            660679 non-null  category
 3   Latitude                 660679 non-null  float64 
 4   Light_Conditions         660679 non-null  category
 5   District Area            660679 non-null  category
 6   Longitude                660679 non-null  float64 
 7   Number_of_Casualties     660679 non-null  int64   
 8   Number_of_Vehicles       660679 non-null  int64   
 9   Road_Surface_Conditions  660679 non-null  category
 10  Road_Type                660679 non-null  category
 11  Urban_or_Rural_Area      660679 non-null  category
 12  Weather_Conditions       660679 non-null  category
 13  Vehicle_Type             660679 non-null  ca

In [31]:
# Number of accident records for every (area, light condition, road type) combination.
# NOTE(review): .count() tallies non-null rows, so these are accident counts rather
# than a sum of Number_of_Casualties — confirm which one the question intends.
area_light_road_keys = ['Urban_or_Rural_Area', 'Light_Conditions', 'Road_Type']
urbanorrural = uk.groupby(area_light_road_keys)['Number_of_Casualties'].count()

<h3>Breakdown</h3>

In [32]:
# Counts for Rural areas, by light condition and road type
urbanorrural.loc['Rural']

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties
Light_Conditions,Road_Type,Unnamed: 2_level_1
Darkness - lighting unknown,Dual carriageway,512
Darkness - lighting unknown,One way street,15
Darkness - lighting unknown,Roundabout,188
Darkness - lighting unknown,Single carriageway,1702
Darkness - lighting unknown,Slip road,50
Darkness - lights lit,Dual carriageway,6556
Darkness - lights lit,One way street,182
Darkness - lights lit,Roundabout,3026
Darkness - lights lit,Single carriageway,14248
Darkness - lights lit,Slip road,683


In [33]:
# Counts for Urban areas, by light condition and road type
urbanorrural.loc['Urban']

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties
Light_Conditions,Road_Type,Unnamed: 2_level_1
Darkness - lighting unknown,Dual carriageway,393
Darkness - lighting unknown,One way street,109
Darkness - lighting unknown,Roundabout,282
Darkness - lighting unknown,Single carriageway,3197
Darkness - lighting unknown,Slip road,36
Darkness - lights lit,Dual carriageway,13686
Darkness - lights lit,One way street,3128
Darkness - lights lit,Roundabout,7501
Darkness - lights lit,Single carriageway,79691
Darkness - lights lit,Slip road,632


In [34]:
# For each light-condition / road-type combination, test whether Rural has a
# higher count than Urban, then tally the True/False outcomes.
rural_counts = urbanorrural['Rural']
urban_counts = urbanorrural['Urban']
(rural_counts > urban_counts).value_counts()

Unnamed: 0_level_0,count
Number_of_Casualties,Unnamed: 1_level_1
False,14
True,11


<p>We could say here that rural is safer than Urban, but we still need to compare the number of casualties each have</p>

In [35]:
# Total of the Rural counts across all light-condition / road-type combinations.
urbanorrural['Rural'].sum()

np.int64(238990)

In [36]:
# Total of the Urban counts across all light-condition / road-type combinations.
urbanorrural['Urban'].sum()

np.int64(421678)

<p>Now, let's determine which road type and light condition in each area recorded the highest number of casualties.</p>

In [37]:
# Five largest Rural light-condition / road-type combinations
rural_sorted = urbanorrural.loc['Rural'].sort_values(ascending=False)
rural_sorted.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties
Light_Conditions,Road_Type,Unnamed: 2_level_1
Daylight,Single carriageway,124756
Daylight,Dual carriageway,34374
Darkness - no lighting,Single carriageway,27926
Darkness - lights lit,Single carriageway,14248
Daylight,Roundabout,12088


In [38]:
# Five largest Urban light-condition / road-type combinations
urban_sorted = urbanorrural.loc['Urban'].sort_values(ascending=False)
urban_sorted.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties
Light_Conditions,Road_Type,Unnamed: 2_level_1
Daylight,Single carriageway,241970
Darkness - lights lit,Single carriageway,79691
Daylight,Dual carriageway,35977
Daylight,Roundabout,20493
Darkness - lights lit,Dual carriageway,13686


<h1>Insight #3:</h1>
<p>Daylight recorded the highest number of casualties; this may suggest that drivers are more careful at night and do not take the same extra care during the daytime. Also, single carriageway road types recorded the highest number of casualties.</p>

<h1>Question #4: Traffic is one of the main sources of accidents. Determine which district areas recorded accidents involving more than 5 cars</h1>

In [39]:
# Accident records with more than 5 casualties.
# NOTE(review): the question above asks about more than 5 *cars*, but this filter
# uses Number_of_Casualties; Number_of_Vehicles looks like the intended column —
# confirm before relying on the result.
more_than_5 = uk[uk['Number_of_Casualties'] > 5]
# Group the filtered frame itself. The previous form grouped the full frame by a
# Series taken from the subset and depended on implicit index alignment to drop
# the non-matching rows.
more_than_5.groupby('District Area')['Number_of_Casualties'].count().sort_values(ascending=False)

Unnamed: 0_level_0,Number_of_Casualties
District Area,Unnamed: 1_level_1
Birmingham,70
Bradford,59
Liverpool,59
Leeds,48
Manchester,43
...,...
Bracknell Forest,0
Berwick-upon-Tweed,0
Bexley,0
Wellingborough,0


<h1>Insight #4:</h1>
<p>Birmingham district recorded the highest count: 70 accidents matching the filter used above (<code>Number_of_Casualties &gt; 5</code>).</p>

<h1>Question #5: Since Birmingham District has the highest number of casualties involving more than 5 cars, determine what type of road surface condition they have.</h1>

In [46]:
# Road surface condition of Birmingham accident records with more than 5 casualties.
# NOTE(review): the question refers to more than 5 cars, while the filter is on
# Number_of_Casualties — confirm the intended column.
birmingham = uk.loc[uk['District Area'] == 'Birmingham']
birmingham_road_surface = birmingham.loc[
    birmingham['Number_of_Casualties'] > 5, 'Road_Surface_Conditions'
]
birmingham_road_surface.value_counts()

Unnamed: 0_level_0,count
Road_Surface_Conditions,Unnamed: 1_level_1
Dry,48
Wet or damp,20
Frost or ice,1
Snow,1
Flood over 3cm. deep,0


<h1>Insight #5:</h1>
<p>This strengthens the conclusion we reached earlier: a wet surface is not the main source of accidents.</p>

<h1>Question #6: Which district areas have the highest number of fatal accidents?</h1>

In [50]:
# Number of Fatal accident records per district.
is_fatal = uk['Accident_Severity'] == 'Fatal'
Fatal_per_district = uk.loc[is_fatal, 'District Area'].value_counts()

In [52]:
# Show the ten districts with the most Fatal accident records.
Fatal_per_district.sort_values(ascending=False).head(n=10)

Unnamed: 0_level_0,count
District Area,Unnamed: 1_level_1
Birmingham,105
Leeds,93
Highland,88
East Riding of Yorkshire,85
Bradford,71
Aberdeenshire,66
Powys,59
Doncaster,56
Wakefield,56
"Herefordshire, County of",51


<h1>Insight #6:</h1>
<p>Birmingham district has the highest number of fatal accidents</p>

<h1>Question #7: What is the common vehicle types involved in Fatal Accidents?</h1>

In [53]:
# Number of Fatal accident records per vehicle type.
fatal_mask = uk['Accident_Severity'] == 'Fatal'
Vehicle_fatal = uk.loc[fatal_mask, 'Vehicle_Type'].value_counts()

In [54]:
# Show the ten vehicle types with the most Fatal accident records.
Vehicle_fatal.sort_values(ascending=False).head(n=10)

Unnamed: 0_level_0,count
Vehicle_Type,Unnamed: 1_level_1
Car,6577
Van / Goods 3.5 tonnes mgw or under,467
Motorcycle over 500cc,339
Bus or coach (17 or more pass seats),325
Goods 7.5 tonnes mgw and over,216
Motorcycle 125cc and under,189
Taxi/Private hire car,155
Motorcycle over 125cc and up to 500cc,105
Motorcycle 50cc and under,95
Other vehicle,70


<h1>Insight #7:</h1>
<p>Car remains the number one vehicle involved on fatal accidents</p>

<h1>Question #8: Is there a relationship between latitude and longitude and the number of casualties?</h1>

In [62]:
# Correlation coefficient between casualties per accident and latitude.
uk['Number_of_Casualties'].corr(uk['Latitude'])

np.float64(0.03220068662590627)

In [63]:
# Correlation coefficient between casualties per accident and longitude.
uk['Number_of_Casualties'].corr(uk['Longitude'])

np.float64(-0.04040564578845446)

<h1>Insight #8:</h1>
<p>There is no meaningful linear relationship between latitude or longitude and the number of casualties (both correlations are close to zero)</p>

<h1>Question #9: Is there a correlation between the number of vehicles involved and the number of casualties?</h1>

In [64]:
# Correlation coefficient between casualties and vehicles involved per accident.
uk['Number_of_Casualties'].corr(uk['Number_of_Vehicles'])

np.float64(0.22888886126927635)

<h1>Insight #9: There is only a weak positive correlation (r ≈ 0.23) between the number of vehicles involved and the number of casualties</h1>

<h1>Question #10: Which road surface conditions are prevalent during serious accidents?</h1>

In [70]:
# Road surface conditions of Serious accident records, most common first.
is_serious = uk['Accident_Severity'] == 'Serious'
serious_accident = uk.loc[is_serious]
surface_condition_serious = serious_accident['Road_Surface_Conditions'].value_counts()
surface_condition_serious.sort_values(ascending=False)

Unnamed: 0_level_0,count
Road_Surface_Conditions,Unnamed: 1_level_1
Dry,61708
Wet or damp,23785
Frost or ice,2007
Snow,565
Flood over 3cm. deep,152


<h1>Insight #10:</h1>
<p>Dry surfaces are prevalent during serious accidents</p>

<h1>Question #11: What are the most common combinations of weather conditions and light conditions during accidents?</h1>

In [73]:
# Number of accident records for every (weather, light condition) combination.
# NOTE(review): .count() tallies rows; the values are accident counts even though
# the resulting Series is named Number_of_Casualties.
weather_light_keys = ['Weather_Conditions', 'Light_Conditions']
weather_and_light = uk.groupby(weather_light_keys)['Number_of_Casualties'].count()
# Ten most frequent combinations
weather_and_light.sort_values(ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties
Weather_Conditions,Light_Conditions,Unnamed: 2_level_1
Fine no high winds,Daylight,408726
Fine no high winds,Darkness - lights lit,93958
Raining no high winds,Daylight,49742
Fine no high winds,Darkness - no lighting,25251
Raining no high winds,Darkness - lights lit,22666
Other,Daylight,10103
Raining no high winds,Darkness - no lighting,6208
Fine + high winds,Daylight,5796
Fine no high winds,Darkness - lighting unknown,5333
Raining + high winds,Daylight,4942


<h1>Insight #11:</h1>
<p>Fine no high winds and daylight are the most common combination of weather and light conditions during accidents</p>

<h1>Question #12: What are the most frequent 'Vehicle_Type' involved in accidents under 'Darkness - lights lit' conditions?</h1>

In [74]:
# Vehicle types in accident records logged under 'Darkness - lights lit'.
lit_darkness_mask = uk['Light_Conditions'] == 'Darkness - lights lit'
darkness_lights_lit = uk.loc[lit_darkness_mask]
darkness_lights_lit['Vehicle_Type'].value_counts()

Unnamed: 0_level_0,count
Vehicle_Type,Unnamed: 1_level_1
Car,96994
Van / Goods 3.5 tonnes mgw or under,6806
Motorcycle over 500cc,5176
Bus or coach (17 or more pass seats),5142
Goods 7.5 tonnes mgw and over,3440
Motorcycle 125cc and under,3074
Taxi/Private hire car,2658
Motorcycle 50cc and under,1494
Motorcycle over 125cc and up to 500cc,1480
Goods over 3.5t. and under 7.5t,1192


<h1>Insight #12: </h1>
<p>Car is the most common vehicle under Darkness - Lights lit condition</p>