## Introduction
####  Machine Learning model to predict Accident Risk Index by using area/district level accident data.

### Import the necessary packages.

In [1]:
!pip install -q klib

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import re
import klib

### Import train and test dataset

In [3]:
train=pd.read_csv("../input/predict-accident-risk-score-for-unique-postcode/train.csv")
test=pd.read_csv("../input/predict-accident-risk-score-for-unique-postcode/test.csv")

In [4]:
train=train.rename(columns={col:col.replace("-","_").lower() for col in train.columns.values})
test=test.rename(columns={col:col.replace("-","_").lower() for col in test.columns.values})

### Structure of the train and test dataset.

In [5]:
train.info()

#### The above information shows that there are 478741 data entries and 27 columns.

In [6]:
test.info()

#### The above information shows that there are 121259 data entries and 27 columns.

#### Let's check, column by column, whether there are any missing values in the train and test datasets.

In [7]:
train.isnull().sum()

In [8]:
train.columns[train.isnull().any()]

#### The above information shows that there are missing values in the columns time, road_surface_conditions, special_conditions_at_site.

In [9]:
test.isnull().sum()

In [10]:
test.columns[test.isnull().any()]

#### The above information shows that there are missing values in the columns time, road_surface_conditions, special_conditions_at_site.

#### Let's see the glimpse of train dataset

In [11]:
train.head(2)

### Let's check whether there are any duplicate rows

In [12]:
train.duplicated().value_counts()

#### The above information shows that there is no duplication in data.

### Let's explore and visualize each column of the train dataset. First, let's see the target column distribution.

### The target column **accident_risk_index** is computed per postcode: the total casualties from road accidents divided by the count of accidents in that postcode.

### Function to get the accident_risk_index 

In [13]:
def ari(df):
  """Add the per-postcode accident risk index columns to ``df`` and return it.

  Three columns are written onto ``df`` itself (the frame is modified in place):

  * ``total_casualties``    - sum of ``number_of_casualties`` within the row's postcode
  * ``accident_count``      - count of ``accident_id`` values within the row's postcode
  * ``accident_risk_index`` - ``total_casualties / accident_count`` rounded to 2 decimals

  Returns the same DataFrame object that was passed in.
  """
  postcode_key=df['postcode']
  casualties_sum=df['number_of_casualties'].groupby(postcode_key).transform('sum')
  accidents_n=df['accident_id'].groupby(postcode_key).transform('count')
  df['total_casualties']=casualties_sum
  df['accident_count']=accidents_n
  df['accident_risk_index']=(df['total_casualties']/df['accident_count']).round(2)
  return df

In [14]:
train=ari(train)

In [15]:
train.head(2)

In [16]:
def count_plot(df,col,rot=None):
  """Draw a count plot of ``df[col]`` with the bars ordered by frequency.

  df  : DataFrame that holds the column to plot.
  col : column name; also used for the figure title and the x-axis label.
  rot : rotation handed to the x tick labels (``None`` by default).
  """
  plt.figure(figsize=(8,6))
  # The index of value_counts() fixes the left-to-right bar order.
  bar_order=df[col].value_counts().index
  sns.countplot(x=df[col],order=bar_order)
  plt.title(col.capitalize()+" Distribution",fontsize=25)
  plt.xlabel(col,fontsize=20)
  plt.xticks(fontsize=14,rotation=rot)

In [17]:
def box_plot(df,col,rot=None):
  """Draw a vertical box plot of ``df[col]`` on a new 8x6 figure.

  df  : DataFrame that holds the column to plot.
  col : column name; also used for the figure title and the y-axis label.
  rot : rotation handed to the y-axis *label* (not the tick labels).
        NOTE(review): with the default ``None`` the label rotation is set
        explicitly rather than left at matplotlib's default - confirm this
        is the intended look.
  """
  plt.figure(figsize=(8,6))
  sns.boxplot(y=df[col])
  plt.title(col.capitalize()+" Distribution",fontsize=25)
  plt.ylabel(col,fontsize=20,rotation=rot)
  plt.yticks(fontsize=14)



In [21]:
def side_by_side_plot(df,grp,valcol,rot=None):
  """Show how ``valcol`` is distributed within each ``grp`` category.

  The figure has two panels: a KDE of ``valcol`` coloured by ``grp`` on the
  left and a box plot of ``valcol`` per ``grp`` category on the right.

  df     : DataFrame that holds both columns.
  grp    : name of the grouping (categorical) column.
  valcol : name of the numeric column whose distribution is plotted.
  rot    : rotation for the x tick labels of the box plot (``None`` by default).
  """
  clr="tab10"
  fig,(ax1,ax2) = plt.subplots(1,2,figsize=(15,8))
  panel_title=grp.capitalize()+" Wise "+valcol.capitalize()+" Distribution"

  sns.kdeplot(x=df[valcol], hue=df[grp],ax=ax1,palette=clr)
  ax1.set_title(panel_title,size=15)
  ax1.set_xlabel(valcol,fontsize=20)

  sns.boxplot(x=df[grp],y=df[valcol],ax=ax2)
  ax2.set_title(panel_title,size=15)
  ax2.set_xlabel(grp,fontsize=20)
  # Rotate only the category labels on the x-axis; without axis='x' the
  # numeric y tick labels would be rotated as well.
  ax2.tick_params(axis='x',rotation=rot)

  # Lay out last, once titles, labels and rotated ticks exist, so that the
  # spacing actually accounts for them.
  fig.tight_layout()

In [22]:
def group_summary(df,groupcol,value):
    """Return ``describe()`` statistics of ``value`` per ``groupcol`` group.

    df       : DataFrame to summarise.
    groupcol : column name (or list of column names) to group by.
    value    : name of the column being described.

    The group keys are turned back into ordinary columns and the rows are
    sorted by the ``mean`` statistic, largest first.
    """
    stats=df.groupby(groupcol)[value].describe()
    stats=stats.reset_index()
    return stats.sort_values(by='mean',ascending=False)

### Let's see the accident_risk_index column distribution.

In [23]:
klib.dist_plot(train['accident_risk_index']);
plt.title("accident risk index distribution".capitalize(),fontsize=20);

In [24]:
train['accident_risk_index'].describe()

In [None]:
box_plot(train,'accident_risk_index')

#### The above histogram shows that the accident_risk_index column is positively skewed and has three peaks (a multi-modal distribution).

### The accident_risk_index ranges from 1.0 to 5.0.

### The average accident_risk_index value is 1.47.

#### The boxplot explains that there are outliers above the third quartile.

### Let's see how many states' accident information is collected and see accident casualties of each state?

In [26]:
count_plot(train,col='state')

In [27]:
train['state'].value_counts()

In [28]:
side_by_side_plot(train,'state','accident_risk_index')

In [29]:
group_summary(train,'state','accident_risk_index')

#### The above group-wise box plot explains that mean of the accident_risk_index is almost the same in each state.

#### There are outliers above the third quartile and below the first quartile.

### Let's see how many unique postcodes are there and see which postcode area has the maximum road accident casualties.

In [30]:
train['postcode'].nunique()

#### There are 95625 unique postcodes.

### The postcode is divided into four parts. For instance, in **OX3 9UP**, **OX** is the area, **3** is the district, **9** is the sector, and **UP** is the unit (street, property, organization).

### In this data the postcodes have varying lengths. For instance,

* DL145 8BG ->(length:9)
* BN21 2XR ->(length:8)
* OX3 9UP ->(length:7)
* E5 9QH ->(length:6)
* CB1 4 ->(length:5)
* PE21 ->(length:4)
* GU9 ->(length:3)
* S8 ->(length:2)

### Let's use the regex function to retrieve the four parts in postcode.

In [31]:
def postcode_split(df):
  """Split ``df['postcode']`` into ``area``, ``district``, ``sector`` and ``unit``.

  The postcode is stripped of surrounding whitespace first, then each part is
  taken from the first regex match in the postcode:

  * ``area``     - first run of capital letters
  * ``district`` - digits that directly follow a run of capital letters
  * ``sector``   - digits that directly follow a whitespace character
                   (the whitespace itself is not part of the value)
  * ``unit``     - capital letters that directly follow the sector digits

  A part that is absent (including a missing postcode) is stored as the
  string "none". The columns are written onto ``df`` itself, which is also
  returned.
  """
  df['postcode']=df['postcode'].str.strip()
  part_patterns={
      'area':r"([A-Z]+)",
      'district':r"[A-Z]+([0-9]+)",
      'sector':r"\s([0-9]+)",
      'unit':r"\s[0-9]+([A-Z]+)",
  }
  for part,pattern in part_patterns.items():
    # str.extract yields NaN where the pattern does not match (or the
    # postcode is missing); map those to the "none" placeholder.
    df[part]=df['postcode'].str.extract(pattern,expand=False).fillna("none")
  return df

In [32]:
train=postcode_split(train)

#### Let's see in which area, district, sector, and unit have most accidents occurred.

In [33]:
# Top-20 accident counts for each postcode component, one panel per component.
# The 2x2 grid is created up front so that no extra full-figure Axes is left
# behind the panels.
fig,axes=plt.subplots(2,2,figsize=(15,15))
for ax,col in zip(axes.ravel(),['area','district','sector','unit']):
  top_counts=(train.groupby(col)['accident_id'].count()
              .reset_index(name='count').nlargest(20,'count'))
  sns.barplot(data=top_counts,x='count',y=col,ax=ax)
  ax.set_title("Most Road Accident "+col.capitalize()+'s',fontsize=15)
  ax.set_ylabel("")
  ax.tick_params(axis='y',labelsize=15)
fig.tight_layout()
plt.show()

#### Let's see in which area, district, sector, and the unit have the most road accident casualties.

In [34]:
# Top-20 mean casualties per accident for each postcode component.
# The 2x2 grid is created up front so that no extra full-figure Axes is left
# behind the panels.
fig,axes=plt.subplots(2,2,figsize=(15,15))
for ax,col in zip(axes.ravel(),['area','district','sector','unit']):
  mean_casualties=(train.groupby(col)['number_of_casualties'].mean()
                   .reset_index(name='mean').nlargest(20,'mean'))
  sns.barplot(data=mean_casualties,x='mean',y=col,ax=ax)
  # The bars show mean casualties, not accident counts, so say so in the title.
  ax.set_title(col.capitalize()+"s With Highest Mean Casualties",fontsize=15)
  ax.set_ylabel("")
  ax.tick_params(axis='y',labelsize=15)
fig.tight_layout()
plt.show()

#### Let's see in which area, district, sector, and unit have the high accidents risk index.

In [35]:
# Accident risk index distribution for the 20 groups with the highest mean
# risk index, per postcode component. Boxes are ordered by median, highest first.
# The 2x2 grid is created up front so that no extra full-figure Axes is left
# behind the panels.
fig,axes=plt.subplots(2,2,figsize=(15,15))
for ax,col in zip(axes.ravel(),['area','district','sector','unit']):
  top_groups=train.groupby(col)['accident_risk_index'].mean().nlargest(20).index
  subset=train.loc[train[col].isin(top_groups),[col,'accident_risk_index']]
  order=subset.groupby(col)["accident_risk_index"].median().sort_values(ascending=False).index
  sns.boxplot(x=subset[col],y=subset['accident_risk_index'],order=order,ax=ax)
  ax.set_title(col.capitalize()+"'s"+" Accident Risk Index Distribution",fontsize=15)
  ax.set_xlabel("")
  ax.tick_params(axis='x',labelsize=15)
fig.tight_layout()
plt.show()

#### The above boxplots are ordered by the median value and it explains that there is a lot of differences between most road accident locations and the locations which have the most accident casualties and also most accident risk index locations.

### Let's compare the police force with the accident risk index, accident count, and total casualties.

In [36]:
plt.figure(figsize=(25,7))
sns.boxplot(x=train['police_force'].astype('str'),y=train['accident_risk_index'],
            order=(train.astype({'police_force':'str'}).groupby('police_force')['accident_risk_index'].median().
                   sort_values(ascending=True).index));
plt.xlabel("police_force",fontsize=18);
plt.xticks(fontsize=15); 

In [37]:
plt.figure(figsize=(25,7))
sns.barplot(data=train.astype({'police_force':'str'}).groupby('police_force')['number_of_casualties'].agg({'sum'}).reset_index(),
    x='police_force',y='sum',
            order=(train.astype({'police_force':'str'}).groupby('police_force')['number_of_casualties'].sum().
                   sort_values(ascending=False).index));
plt.xlabel("police_force",fontsize=18);
plt.xticks(fontsize=15) ;

In [38]:
# Accident count per police force, bars sorted from most to fewest accidents.
# The bar heights and the bar order are both derived from the same
# accident_id count, and only the police_force column is cast to str
# (instead of copying the whole frame twice).
accidents_by_force=(train.groupby(train['police_force'].astype('str'))['accident_id']
                    .count().sort_values(ascending=False).reset_index(name='count'))
plt.figure(figsize=(25,7))
sns.barplot(data=accidents_by_force,x='police_force',y='count',
            order=accidents_by_force['police_force']);
plt.xlabel("police_force",fontsize=18);
plt.xticks(fontsize=15);

#### The above boxplot the accident risk index's median values is almost similar.

#### There are some less accident risk index in the some areas.

#### There average number of police force is 31.

#### The number of police force ranges from 1 to 98.

#### The area which has 24 police force is recorded the very lowest accident risk index(1.0).

#### There is a lot of differences between most road accident locations and the locations which have the most accident casualties and also most accident risk index locations.

### Let's see the number of vehicles on accident location and compare it with the accident risk index.

In [39]:
count_plot(train,col='number_of_vehicles')

In [40]:
train['number_of_vehicles'].value_counts()

In [41]:
side_by_side_plot(train,'number_of_vehicles','accident_risk_index')

In [42]:
group_summary(train,'number_of_vehicles','accident_risk_index')

### Let's compare the state-wise accident risk index by each number of vehicles at the accident location.

In [43]:
sns.catplot(x='number_of_vehicles',y='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [44]:
group_summary(train,['state','number_of_vehicles'],'accident_risk_index')

#### The above plot explains that there are mostly 1 or 2 vehicles at the location of the accident.

#### The average accident risk index is almost the same in all vehicle groups. 



### Let's compare the state-wise total accident by each number of vehicles at the accident location.

In [45]:
sns.catplot(x='number_of_vehicles',y='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'number_of_vehicles':'str'}).groupby(['state','number_of_vehicles'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");

In [46]:
(train.astype({'number_of_vehicles':'str'}).groupby(['state','number_of_vehicles'])
            .agg({'accident_id':'count'}).reset_index().sort_values('accident_id',ascending=False))

#### In all states, most of the accidents happened due to two-vehicle crashes.

### Let's compare the state-wise total accident casualties by each number of vehicles at the accident location.

In [47]:
sns.catplot(x='number_of_vehicles',y='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'number_of_vehicles':'str'}).groupby(['state','number_of_vehicles'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");

In [48]:
(train.astype({'number_of_vehicles':'str'}).groupby(['state','number_of_vehicles'])
            .agg({'number_of_casualties':'sum'}).reset_index().sort_values('number_of_casualties',ascending=False))

#### In all states, most of the accident casualties happened due to two-vehicle crashes.

### Let's compare the accident risk index with road type.

In [49]:
count_plot(train,'road_type',rot=70)

In [50]:
train['road_type'].value_counts()

In [51]:
side_by_side_plot(train,'road_type','accident_risk_index',rot=60)

In [52]:
group_summary(train,'road_type','accident_risk_index')

#### The above plot explains that in all area most of roads are single carriageway.

#### The average accident risk index is almost the same in all road types. 



### Let's compare the state-wise accident risk index by type of the road.

In [53]:
sns.catplot(y='road_type',x='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [54]:
group_summary(train,['state','road_type'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar in all road types.

### Let's compare the state-wise total accident count by type of the road.

In [55]:
sns.catplot(y='road_type',x='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'road_type':'str'}).groupby(['state','road_type'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [56]:
(train.astype({'road_type':'str'}).groupby(['state','road_type'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### Most of the accidents happened in single carriageways and in the state of England.

### Let's compare the state-wise total accident casualties by type of the road.

In [57]:
sns.catplot(y='road_type',x='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'road_type':'str'}).groupby(['state','road_type'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [58]:
(train.astype({'road_type':'str'}).groupby(['state','road_type'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### Most of the accident casualties are happened in single carriageway and in the state of England.

### Let's compare the accident risk index with speed limit.

In [59]:
count_plot(train,'speed_limit')

In [60]:
train['speed_limit'].value_counts()

In [61]:
side_by_side_plot(train,'speed_limit','accident_risk_index')

In [62]:
group_summary(train,'speed_limit','accident_risk_index')

#### The above plot explains that in all area most of roads have 30mph speed limit.

#### The average accident risk index is almost the same in all speed limit category. 

### Let's compare the state-wise accident risk index by speed limit.

In [63]:
sns.catplot(x='speed_limit',y='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [64]:
group_summary(train,['state','speed_limit'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar in all speed limit categories.

### Let's see the speed limit of each road type.

In [65]:
sns.catplot(y='road_type',x='speed_limit',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

#### The above plot explains that the speed limit changes based on the road types in all states of the united kingdom. 

### Let's compare the state-wise total accident count by speed limit category.

In [66]:
sns.catplot(x='speed_limit',y='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'speed_limit':'str'}).groupby(['state','speed_limit'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [67]:
(train.astype({'speed_limit':'str'}).groupby(['state','speed_limit'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### Most of the accidents happened in 30mph speed limit roads and in the state of England.

### Let's compare the state-wise total accident casualties by speed limit category.

In [68]:
sns.catplot(x='speed_limit',y='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'speed_limit':'str'}).groupby(['state','speed_limit'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [69]:
# Total casualties per state and speed limit, matching the plot above
# (this table previously grouped by road_type by mistake).
(train.astype({'speed_limit':'str'}).groupby(['state','speed_limit'])
            ['number_of_casualties'].agg(['sum']).reset_index().sort_values('sum',ascending=False))

#### Most of the accident casualties happened in 30mph speed limit roads and in the state of England.

#### Let's see how the accident risk index changes on the 1st and 2nd road class types.

In [70]:
count_plot(train,'1st_road_class')
count_plot(train,'2nd_road_class')

In [71]:
train['1st_road_class'].value_counts()

In [72]:
train['2nd_road_class'].value_counts()

In [73]:
side_by_side_plot(train,'1st_road_class','accident_risk_index')

In [74]:
group_summary(train,'1st_road_class','accident_risk_index')

#### The average accident risk index is almost the same in all categories of 1st road class. 

In [75]:
side_by_side_plot(train,'2nd_road_class','accident_risk_index')

In [76]:
group_summary(train,'2nd_road_class','accident_risk_index')

#### The average accident risk index is almost the same in all categories of 2nd road class. 

### Let's compare the state-wise accident risk index by type of 1st road class.

In [77]:
sns.catplot(x='1st_road_class',y='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [78]:
group_summary(train,['state','1st_road_class'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar in all categories of 1st road class.

### Let's compare the state-wise accident risk index by type of 2nd road class.

In [79]:
sns.catplot(x='2nd_road_class',y='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [80]:
group_summary(train,['state','2nd_road_class'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar in all categories of 2nd road class.

### Let's compare the state-wise total accident count by 1st road class.

In [81]:
sns.catplot(x='1st_road_class',y='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'1st_road_class':'str'}).groupby(['state','1st_road_class'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [82]:
(train.astype({'1st_road_class':'str'}).groupby(['state','1st_road_class'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### Most of the accidents happened in road class 3, 6 and in the state of England.

### Let's compare the state-wise total accident count by 2nd road class.

In [83]:
sns.catplot(x='2nd_road_class',y='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'2nd_road_class':'str'}).groupby(['state','2nd_road_class'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [84]:
(train.astype({'2nd_road_class':'str'}).groupby(['state','2nd_road_class'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### Most of the accidents happened in road class -1, 6 and in all the states of the United Kingdom.

### Let's compare the state-wise total accident casualties by 1st road class.

In [85]:
sns.catplot(x='1st_road_class',y='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'1st_road_class':'str'}).groupby(['state','1st_road_class'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [86]:
(train.astype({'1st_road_class':'str'}).groupby(['state','1st_road_class'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### Most of the accident casualties happened in 1st road classes 3, 6, and in all states of the united kingdom.

### Let's compare the state-wise total accident casualties by 2nd road class.

In [87]:
sns.catplot(x='2nd_road_class',y='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'2nd_road_class':'str'}).groupby(['state','2nd_road_class'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [88]:
(train.astype({'2nd_road_class':'str'}).groupby(['state','2nd_road_class'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### Most of the accident casualties happened in 2nd road classes -1, 6, and in all states of the united kingdom.

#### Let's see whether the pedestrian crossing human control system will reduce the road accident risk index?

In [89]:
count_plot(train,'pedestrian_crossing_human_control',rot=60)

In [90]:
train['pedestrian_crossing_human_control'].value_counts()

#### Most road pedestrian crossing doesn't have any human control system facility. If it's present it's mostly controlled by the authorized person.

In [91]:
side_by_side_plot(train,'pedestrian_crossing_human_control','accident_risk_index')

In [92]:
group_summary(train,'pedestrian_crossing_human_control','accident_risk_index')

#### There is no significant correlation between the average accident risk index and the road which have pedestrian crossing human control facility.

### Let's compare the state-wise accident risk index with pedestrian crossing human control system facility.

In [93]:
sns.catplot(y='pedestrian_crossing_human_control',x='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [94]:
group_summary(train,['state','pedestrian_crossing_human_control'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar whether the road has pedestrian crossing system or not.

### Let's compare the state-wise total accident count by pedestrian crossing system.

In [95]:
sns.catplot(y='pedestrian_crossing_human_control',x='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'pedestrian_crossing_human_control':'str'}).groupby(['state','pedestrian_crossing_human_control'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [96]:
(train.astype({'pedestrian_crossing_human_control':'str'}).groupby(['state','pedestrian_crossing_human_control'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### In all the states of the united kingdom, the area which doesn't have a pedestrian crossing human control system is the high accident area.

### Let's compare the state-wise total accident casualties by pedestrian crossing human system.

In [97]:
sns.catplot(y='pedestrian_crossing_human_control',x='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'pedestrian_crossing_human_control':'str'}).groupby(['state','pedestrian_crossing_human_control'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [98]:
(train.astype({'pedestrian_crossing_human_control':'str'}).groupby(['state','pedestrian_crossing_human_control'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### In all the states of the united kingdom, the area which doesn't have a pedestrian crossing human control system is the high accident casualties area.

#### Let's see whether the pedestrian crossing physical facilities will reduce the road accident risk index?

In [99]:
count_plot(train,'pedestrian_crossing_physical_facilities',rot=80)

In [100]:
train['pedestrian_crossing_physical_facilities'].value_counts()

#### Most roads don't have a physical pedestrian crossing facility.

In [101]:
side_by_side_plot(train,'pedestrian_crossing_physical_facilities','accident_risk_index',rot=70)

In [102]:
group_summary(train,'pedestrian_crossing_physical_facilities','accident_risk_index')

#### There is no significant correlation between the average accident risk index and whether the road has pedestrian crossing physical facilities. But if the road has a footbridge or subway, the accident risk index is very low compared to other pedestrian crossing facilities.

### Let's compare the state-wise accident risk index with pedestrian crossing physical facilities.

In [103]:
sns.catplot(y='pedestrian_crossing_physical_facilities',x='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [104]:
group_summary(train,['state','pedestrian_crossing_physical_facilities'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar if the road has a physical pedestrian crossing facility or not. Except for footbridge or subway crossing facility.

### Let's compare the state-wise total accident count by pedestrian crossing facility.

In [105]:
sns.catplot(y='pedestrian_crossing_physical_facilities',x='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'pedestrian_crossing_physical_facilities':'str'}).groupby(['state','pedestrian_crossing_physical_facilities'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [106]:
(train.astype({'pedestrian_crossing_physical_facilities':'str'}).groupby(['state','pedestrian_crossing_physical_facilities'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### In all the states of the united kingdom, the area which doesn't have a pedestrian crossing facility is the high accident area.

### Let's compare the state-wise total accident casualties by pedestrian crossing facility.

In [107]:
sns.catplot(y='pedestrian_crossing_physical_facilities',x='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'pedestrian_crossing_physical_facilities':'str'}).groupby(['state','pedestrian_crossing_physical_facilities'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [108]:
(train.astype({'pedestrian_crossing_physical_facilities':'str'}).groupby(['state','pedestrian_crossing_physical_facilities'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### In all the states of the united kingdom, the area which doesn't have a pedestrian crossing facility is the high accident casualties area.

#### Let's see road's lighting condition and accident risk index.

In [109]:
count_plot(train,'light_conditions',rot=80)

In [110]:
train['light_conditions'].value_counts()

#### Most of the accidents happened in daylight.

In [111]:
side_by_side_plot(train,'light_conditions','accident_risk_index',rot=70)

In [112]:
group_summary(train,'light_conditions','accident_risk_index')

#### There is no significant correlation between the average accident risk index and the road's lighting condition. 

### Let's compare the state-wise accident risk index with light conditions.

In [113]:
sns.catplot(y='light_conditions',x='accident_risk_index',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=train,
                 height=4, aspect=.9, 
                kind="box");

In [114]:
group_summary(train,['state','light_conditions'],'accident_risk_index')

#### The above plot explains that in all states the accident risk index is almost similar if the light condition is darker or daylight. Except for the lighting condition darkness without light.

### Let's compare the state-wise total accident count by light condition.

In [115]:
sns.catplot(y='light_conditions',x='count',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'light_conditions':'str'}).groupby(['state','light_conditions'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [116]:
(train.astype({'light_conditions':'str'}).groupby(['state','light_conditions'])
            ['accident_id'].agg({'count'}).reset_index().sort_values('count',ascending=False))

#### In all the states of the united kingdom,   higher number of accidents in daylight, and darkness.

### Let's compare the state-wise total accident casualties by light condition.

In [117]:
sns.catplot(y='light_conditions',x='sum',
                 col='state',col_wrap=3,sharey=False,sharex=False,
            data=(train.astype({'light_conditions':'str'}).groupby(['state','light_conditions'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False)),
                 height=4, aspect=.9,kind="bar");               

In [118]:
(train.astype({'light_conditions':'str'}).groupby(['state','light_conditions'])
            ['number_of_casualties'].agg({'sum'}).reset_index().sort_values('sum',ascending=False))

#### In all the states of the united kingdom,   higher number of accident casualties in daylight, and darkness.

#### Let's see weather condition and accident risk index.

In [119]:
count_plot(train,'weather_conditions',rot=80)

In [120]:
train['weather_conditions'].value_counts()

#### Fine weather conditions recorded the most number of accidents.

In [121]:
side_by_side_plot(train,'weather_conditions','accident_risk_index',rot=70)

In [122]:
group_summary(train,'weather_conditions','accident_risk_index')

#### There is no significant correlation between the average accident risk index and the weather condition. But the accident risk index is low on snowy and high wind weather conditions.

### Let's compare the state-wise accident risk index with weather conditions.

In [123]:
# One box-plot panel per state: risk index distribution for each weather condition.
sns.catplot(
    data=train,
    x='accident_risk_index', y='weather_conditions',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='box', height=4, aspect=.9,
);

In [124]:
# NOTE(review): group_summary is not defined in this part of the notebook; presumably it
# summarises accident_risk_index per (state, weather_conditions) group — confirm.
group_summary(train,['state','weather_conditions'],'accident_risk_index')

#### The above plot shows that, in all states, the accident risk index is similar across weather conditions, except for snowy and high-wind conditions.

### Let's compare the state-wise total accident count by weather condition.

In [125]:
# One bar panel per state: number of accidents for each weather condition.
sns.catplot(
    data=(
        train
        .astype({'weather_conditions': 'str'})
        .groupby(['state', 'weather_conditions'])['accident_id']
        .agg(['count'])
        .reset_index()
        .sort_values(by='count', ascending=False)
    ),
    x='count', y='weather_conditions',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [126]:
# Accident count per (state, weather condition), largest groups first.
(
    train
    .astype({'weather_conditions': 'str'})
    .groupby(['state', 'weather_conditions'])['accident_id']
    .agg(['count'])
    .reset_index()
    .sort_values(by='count', ascending=False)
)

#### In all the states of the United Kingdom, the highest number of accidents occurs in fine weather conditions.

### Let's compare the state-wise total accident casualties by weather condition.

In [129]:
# One bar panel per state: total casualties for each weather condition.
sns.catplot(
    data=(
        train
        .astype({'weather_conditions': 'str'})
        .groupby(['state', 'weather_conditions'])['number_of_casualties']
        .agg(['sum'])
        .reset_index()
        .sort_values(by='sum', ascending=False)
    ),
    x='sum', y='weather_conditions',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [130]:
# Total casualties per (state, weather condition), largest totals first.
(
    train
    .astype({'weather_conditions': 'str'})
    .groupby(['state', 'weather_conditions'])['number_of_casualties']
    .agg(['sum'])
    .reset_index()
    .sort_values(by='sum', ascending=False)
)

#### In all the states of the United Kingdom, the highest number of accident casualties occurs in fine weather conditions.

#### Let's see how accident risk index changes on rural and urban areas.

In [131]:
# NOTE(review): count_plot is not defined in this part of the notebook; presumably it draws
# a count plot of the urban_or_rural_area column — confirm.
count_plot(train,'urban_or_rural_area')

In [132]:
# Row count per area type, most frequent first (value_counts() skips missing values by default).
train['urban_or_rural_area'].value_counts()

#### Most accidents occur in urban areas.

In [133]:
# NOTE(review): side_by_side_plot is not defined in this part of the notebook; presumably it
# plots accident_risk_index against urban_or_rural_area in paired panels — confirm.
side_by_side_plot(train,'urban_or_rural_area','accident_risk_index')

In [134]:
# NOTE(review): group_summary is not defined in this part of the notebook; presumably it
# summarises accident_risk_index within each urban_or_rural_area group — confirm.
group_summary(train,'urban_or_rural_area','accident_risk_index')

#### There is no significant correlation between the average accident risk index and the area type.

### Let's compare the state-wise accident risk index with area types.

In [135]:
# One box-plot panel per state: risk index distribution for each area type.
sns.catplot(
    data=train,
    x='urban_or_rural_area', y='accident_risk_index',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='box', height=4, aspect=.9,
);

In [136]:
# NOTE(review): group_summary is not defined in this part of the notebook; presumably it
# summarises accident_risk_index per (state, urban_or_rural_area) group — confirm.
group_summary(train,['state','urban_or_rural_area'],'accident_risk_index')

#### The above plot shows that, in all states, the accident risk index is similar whether the area is urban or rural.

### Let's compare the state-wise total accident count by area type.

In [137]:
# One bar panel per state: number of accidents for each area type.
sns.catplot(
    data=(
        train
        .astype({'urban_or_rural_area': 'str'})
        .groupby(['state', 'urban_or_rural_area'])['accident_id']
        .agg(['count'])
        .reset_index()
        .sort_values(by='count', ascending=False)
    ),
    x='urban_or_rural_area', y='count',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [138]:
# Accident count per (state, area type), largest groups first.
(
    train
    .astype({'urban_or_rural_area': 'str'})
    .groupby(['state', 'urban_or_rural_area'])['accident_id']
    .agg(['count'])
    .reset_index()
    .sort_values(by='count', ascending=False)
)

#### In all the states of the United Kingdom, more accidents occur in urban areas.

### Let's compare the state-wise total accident casualties by area types.

In [139]:
# One bar panel per state: total casualties for each area type.
sns.catplot(
    data=(
        train
        .astype({'urban_or_rural_area': 'str'})
        .groupby(['state', 'urban_or_rural_area'])['number_of_casualties']
        .agg(['sum'])
        .reset_index()
        .sort_values(by='sum', ascending=False)
    ),
    x='urban_or_rural_area', y='sum',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [140]:
# Total casualties per (state, area type), largest totals first.
(
    train
    .astype({'urban_or_rural_area': 'str'})
    .groupby(['state', 'urban_or_rural_area'])['number_of_casualties']
    .agg(['sum'])
    .reset_index()
    .sort_values(by='sum', ascending=False)
)

#### In all the states of the United Kingdom, more accident casualties occur in urban areas.

#### Let's see whether the police officers have attended the accident spot or not.

In [141]:
# NOTE(review): count_plot is not defined in this part of the notebook; presumably it draws
# a count plot of the did_police_officer_attend_scene_of_accident column — confirm.
count_plot(train,'did_police_officer_attend_scene_of_accident')

In [142]:
# Row count per attendance value, most frequent first (value_counts() skips missing values by default).
train['did_police_officer_attend_scene_of_accident'].value_counts()

#### Most road accidents scenes are attended by the police. 

In [143]:
# NOTE(review): side_by_side_plot is not defined in this part of the notebook; presumably it
# plots accident_risk_index against police attendance in paired panels — confirm.
side_by_side_plot(train,'did_police_officer_attend_scene_of_accident','accident_risk_index')

In [144]:
# NOTE(review): group_summary is not defined in this part of the notebook; presumably it
# summarises accident_risk_index within each police-attendance group — confirm.
group_summary(train,'did_police_officer_attend_scene_of_accident','accident_risk_index')

#### The accident risk index is the same in both cases.

### Let's compare the state-wise accident risk index with whether the police officers have attended the accident spot or not.

In [145]:
# One box-plot panel per state: risk index distribution by police attendance.
sns.catplot(
    data=train,
    x='did_police_officer_attend_scene_of_accident', y='accident_risk_index',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='box', height=4, aspect=.9,
);

In [146]:
# NOTE(review): group_summary is not defined in this part of the notebook; presumably it
# summarises accident_risk_index per (state, police attendance) group — confirm.
group_summary(train,['state','did_police_officer_attend_scene_of_accident'],'accident_risk_index')

#### The above plot shows that the accident risk index is almost the same in all states.

### Let's compare the state-wise total accident count with whether the police officers have attended the accident spot or not.

In [147]:
# One bar panel per state: number of accidents by police attendance.
sns.catplot(
    data=(
        train
        .astype({'did_police_officer_attend_scene_of_accident': 'str'})
        .groupby(['state', 'did_police_officer_attend_scene_of_accident'])['accident_id']
        .agg(['count'])
        .reset_index()
        .sort_values(by='count', ascending=False)
    ),
    x='did_police_officer_attend_scene_of_accident', y='count',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [148]:
# Accident count per (state, police attendance), largest groups first.
(
    train
    .astype({'did_police_officer_attend_scene_of_accident': 'str'})
    .groupby(['state', 'did_police_officer_attend_scene_of_accident'])['accident_id']
    .agg(['count'])
    .reset_index()
    .sort_values(by='count', ascending=False)
)

#### In all the states of the United Kingdom, most accident scenes are attended by the police.

### Let's compare the state-wise total accident casualties with whether the police officers have attended the accident spot or not.

In [149]:
# One bar panel per state: total casualties by police attendance.
sns.catplot(
    data=(
        train
        .astype({'did_police_officer_attend_scene_of_accident': 'str'})
        .groupby(['state', 'did_police_officer_attend_scene_of_accident'])['number_of_casualties']
        .agg(['sum'])
        .reset_index()
        .sort_values(by='sum', ascending=False)
    ),
    x='did_police_officer_attend_scene_of_accident', y='sum',
    col='state', col_wrap=3,
    sharex=False, sharey=False,
    kind='bar', height=4, aspect=.9,
);

In [150]:
# Total casualties per (state, police attendance), largest totals first.
(
    train
    .astype({'did_police_officer_attend_scene_of_accident': 'str'})
    .groupby(['state', 'did_police_officer_attend_scene_of_accident'])['number_of_casualties']
    .agg(['sum'])
    .reset_index()
    .sort_values(by='sum', ascending=False)
)

#### In all the states of the United Kingdom, most accident casualties occur at scenes attended by the police.

### Police-force-wise, let's see how many accident scenes police officers attended and how many they did not.

In [151]:
# Accidents per police force, split by whether an officer attended the scene.
plt.figure(figsize=(20, 8));
sns.countplot(
    data=train,
    x='police_force',
    hue='did_police_officer_attend_scene_of_accident',
);

### Let's do some feature engineering on date and time column.

In [152]:
def pre_process(df):
    """Add calendar and time-of-day feature columns derived from ``date`` and ``time``.

    The frame is modified in place and also returned, so both
    ``pre_process(df)`` and ``df = pre_process(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (parsed with ``pd.to_datetime`` after a
        cast to ``str``) and a ``time`` column (parsed with ``errors='coerce'``,
        so missing or unparseable times give a NaN ``hour``).

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added: ``date_time``, ``day``,
        ``day_label``, ``day_number`` (Monday=0), ``month_number``,
        ``month_label``, ``year_quarter``, ``week_of_year`` (ISO week),
        ``year``, ``dayofmonth``, ``dayofyear``, ``hour``, ``seasons`` and
        ``timings``.
    """
    # Month number -> season; any other value maps to "".
    season_by_month = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
    }

    # Hour of day -> part of day; any other value (including NaN) maps to 'X'.
    timing_by_hour = {}
    for label, hours in (
        ('Night', range(20, 23)),
        ('Morning', range(5, 12)),
        ('Afternoon', range(12, 16)),
        ('Evening', range(16, 20)),
        ('Midnight', (23, 0, 1, 2, 3, 4)),
    ):
        timing_by_hour.update(dict.fromkeys(hours, label))

    df['date_time'] = pd.to_datetime(df['date'].astype('str'))
    stamp = df['date_time'].dt

    df['day'] = stamp.day
    df['day_label'] = stamp.day_name()
    df['day_number'] = stamp.dayofweek
    df['month_number'] = stamp.month
    df['month_label'] = stamp.strftime('%b')
    df['year_quarter'] = stamp.quarter

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the supported equivalent. It returns a nullable
    # UInt32, so cast back to the plain dtype `.dt.week` used to produce
    # (int64, or float64 when some dates are missing).
    iso_week = stamp.isocalendar().week
    if iso_week.isna().any():
        df['week_of_year'] = iso_week.astype('float64')
    else:
        df['week_of_year'] = iso_week.astype('int64')

    df['year'] = stamp.year
    # NOTE: despite its name this column holds the number of days in the month
    # (28-31); the day of the month itself is in `day`. The name is kept so
    # that code relying on the existing column keeps working.
    df['dayofmonth'] = stamp.daysinmonth
    df['dayofyear'] = stamp.day_of_year
    df['hour'] = pd.to_datetime(df['time'], errors='coerce').dt.hour

    df['seasons'] = df['month_number'].apply(lambda month: season_by_month.get(month, ""))
    df['timings'] = df['hour'].apply(lambda hour: timing_by_hour.get(hour, 'X'))

    return df

In [153]:
# Add the date/time-derived feature columns to `train` (pre_process mutates its argument and returns it).
train=pre_process(train)

### Let's see how the number of accidents attended and not attended by police officers changes over the year.

In [154]:
# Daily number of accidents, one line per police-attendance value.
plt.figure(figsize=(20,8));
sns.lineplot(
    data=(
        train
        .groupby(['date_time','did_police_officer_attend_scene_of_accident'])
        ['did_police_officer_attend_scene_of_accident']
        .agg(['count'])
        .reset_index()
    ),
    x='date_time', y='count',
    hue='did_police_officer_attend_scene_of_accident',
);  # trailing ';' keeps the Axes repr out of the cell output, as in the other plot cells


### Month-wise,

In [155]:
# Monthly number of accidents, one line per police-attendance value.
plt.figure(figsize=(20,8));
sns.lineplot(
    data=(
        train
        .groupby(['month_number','did_police_officer_attend_scene_of_accident'])
        ['did_police_officer_attend_scene_of_accident']
        .agg(['count'])
        .reset_index()
    ),
    x='month_number', y='count',
    hue='did_police_officer_attend_scene_of_accident',
)
plt.xticks(range(1,13));


#### Over the 12 months, the number of accidents attended by police officers rises in February, May, July, and November.

#### Over the 12 months, the number of accidents not attended by police officers rises in February and November.

### Week-wise,

In [156]:
# Weekly number of accidents, one line per police-attendance value.
plt.figure(figsize=(20,8));
sns.lineplot(
    data=(
        train
        .groupby(['week_of_year','did_police_officer_attend_scene_of_accident'])
        ['did_police_officer_attend_scene_of_accident']
        .agg(['count'])
        .reset_index()
    ),
    x='week_of_year', y='count',
    hue='did_police_officer_attend_scene_of_accident',
)
plt.xticks(range(1,53));


### Hour-wise

In [157]:
# Hourly number of accidents, one line per police-attendance value.
plt.figure(figsize=(20,8));
sns.lineplot(
    data=(
        train
        .groupby(['hour','did_police_officer_attend_scene_of_accident'])
        ['did_police_officer_attend_scene_of_accident']
        .agg(['count'])
        .reset_index()
    ),
    x='hour', y='count',
    hue='did_police_officer_attend_scene_of_accident',
)
plt.xticks(range(0,24));

### Day of Week wise,

In [158]:
# Number of accidents per day of the week, one line per police-attendance value.
plt.figure(figsize=(20,8));
sns.lineplot(
    data=(
        train
        .groupby(['day_number','did_police_officer_attend_scene_of_accident'])
        ['did_police_officer_attend_scene_of_accident']
        .agg(['count'])
        .reset_index()
    ),
    x='day_number', y='count',
    hue='did_police_officer_attend_scene_of_accident',
)
# day_number comes from dt.dayofweek, i.e. 0 = Monday ... 6 = Sunday.
plt.xticks(range(0,7),['Monday','Tuesday','Wednesday','Thursday','Friday', 'Saturday','Sunday']);

### Let's see how the accident risk index changes over one year.

In [159]:
# Accident risk index over the year, with a reference line at the mean of the per-day means.
plt.figure(figsize=(20,6))
yr=sns.lineplot(x=train['date_time'],y=train['accident_risk_index'],color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
yr.axhline(train.groupby('date_time')['accident_risk_index'].mean().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.legend();

#### The above plot shows that the accident risk index fluctuates considerably over the year.

#### In some months the accident risk index is well below the yearly average.

### Let's see how the accident count changes over one year.

In [160]:
# Daily accident count over the year, with a reference line at the mean daily count.
plt.figure(figsize=(20,6))
accident_total=sns.lineplot(data=train.groupby('date_time')['accident_id'].agg(['count']).reset_index(),
               x='date_time',y='count',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
accident_total.axhline(train.groupby('date_time')['accident_id'].count().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.legend();

#### The above plot shows that the accident count fluctuates considerably over the year.

#### In some months the accident count is well below the yearly average.

### Let's see how the accident casualties changes over one year.

In [161]:
# Daily casualty total over the year, with a reference line at the mean daily total.
plt.figure(figsize=(20,6))
p=sns.lineplot(data=train.groupby('date_time')['number_of_casualties'].agg(['sum']).reset_index(),
               x='date_time',y='sum',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
p.axhline(train.groupby('date_time')['number_of_casualties'].sum().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.legend();

#### The above plot shows that the number of accident casualties fluctuates considerably over the year.

#### In some months the number of accident casualties is well below the yearly average.

### Let's see how the accident risk index changes over 12 months.

In [162]:
# Accident risk index by month, with a reference line at the mean of the monthly means.
plt.figure(figsize=(20,6))
yr=sns.lineplot(x=train['month_number'],y=train['accident_risk_index'],color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
yr.axhline(train.groupby('month_number')['accident_risk_index'].mean().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,13));
plt.legend();

#### The above plot shows that the accident risk index fluctuates considerably over the 12 months.

#### From August onwards, the accident risk index is well below the monthly average.

### Let's see how the accident count changes over 12 months.

In [163]:
# Monthly accident count, with a reference line at the mean monthly count.
plt.figure(figsize=(20,6))
accident_total=sns.lineplot(data=train.groupby('month_number')['accident_id'].agg(['count']).reset_index(),
               x='month_number',y='count',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
accident_total.axhline(train.groupby('month_number')['accident_id'].count().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,13));
plt.legend();

#### The above plot shows that the accident count fluctuates considerably over the 12 months.

#### From June onwards, the accident count is well below the monthly average.

### Let's see how the accident casualties changes over 12 months.

In [164]:
# Monthly casualty total, with a reference line at the mean monthly total.
plt.figure(figsize=(20,6))
p=sns.lineplot(data=train.groupby('month_number')['number_of_casualties'].agg(['sum']).reset_index(),
               x='month_number',y='sum',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
p.axhline(train.groupby('month_number')['number_of_casualties'].sum().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,13));
plt.legend();

#### The above plot shows that the number of accident casualties fluctuates considerably over the 12 months.

#### From June onwards, the number of accident casualties is well below the monthly average.

### Let's see how the accident risk index changes over 52 weeks of the year.

In [165]:
# Accident risk index by week of year, with a reference line at the mean of the weekly means.
plt.figure(figsize=(20,6))
yr=sns.lineplot(x=train['week_of_year'],y=train['accident_risk_index'],color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
yr.axhline(train.groupby('week_of_year')['accident_risk_index'].mean().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,53));
plt.legend();

#### The above plot shows that the accident risk index fluctuates considerably over the 52 weeks of the year.

### Let's see how the accident count changes over 52 weeks of the year.

In [166]:
# Weekly accident count, with a reference line at the mean weekly count.
plt.figure(figsize=(20,6))
accident_total=sns.lineplot(data=train.groupby('week_of_year')['accident_id'].agg(['count']).reset_index(),
               x='week_of_year',y='count',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
accident_total.axhline(train.groupby('week_of_year')['accident_id'].count().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,53));
plt.legend();

#### The above plot shows that the accident count fluctuates considerably over the 52 weeks of the year.

### Let's see how the accident casualties changes over 52 weeks of the year.

In [167]:
# Weekly casualty total, with a reference line at the mean weekly total.
plt.figure(figsize=(20,6))
p=sns.lineplot(data=train.groupby('week_of_year')['number_of_casualties'].agg(['sum']).reset_index(),
               x='week_of_year',y='sum',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
p.axhline(train.groupby('week_of_year')['number_of_casualties'].sum().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(1,53));
plt.legend();

#### The above plot shows that the number of accident casualties fluctuates considerably over the 52 weeks of the year.

### Let's see how the accident risk index changes over 24 hours of the day.

In [168]:
# Accident risk index by hour of day, with a reference line at the mean of the hourly means.
plt.figure(figsize=(20,6))
yr=sns.lineplot(x=train['hour'],y=train['accident_risk_index'],color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
yr.axhline(train.groupby('hour')['accident_risk_index'].mean().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(0,24));
plt.legend();

#### The above plot shows that the accident risk index fluctuates considerably over the 24 hours of the day.

#### Between 2 a.m. and 5 a.m., the accident risk index is lower than the average across the hours of the day.

### Let's see how the accident count changes over 24 hours of the day.

In [169]:
# Accident count per hour of day, with a reference line at the mean hourly count.
plt.figure(figsize=(20,6))
accident_total=sns.lineplot(data=train.groupby('hour')['accident_id'].agg(['count']).reset_index(),
               x='hour',y='count',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
accident_total.axhline(train.groupby('hour')['accident_id'].count().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(0,24));
plt.legend();

#### The above plot shows that the accident count fluctuates considerably over the 24 hours of the day.

#### From midnight to 5 a.m., the accident count is lower than the average hourly accident count.

#### From 7 a.m. to 7 p.m., the accident count is higher than the average hourly accident count.

### Let's see how the accident casualties changes over 24 hours of the day.

In [170]:
# Casualty total per hour of day, with a reference line at the mean hourly total.
plt.figure(figsize=(20,6))
p=sns.lineplot(data=train.groupby('hour')['number_of_casualties'].agg(['sum']).reset_index(),
               x='hour',y='sum',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
p.axhline(train.groupby('hour')['number_of_casualties'].sum().mean(),
          ls=":",color="Black",lw=5,label='Average');
plt.xticks(range(0,24));
plt.legend();

#### The above plot shows that the number of accident casualties fluctuates considerably over the 24 hours of the day.

#### From midnight to 5 a.m., the number of accident casualties is lower than the average hourly number of casualties.


#### From 7 a.m. to 7 p.m., the number of accident casualties is higher than the average hourly number of casualties.

### Let's see how the accident risk index changes over 7 days of the week. 

In [171]:
# Accident risk index by day of week, with a reference line at the mean of the per-day means.
plt.figure(figsize=(20,6))
yr=sns.lineplot(x=train['day_number'],y=train['accident_risk_index'],color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
yr.axhline(train.groupby('day_number')['accident_risk_index'].mean().mean(),
          ls=":",color="Black",lw=5,label='Average');
# day_number comes from dt.dayofweek, i.e. 0 = Monday ... 6 = Sunday.
plt.xticks(range(0,7),['Monday','Tuesday','Wednesday','Thursday','Friday', 'Saturday','Sunday']);
# Show the 'Average' label, as the other trend plots in this notebook do.
plt.legend();

#### The above plot shows that the accident risk index fluctuates considerably over the 7 days of the week.

#### On Wednesday and at weekends, the accident risk index is lower than the average across the days of the week.

### Let's see how the accident count changes over 7 days of the week. 

In [172]:
# Accident count per day of week, with a reference line at the mean per-day count.
plt.figure(figsize=(20,6))
accident_total=sns.lineplot(data=train.groupby('day_number')['accident_id'].agg(['count']).reset_index(),
               x='day_number',y='count',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
accident_total.axhline(train.groupby('day_number')['accident_id'].count().mean(),
          ls=":",color="Black",lw=5,label='Average');
# day_number comes from dt.dayofweek, i.e. 0 = Monday ... 6 = Sunday.
plt.xticks(range(0,7),['Monday','Tuesday','Wednesday','Thursday','Friday', 'Saturday','Sunday']);
plt.legend();

#### The above plot shows that the accident count fluctuates considerably over the 7 days of the week.

#### On Wednesday and at weekends, the accident count is lower than the average across the days of the week.

### Let's see how the accident casualties changes over 7 days of the week. 

In [173]:
# Casualty total per day of week, with a reference line at the mean per-day total.
plt.figure(figsize=(20,6))
p=sns.lineplot(data=train.groupby('day_number')['number_of_casualties'].agg(['sum']).reset_index(),
               x='day_number',y='sum',color="Red")
# The average is computed as a scalar, so no positional `[0]` lookup on a
# label-indexed Series is needed (integer-key fallback is deprecated in pandas 2.1).
p.axhline(train.groupby('day_number')['number_of_casualties'].sum().mean(),
          ls=":",color="Black",lw=5,label='Average');
# day_number comes from dt.dayofweek, i.e. 0 = Monday ... 6 = Sunday.
plt.xticks(range(0,7),['Monday','Tuesday','Wednesday','Thursday','Friday', 'Saturday','Sunday']);
plt.legend();

#### The above plot shows that the number of accident casualties fluctuates considerably over the 7 days of the week.

#### On Wednesday and at weekends, the number of accident casualties is lower than the average across the days of the week.