
<div style="color: white; 
            display: fill; 
            border-radius: 5px;
            background-color: black;
            font-size: 200%;
            font-family: cursive;
            text-align: center">
EDA (RTC Severity)
</div><p><a id="top"></a>
EDA Lists:</p>
<ul>
<li><a href="#1st_road">1st_Road_Class and 1st_Road_Number</a></li>
<li><a href="#2nd_road">2nd_Road_Class and 2nd_Road_Number</a></li>
<li><a href="#carriageway_hazards">Carriageway_Hazards</a></li>
<li><a href="#date">Date</a></li>
<li><a href="#accident_severity">Accident_Severity</a></li>
<li><a href="#day_of_week">Day_of_Week</a></li>
</ul>


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide display settings: never truncate DataFrame columns, and use
# matplotlib's default style sheet for every figure.
pd.options.display.max_columns = None
plt.style.use("default")

In [None]:
# ANSI escape sequences used when printing: CBLUEBG turns on a blue
# background, CEND resets the terminal attributes.
CBLUEBG = "\x1b[44m"
CEND = "\x1b[0m"

In [None]:
# Read the accident records from the parquet file and show the resulting
# (rows, columns) shape as the cell output.
accident_parquet_path = "../data/accident_data.parquet"
df_accident = pd.read_parquet(accident_parquet_path)
df_accident.shape

In [None]:
# Preview the last three rows of the frame.
df_accident.iloc[-3:]

In [None]:
# missingno bar chart summarising how complete each column of df_accident is;
# the trailing semicolon suppresses the returned object's repr.
msno.bar(df_accident);



<p><a href="#top">START</a>
<a id="1st_road"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: 1st_Road_Class and 1st_Road_Number
</div>


In [None]:
# Report how many distinct values 1st_Road_Class takes, then list them.
first_road_class = df_accident["1st_Road_Class"]
print(f"Unique Values in {CBLUEBG} 1st_Road_Class {CEND}: {first_road_class.nunique()}, \n{first_road_class.unique()}")


In [None]:
# Utility function
def plot_road_class(data, ax, title, bins=50):
    """Draw a histogram of road numbers for a single road class.

    Parameters
    ----------
    data : array-like
        Values to histogram; passed to seaborn as the ``x`` variable.
    ax : matplotlib.axes.Axes
        Axes the histogram is drawn on.
    title : str
        Road-class label appended to the ``"Road Class - "`` title prefix.
    bins : int, default 50
        Number of histogram bins, forwarded to ``sns.histplot``. The default
        keeps the behaviour of existing three-argument calls.

    Returns
    -------
    None
        The plot is drawn on ``ax`` as a side effect.
    """
    histogram_ax = sns.histplot(x=data, bins=bins, ax=ax)
    histogram_ax.set(title=f"Road Class - {title}")


In [None]:
# One histogram of 1st_Road_Number per road class, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
ax = ax.flatten()

first_road_class = df_accident["1st_Road_Class"]
first_road_number = df_accident["1st_Road_Number"]

# Panels 0-4: the explicitly named road classes, in display order.
for panel, road_class in enumerate(["A", "B", "C", "Motorway", "A(M)"]):
    plot_road_class(first_road_number[first_road_class == road_class], ax[panel], road_class)

# Panel 5: rows whose road class is missing.
plot_road_class(first_road_number[first_road_class.isna()], ax[5], "None")

plt.tight_layout()



<div style="border-radius: 5px;
            font-size: 120%;
            text-decoration: underline;
            font-family: cursive;
            text-align: left">
Observations:
</div><ul>
<li>Most values are 0 for all classes (A, B, C, Motorway, A(M), None)</li>
<li>In the real world, it is not plausible for the road number to be 0 across every road class</li>
<li>This <code>1st_Road_Number</code> feature should be dropped as the inputs may be wrong</li>
</ul>



<p><a href="#top">START</a>
<a id="2nd_road"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: 2nd_Road_Class and 2nd_Road_Number
</div>


In [None]:
# One histogram of 2nd_Road_Number per road class, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
ax = ax.flatten()

second_road_class = df_accident["2nd_Road_Class"]
second_road_number = df_accident["2nd_Road_Number"]

# Panels 0-4: the explicitly named road classes, in display order.
for panel, road_class in enumerate(["A", "B", "C", "Motorway", "A(M)"]):
    plot_road_class(second_road_number[second_road_class == road_class], ax[panel], road_class)

# Panel 5: rows whose road class is missing.
plot_road_class(second_road_number[second_road_class.isna()], ax[5], "None")

plt.tight_layout()


In [None]:
# The "Unclassified" road class gets its own single-panel figure.
fig, ax = plt.subplots(figsize=(10, 5))

unclassified_numbers = df_accident.loc[
    df_accident["2nd_Road_Class"] == "Unclassified", "2nd_Road_Number"
]
plot_road_class(unclassified_numbers, ax, "Unclassified")



<div style="border-radius: 5px;
            font-size: 120%;
            text-decoration: underline;
            font-family: cursive;
            text-align: left">
Observations:
</div><ul>
<li>Most values are 0 for all classes (A, B, C, Motorway, A(M), None, Unclassified)</li>
<li>In the real world, it is not plausible for the road number to be 0 across every road class</li>
<li>This <code>2nd_Road_Number</code> feature should be dropped as the inputs may be wrong</li>
</ul>


In [None]:
# Summary statistics for both road-number columns, one row per column.
road_number_columns = ["1st_Road_Number", "2nd_Road_Number"]
df_accident[road_number_columns].describe().transpose()



<p><a href="#top">START</a>
<a id="carriageway_hazards"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: Carriageway_Hazards
</div>


In [None]:
# Report how many distinct values Carriageway_Hazards takes, then list them.
carriageway_hazards = df_accident["Carriageway_Hazards"]
print(f"Unique Values in {CBLUEBG} Carriageway_Hazards {CEND}: {carriageway_hazards.nunique()}, \n{carriageway_hazards.unique()}")


In [None]:
# Frequency of each Carriageway_Hazards value, drawn along the y axis.
sns.histplot(data=df_accident, y="Carriageway_Hazards", bins=50);



<h4 id="Checking-if-the-values-are-None-or-missing">Checking if the values are <code>None</code> or missing<a class="anchor-link" href="#Checking-if-the-values-are-None-or-missing">¶</a></h4>


In [None]:
# Count the entries that pandas treats as missing in Carriageway_Hazards.
hazard_missing_count = df_accident["Carriageway_Hazards"].isnull().sum()
print(f"{CBLUEBG}Missing values{CEND}: {hazard_missing_count}")


In [None]:
# First three rows whose hazard value is the literal string "None".
has_none_label = df_accident["Carriageway_Hazards"] == "None"
df_accident.loc[has_none_label].head(3)


In [None]:
# First three rows whose hazard value is missing according to pandas.
hazard_is_missing = df_accident["Carriageway_Hazards"].isna()
df_accident.loc[hazard_is_missing].head(3)



<div style="border-radius: 5px;
            font-size: 120%;
            text-decoration: underline;
            font-family: cursive;
            text-align: left">
Observations:
</div><ul>
<li>Most values are <code>None</code> which means most of the time there were no carriageway hazards on the road</li>
<li>Both missing values and values labelled <code>None</code> are interpreted by pandas as <code>None</code> (missing) values.</li>
</ul>



<p><a href="#top">START</a>
<a id="date"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: Date
</div>


In [None]:
# Show the dtype the Date column currently has.
date_dtype = df_accident["Date"].dtype
print(f"Data type of Date Column: {date_dtype}")


In [None]:
# Peek at the first seven Date entries.
df_accident["Date"].iloc[:7]



<h3 id="Task-3.1:">Task-3.1:<a class="anchor-link" href="#Task-3.1:">¶</a></h3><p>Converting <code>Date</code> column from Object to Datetime</p>


In [None]:
# Parse the day/month/year strings into datetimes, store them back on the
# frame, then confirm the column's new dtype.
parsed_dates = pd.to_datetime(df_accident["Date"], format="%d/%m/%Y")
df_accident["Date"] = parsed_dates
print(f"Data type of Date Column: {df_accident['Date'].dtype}")


In [None]:
# Display the frame's (rows, columns) shape.
df_accident.shape



<h3 id="Task-3.2:">Task-3.2:<a class="anchor-link" href="#Task-3.2:">¶</a></h3><p>Adding two extra features to the dataset (Day and Month)</p>


In [None]:
# Derive the calendar features with the vectorised .dt accessor rather than a
# per-row Python lambda. This relies on Date already being a datetime column
# (it is converted with pd.to_datetime earlier in the notebook).
df_accident["Day"] = df_accident["Date"].dt.day
df_accident["Month"] = df_accident["Date"].dt.month
df_accident.shape



<h3 id="Graph-3.1">Graph-3.1<a class="anchor-link" href="#Graph-3.1">¶</a></h3><p><code>Day</code> with <code>Accident_Severity</code></p>


In [None]:
# Day-of-month distribution of accidents, coloured by severity.
plt.figure(figsize=(20, 7))
sns.histplot(
    data=df_accident,
    x="Day",
    hue="Accident_Severity",
    palette="coolwarm",
    alpha=1.0,
    bins=100,
);



<p><strong>Observation:</strong> There are too few <code>Fatal</code> records for them to be visible in the graph</p>
<h4 id="Graph-3.1.1">Graph-3.1.1<a class="anchor-link" href="#Graph-3.1.1">¶</a></h4><p>Different graph for <code>Fatal</code> case</p>


In [None]:
# Day-of-month distribution for fatal accidents only.
# `palette` is omitted: seaborn only applies a palette when a `hue` variable
# is mapped, and this single-series plot has none.
plt.figure(figsize=(20, 7))
fatal_accidents = df_accident[df_accident["Accident_Severity"] == "Fatal"]
sns.histplot(data=fatal_accidents, x="Day", alpha=1.0, bins=75).set(title="Accident_Severity: Fatal");



<h3 id="Graph-3.2">Graph-3.2<a class="anchor-link" href="#Graph-3.2">¶</a></h3><p><code>Month</code> with <code>Accident_Severity</code></p>


In [None]:
# Month distribution of accidents, coloured by severity.
plt.figure(figsize=(20, 7))
sns.histplot(
    data=df_accident,
    x="Month",
    hue="Accident_Severity",
    palette="coolwarm",
    alpha=1.0,
    bins=25,
);



<p><strong>Observation:</strong> There are too few <code>Fatal</code> records for them to be visible in the graph</p>
<h4 id="Graph-3.2.1">Graph-3.2.1<a class="anchor-link" href="#Graph-3.2.1">¶</a></h4><p>Different graph for <code>Fatal</code> case</p>


In [None]:
# Month distribution for fatal accidents only.
# `palette` is omitted: seaborn only applies a palette when a `hue` variable
# is mapped, and this single-series plot has none.
plt.figure(figsize=(20, 7))
fatal_accidents = df_accident[df_accident["Accident_Severity"] == "Fatal"]
sns.histplot(data=fatal_accidents, x="Month", alpha=1.0, bins=25).set(title="Accident_Severity: Fatal");



<p><a href="#top">START</a>
<a id="accident_severity"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: Accident_Severity
</div>


In [None]:
# Number of accidents recorded at each severity level.
severity_counts = df_accident["Accident_Severity"].value_counts()
severity_counts


In [None]:
# Share of all rows (in percent) that fall into each severity level.
total_rows = len(df_accident)
df_accident["Accident_Severity"].value_counts().div(total_rows).mul(100)


In [None]:
# Take the wedge labels from the counts themselves so every wedge is paired
# with its own category, whatever order value_counts() returns them in.
severity_counts = df_accident["Accident_Severity"].value_counts()
labels = severity_counts.index.tolist()
plt.pie(severity_counts, labels=labels, autopct="%.2f%%");



<div style="border-radius: 5px;
            font-size: 120%;
            text-decoration: underline;
            font-family: cursive;
            text-align: left">
Observations:
</div><ul>
<li><code>Fatal</code> cases make up only 1.38% of the data, so a model trained on it will be biased towards <code>Slight</code> cases.</li>
</ul>



<p><a href="#top">START</a>
<a id="day_of_week"></a></p>
<div style="color: black; 
            display: fill; 
            border-radius: 5px;
            background-color: yellow;
            font-size: 150%;
            font-family: cursive;
            text-align: left">
Feature: Day_of_Week
</div>


In [None]:
# Left panel: every severity by weekday. Right panel: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
all_severity_ax, fatal_ax = ax

# Arguments shared by both histograms.
weekday_hist_kwargs = dict(
    x="Day_of_Week",
    hue="Accident_Severity",
    palette="coolwarm",
    alpha=1.0,
    bins=50,
)

sns.histplot(df_accident, ax=all_severity_ax, **weekday_hist_kwargs)

fatal_only = df_accident[df_accident["Accident_Severity"] == "Fatal"]
sns.histplot(fatal_only, ax=fatal_ax, **weekday_hist_kwargs).set(title=f"Accident_Severity: Fatal");



<div style="border-radius: 5px;
            font-size: 120%;
            text-decoration: underline;
            font-family: cursive;
            text-align: left">
Observations:
</div><ul>
<li>Weekends (Saturday and Sunday) have the most fatal accidents</li>
</ul>
