### The data used for this project has been sourced from the Department for Transport dataset (https://roadtraffic.dft.gov.uk/custom-downloads).

In [2]:
# importing dependencies
import pandas as pd
import numpy as np
import scipy as st
import matplotlib.pyplot as plt

In [3]:
# Relative paths to the two CSV reports exported from the data source.
# Both files are loaded with pd.read_csv in a later cell.
csv_path1 = "source_data/Road_Casualties_1.csv"
csv_path2 = "source_data/Road_Casualties_2.csv"

Two reports have been exported from our data source:
1) The first is 'casualties1', which contains the following characteristics of fatal road incidents in the UK: Sex, Age, Road Type, and Weather Condition.

2) The second is 'casualties2', which contains the following characteristics of fatal road incidents in the UK: Road User, Speed Limit, Accident Month, and Accident Hour.

# Casualties1 Cleanup

In [4]:
# Load each raw CSV report into its own DataFrame.
# The cleanup cells that follow operate on the separate casualties1_df /
# casualties2_df variables rather than on these raw frames.
casualties1 = pd.read_csv(csv_path1)
casualties2 = pd.read_csv(csv_path2)

In [5]:
# Working copy of report 1.
# pd.read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame
# adds nothing; an explicit .copy() makes the working frame independent of the
# raw `casualties1` frame.
casualties1_df = casualties1.copy()
casualties1_df

Unnamed: 0,Accident year,Casualty severity,Casualty sex,Casualty age,Road type,Weather condition,Casualties,Unnamed: 7
0,2017.0,Killed,Male,1.0,Dual carriageway,Fine no high winds,1.0,
1,2017.0,Killed,Male,2.0,Single carriageway,Fine no high winds,1.0,
2,2017.0,Killed,Male,3.0,Single carriageway,Fine no high winds,1.0,
3,2017.0,Killed,Male,4.0,Single carriageway,Fine no high winds,2.0,
4,2017.0,Killed,Male,5.0,Single carriageway,Fine no high winds,2.0,
...,...,...,...,...,...,...,...,...
2887,2021.0,Killed,Female,93.0,Dual carriageway,Raining + high winds,1.0,
2888,2021.0,Killed,Female,93.0,Single carriageway,Fine no high winds,1.0,
2889,2021.0,Killed,Female,94.0,Single carriageway,Fine no high winds,1.0,
2890,2021.0,Killed,Female,97.0,Dual carriageway,Fine no high winds,1.0,


In [6]:
# Remove the last two columns ("Casualties" and "Unnamed: 7") and the final
# row with a single positional slice taken from the raw `casualties1` frame.
# - Slicing from the raw frame keeps this cell idempotent: re-running it never
#   strips additional rows or columns.
# - .copy() yields an independent frame, so later edits to it do not raise
#   SettingWithCopyWarning.
# NOTE(review): the final row is presumably a footer/total line in the export
# -- confirm against the CSV. Dropping "Casualties" also discards the per-row
# casualty count (some rows show 2.0), so row counts taken later are not
# casualty totals.
casualties1_df = casualties1.iloc[:-1, :-2].copy()
casualties1_df

Unnamed: 0,Accident year,Casualty severity,Casualty sex,Casualty age,Road type,Weather condition
0,2017.0,Killed,Male,1.0,Dual carriageway,Fine no high winds
1,2017.0,Killed,Male,2.0,Single carriageway,Fine no high winds
2,2017.0,Killed,Male,3.0,Single carriageway,Fine no high winds
3,2017.0,Killed,Male,4.0,Single carriageway,Fine no high winds
4,2017.0,Killed,Male,5.0,Single carriageway,Fine no high winds
...,...,...,...,...,...,...
2886,2021.0,Killed,Female,93.0,Dual carriageway,Fine no high winds
2887,2021.0,Killed,Female,93.0,Dual carriageway,Raining + high winds
2888,2021.0,Killed,Female,93.0,Single carriageway,Fine no high winds
2889,2021.0,Killed,Female,94.0,Single carriageway,Fine no high winds


In [7]:
# Inspect the column dtypes before conversion: "Accident year" and
# "Casualty age" are float64 here and are cast to integers in the next cell.
print(casualties1_df.dtypes)

Accident year        float64
Casualty severity     object
Casualty sex          object
Casualty age         float64
Road type             object
Weather condition     object
dtype: object


In [8]:
# Cast the whole-number float columns to integers with one astype call.
# astype returns a new frame, so nothing is assigned into a slice of another
# frame (the pattern that triggers SettingWithCopyWarning), and re-running this
# cell is harmless.
# NOTE: astype(int) raises if a column still contains NaN; this presumably
# relies on the trailing row having been removed in the previous cell.
casualties1_df = casualties1_df.astype({"Accident year": int, "Casualty age": int})
print(casualties1_df.dtypes)

Accident year         int32
Casualty severity    object
Casualty sex         object
Casualty age          int32
Road type            object
Weather condition    object
dtype: object


In [9]:
# Final casualties1 (cleaned): trailing columns and row removed, with
# "Accident year" and "Casualty age" stored as integers.
casualties1_df

Unnamed: 0,Accident year,Casualty severity,Casualty sex,Casualty age,Road type,Weather condition
0,2017,Killed,Male,1,Dual carriageway,Fine no high winds
1,2017,Killed,Male,2,Single carriageway,Fine no high winds
2,2017,Killed,Male,3,Single carriageway,Fine no high winds
3,2017,Killed,Male,4,Single carriageway,Fine no high winds
4,2017,Killed,Male,5,Single carriageway,Fine no high winds
...,...,...,...,...,...,...
2886,2021,Killed,Female,93,Dual carriageway,Fine no high winds
2887,2021,Killed,Female,93,Dual carriageway,Raining + high winds
2888,2021,Killed,Female,93,Single carriageway,Fine no high winds
2889,2021,Killed,Female,94,Single carriageway,Fine no high winds


# Casualties2 Cleanup

In [14]:
# Working copy of report 2.
# pd.read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame
# adds nothing; an explicit .copy() makes the working frame independent of the
# raw `casualties2` frame.
casualties2_df = casualties2.copy()
casualties2_df

Unnamed: 0,Accident year,Casualty severity,Road user,Speed limit,Accident month,Accident hour,Casualties,Unnamed: 7
0,2017.0,Killed,Pedestrian,1-20 mph,January,14.0,1.0,
1,2017.0,Killed,Pedestrian,1-20 mph,January,21.0,1.0,
2,2017.0,Killed,Pedestrian,1-20 mph,March,12.0,1.0,
3,2017.0,Killed,Pedestrian,1-20 mph,March,22.0,1.0,
4,2017.0,Killed,Pedestrian,1-20 mph,April,2.0,1.0,
...,...,...,...,...,...,...,...,...
6109,2021.0,Killed,Other vehicle,51-60 mph,May,11.0,1.0,
6110,2021.0,Killed,Other vehicle,51-60 mph,July,10.0,1.0,
6111,2021.0,Killed,Other vehicle,51-60 mph,September,21.0,1.0,
6112,2021.0,Killed,Other vehicle,51-60 mph,November,10.0,1.0,


In [15]:
# Remove the last two columns ("Casualties" and "Unnamed: 7") and the final
# row with a single positional slice taken from the raw `casualties2` frame.
# - Slicing from the raw frame keeps this cell idempotent: re-running it never
#   strips additional rows or columns.
# - .copy() yields an independent frame, so later edits to it do not raise
#   SettingWithCopyWarning.
# NOTE(review): the final row is presumably a footer/total line in the export
# -- confirm against the CSV. Dropping "Casualties" also discards the per-row
# casualty count, so row counts taken later are not necessarily casualty
# totals.
casualties2_df = casualties2.iloc[:-1, :-2].copy()
casualties2_df

Unnamed: 0,Accident year,Casualty severity,Road user,Speed limit,Accident month,Accident hour
0,2017.0,Killed,Pedestrian,1-20 mph,January,14.0
1,2017.0,Killed,Pedestrian,1-20 mph,January,21.0
2,2017.0,Killed,Pedestrian,1-20 mph,March,12.0
3,2017.0,Killed,Pedestrian,1-20 mph,March,22.0
4,2017.0,Killed,Pedestrian,1-20 mph,April,2.0
...,...,...,...,...,...,...
6108,2021.0,Killed,Other vehicle,51-60 mph,April,16.0
6109,2021.0,Killed,Other vehicle,51-60 mph,May,11.0
6110,2021.0,Killed,Other vehicle,51-60 mph,July,10.0
6111,2021.0,Killed,Other vehicle,51-60 mph,September,21.0


In [16]:
# Inspect the column dtypes before conversion: "Accident year" and
# "Accident hour" are float64 here and are cast to integers in the next cell.
print(casualties2_df.dtypes)

Accident year        float64
Casualty severity     object
Road user             object
Speed limit           object
Accident month        object
Accident hour        float64
dtype: object


In [17]:
# Cast the whole-number float columns to integers with one astype call.
# astype returns a new frame, so nothing is assigned into a slice of another
# frame (the pattern that triggers SettingWithCopyWarning), and re-running this
# cell is harmless.
# NOTE: astype(int) raises if a column still contains NaN; this presumably
# relies on the trailing row having been removed in the previous cell.
casualties2_df = casualties2_df.astype({"Accident year": int, "Accident hour": int})
print(casualties2_df.dtypes)

Accident year         int32
Casualty severity    object
Road user            object
Speed limit          object
Accident month       object
Accident hour         int32
dtype: object


In [20]:
# Final casualties2 (cleaned): trailing columns and row removed, with
# "Accident year" and "Accident hour" stored as integers.
casualties2_df

Unnamed: 0,Accident year,Casualty severity,Road user,Speed limit,Accident month,Accident hour
0,2017,Killed,Pedestrian,1-20 mph,January,14
1,2017,Killed,Pedestrian,1-20 mph,January,21
2,2017,Killed,Pedestrian,1-20 mph,March,12
3,2017,Killed,Pedestrian,1-20 mph,March,22
4,2017,Killed,Pedestrian,1-20 mph,April,2
...,...,...,...,...,...,...
6108,2021,Killed,Other vehicle,51-60 mph,April,16
6109,2021,Killed,Other vehicle,51-60 mph,May,11
6110,2021,Killed,Other vehicle,51-60 mph,July,10
6111,2021,Killed,Other vehicle,51-60 mph,September,21
