In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from pathlib import Path
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from statsmodels.graphics.mosaicplot import mosaic
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
%run ../custom/jc-functions.ipynb

In [3]:
dataset = Path('../dataset')
df = pd.read_csv(dataset/"accidents_train.csv")
df.head()

Unnamed: 0,Num,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,1,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,2,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,3,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,5,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


## Features Selected per EDA

In [4]:
columns = ["Area_accident_occured", "Types_of_Junction", "Light_conditions",
           "Number_of_vehicles_involved", "Number_of_casualties", "Cause_of_accident",
           "Day_of_week", "Sex_of_driver", "Age_band_of_driver", 
           "Accident_severity"]
df1 = df[columns]


In [5]:
df1.shape

(8210, 10)

In [6]:
df1.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8210 entries, 0 to 8209
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Area_accident_occured        8050 non-null   object
 1   Types_of_Junction            8210 non-null   object
 2   Light_conditions             8210 non-null   object
 3   Number_of_vehicles_involved  8210 non-null   int64 
 4   Number_of_casualties         8210 non-null   int64 
 5   Cause_of_accident            8210 non-null   object
 6   Day_of_week                  8210 non-null   object
 7   Sex_of_driver                8210 non-null   object
 8   Age_band_of_driver           8210 non-null   object
 9   Accident_severity            8210 non-null   object
dtypes: int64(2), object(8)
memory usage: 4.3 MB


In [13]:
# Discrete features
disc_feat = ['Number_of_vehicles_involved', 'Number_of_casualties']
# Categorical features
cat_feat = ["Area_accident_occured", "Types_of_Junction", "Light_conditions",
           "Number_of_vehicles_involved", "Number_of_casualties", "Cause_of_accident",
           "Day_of_week", "Sex_of_driver", "Age_band_of_driver"]
# Target variable
target = ["Accident_severity"]

## Data Quality Reports

In [8]:
data_quality_report_cont(df1, disc_feat)

Data Quality for Continous Features
Total Features: 2
                       Feature  Count  Missing  % missing  Cardinality
0  Number_of_vehicles_involved   8210        0        0.0            6
1         Number_of_casualties   8210        0        0.0            8


Descriptive Stats
                              count  mean   std  min  25%  50%  75%  max
Number_of_vehicles_involved  8210.0  2.01  0.64  1.0  2.0  2.0  2.0  7.0
Number_of_casualties         8210.0  1.51  0.97  1.0  1.0  1.0  2.0  8.0


In [9]:
data_quality_report_cat(df1, cat_feat)

Data Quality Report for Categorical Features
Stats
-----
                       Feature  Count  Missing  % Missing  Cardinality
0        Area_accident_occured   8050      160       1.99           15
1            Types_of_Junction   8210        0       0.00            8
2             Light_conditions   8210        0       0.00            4
3  Number_of_vehicles_involved   8210        0       0.00            6
4         Number_of_casualties   8210        0       0.00            8
5            Cause_of_accident   8210        0       0.00           20
6                  Day_of_week   8210        0       0.00            7
7                Sex_of_driver   8210        0       0.00            3
8           Age_band_of_driver   8210        0       0.00            5


Mode 1
------
                       Feature         Mode 1  Mode 1 Freq.  Mode 1 %
0        Area_accident_occured          Other          2511     31.19
1            Types_of_Junction        Y Shape          3118     37.98
2      

#### Data clean up (Missing records)

In [10]:
df1['Area_accident_occured'] = df1['Area_accident_occured'].fillna(df1['Area_accident_occured'].mode()[0]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Area_accident_occured'] = df1['Area_accident_occured'].fillna(df1['Area_accident_occured'].mode()[0])


### Verify clean up

In [11]:
data_quality_report_cat(df1, cat_feat)

Data Quality Report for Categorical Features
Stats
-----
                       Feature  Count  Missing  % Missing  Cardinality
0        Area_accident_occured   8210        0        0.0           14
1            Types_of_Junction   8210        0        0.0            8
2             Light_conditions   8210        0        0.0            4
3  Number_of_vehicles_involved   8210        0        0.0            6
4         Number_of_casualties   8210        0        0.0            8
5            Cause_of_accident   8210        0        0.0           20
6                  Day_of_week   8210        0        0.0            7
7                Sex_of_driver   8210        0        0.0            3
8           Age_band_of_driver   8210        0        0.0            5


Mode 1
------
                       Feature         Mode 1  Mode 1 Freq.  Mode 1 %
0        Area_accident_occured          Other          2671     32.53
1            Types_of_Junction        Y Shape          3118     37.98
2      

## Export dataset

In [12]:
df1.to_csv('../dataset/accidents_clean_train.csv', index=False)