In [1]:
# Mathematical functions
import math
from scipy import stats 
# Data manipulation
import numpy as np
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Missing data imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from io import StringIO 

# Categorical data encoding
from sklearn.preprocessing import LabelEncoder

# Train-test split and k-fold cross validation
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif


# Classification algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Model evaluation
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
# Explainable AI
!pip install --quiet shap==0.39.0
import shap

# Warning suppression
import warnings
warnings.filterwarnings('ignore')
import acquire
import prepare 

Acquire CSV

In [2]:
get = acquire.get_data()

Prepare

In [3]:
df = prepare.prep_data(get)
df.head()

Unnamed: 0,Age_band_of_driver,Driving_experience,Area_accident_occured,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Casualty_class,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,...,Weather_conditions_Raining,Weather_conditions_Raining and Windy,Weather_conditions_Snow,Weather_conditions_Unknown,Weather_conditions_Windy,Sex_of_casualty_Male,Educational_level_Elementary,Educational_level_High School,Educational_level_High school,Educational_level_Junior High
0,2,4,2,3,2,2,1,2,3.0,1,...,0,0,0,0,0,0,0,1,0,0
1,3,3,1,3,2,2,1,2,3.0,1,...,0,0,0,0,0,0,0,0,0,1
2,2,4,9,3,2,2,1,3,3.0,1,...,0,0,0,0,0,1,0,0,0,1
3,2,1,1,2,2,2,2,2,3.0,1,...,0,0,0,0,0,0,0,0,0,1
4,2,2,4,2,2,2,1,2,3.0,1,...,0,0,0,0,0,0,0,0,0,1


In [None]:
df = prepare.prep_data(get)
df.head()

In [None]:
df = df.reset_index(drop=True)

In [None]:
df = df.replace('Unknown or other', 'other')
df = df.replace('Darkness - lights unlit', 'Darkness - no lighting')
df['Age_band_of_casualty'] = df['Age_band_of_casualty'].replace('5', 'Under 18')

In [None]:
df['baseline_prediction'] = np.where(df['Casualty_class']== 'Driver or rider', True, False)
df.head()

In [None]:
df['baseline_prediction'] = np.where(df['Casualty_class']== 'Driver or rider', 1 , 2)

In [None]:
df['baseline_prediction'].isnull().sum()

In [None]:
df.Weather_conditions.value_counts()

In [None]:
df['Age_band_of_casualty '] = df.Age_band_of_casualty .map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4})
df['Age_band_of_driver'] = df.Age_band_of_driver.map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4, 'Unknown' : 5})


df['Casualty_class'] = df.Age_band_of_casualty.map({'Driver or rider' : 1, 'Pedestrian' : 2, 'Passenger' : 3, 'na' : 10})                                                          

In [None]:
df.info()

In [None]:
# number of duplicate rows
duplicate = df[df.duplicated() == True]
print("# of duplicate rows: {}".format(len(duplicate)))

In [None]:
# number of missing values in columns
df.Educational_level.value_counts()

In [None]:
df['Work_of_casuality'].isnull().sum()

In [None]:
get.Weather_conditions.value_counts()

In [None]:
df.Driving_experience.value_counts()

In [None]:
df.loc[2:4, ['baseline_prediction', "Casualty_severity", 'Fitness_of_casuality']]

In [None]:
df.isnull().sum()

In [None]:
def prop_imputer(df):
    df_prop = df.copy(deep = True)
    missing_cols = df_prop.isna().sum()[df_prop.isna().sum() != 0].index.tolist()
    for col in missing_cols:
        values_col = df_prop[col].value_counts(normalize = True).index.tolist()
        probabilities_col = df_prop[col].value_counts(normalize = True).values.tolist()
        df_prop[col] = df_prop[col].fillna(pd.Series(np.random.choice(values_col, p = probabilities_col, size = len(df))))
    return df_prop

In [None]:
df_imp = prop_imputer(df)

In [None]:
df = df_imp
df.isnull().sum()

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age_band_of_driver                    12316 non-null  int64  
 1   Driving_experience                    12316 non-null  int64  
 2   Area_accident_occured                 12316 non-null  int64  
 3   Light_conditions                      12316 non-null  int64  
 4   Number_of_vehicles_involved           12316 non-null  int64  
 5   Number_of_casualties                  12316 non-null  int64  
 6   Casualty_class                        12316 non-null  int64  
 7   Age_band_of_casualty                  12316 non-null  int64  
 8   Casualty_severity                     12316 non-null  float64
 9   Fitness_of_casuality                  12316 non-null  int64  
 10  Accident_severity                     12316 non-null  int64  
 11  Sex_of_driver_M

In [None]:
# encode
#df['Age_band_of_driver'] = df.Age_band_of_driver.map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4, 'Unknown' : 10})
#df['Educational_level'] = df.Educational_level.map({'Illiterate' : 1, 'Writing & reading' : 2, 'Elementary school' : 3, 'Junior high school' : 4, 'High school' : 5, 'Above high school' : 6, 'Unknown' : 10})
df['Driving_experience'] = df.Driving_experience.map({'Below 1yr' : 1, '1-2yr' : 2, '2-5yr' : 3, '5-10yr' : 4, 'Above 10yr' : 5, 'No Licence' : 0, 'unknown' : 10})
df['Light_conditions'] = df.Light_conditions.map({'Darkness - no lighting' : 1, 'Darkness - lights lit' : 2, 'Daylight' : 3})
df['Age_band_of_casualty'] = df.Age_band_of_casualty.map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4, 'na' : 10})
df['Fitness_of_casuality'] = df.Fitness_of_casuality.map({'Normal' : 1, 'NormalNormal' : 2, 'Deaf' : 3, 'Other' : 4, 'Blind' : 4})
#df['Accident_severity'] = df.Accident_severity.map({'Slight Injury' : 1, 'Serious Injury' : 2, 'Fatal injury' : 3})
df.head(10)

In [None]:
dummy_df = pd.get_dummies(df[['Area_accident_occured', \
                              'Weather_conditions', \
                              'Light_conditions', \
                              'Age_band_of_casualty', \
                              'Age_band_of_driver', \
                              'Sex_of_casualty',\
                              'Casualty_severity',\
                              'Accident_severity',\
                              'Educational_level']], dummy_na=False, \
                              drop_first=True)
    
    # Concat dummy dataframe to original 
df = pd.concat([df, dummy_df], axis=1)

In [None]:
df = df.drop(['Area_accident_occured', 'Accident_severity', 'Educational_level', 'Light_conditions', 'Weather_conditions', 'Age_band_of_casualty', 'Age_band_of_driver', 'Educational_level_Unknown', 'Educational_level_Illiterate', 'Time', 'Lanes_or_Medians', 'Vehicle_movement', 'Types_of_Junction', 'Pedestrian_movement', 'Cause_of_accident', 'Type_of_collision', 'Vehicle_driver_relation', 'Type_of_vehicle', 'Road_surface_type', 'Road_surface_conditions', 'Day_of_week','Road_allignment', 'Fitness_of_casuality', 'Work_of_casuality', 'Sex_of_driver', 'Sex_of_casualty', 'Defect_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle', 'Road_surface_type'], axis=1)



In [None]:
get.loc[2:15, ['Time', "Weather_conditions"]]

In [None]:
get.Time.min()
get.Time = pd.to_datetime(get.Time, infer_datetime_format=True)

In [None]:
get['year'] = pd.DatetimeIndex(get['Time']).year
get.head()

In [None]:
get['year'] = pd.DatetimeIndex(get['Time']).year

#### Explore

In [4]:
# split data and explore on train
def split(df, stratify_by='Casualty_class'):
    # split df into train_validate 
    train_validate, test = train_test_split(df, test_size=.20, random_state=13)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=13)

    X_train = train.drop(columns=['Casualty_class'])
    y_train = train[['Casualty_class']]

    X_validate = validate.drop(columns=['Casualty_class'])
    y_validate = validate[['Casualty_class']]

    X_test = test.drop(columns=['Casualty_class'])
    y_test = test[['Casualty_class']]

    return train, X_train, X_validate, X_test, y_train, y_validate, y_test

In [5]:
train, X_train, X_validate, X_test, y_train, y_validate, y_test = split(df, stratify_by='Casualty_class')

In [None]:
sns.boxplot(data=train, y='Number_of_casualties')
plt.show()

In [6]:
y_train.head()

Unnamed: 0,Casualty_class
10242,1
2673,1
5201,2
11438,2
11556,3


In [None]:
X_train.columns

In [None]:
fig, ax =plt.subplots(1,2,figsize = (15,8))
sns.despine(top=True, right=True, left=False, bottom=False)
ax1 =sns.countplot("Casualty_class", hue="Sex_of_casualty_Male", 
              palette="magma", data=train, ax=ax[0])

ax2 = sns.countplot("Accident_severity", hue="Sex_of_casualty_Male", 
              palette="magma", data=train, ax=ax[1])

From this chart, generally there are more women in the casualty class, especially driver or rider. The count for women is also higher for accident severity.

In [None]:
# check values
train.Area_accident_occured.value_counts()

In [None]:
df2 = train[train['Area_accident_occured'] ==  " Recreational areas"]
df2

In [None]:
train.loc[2:4, ['Casualty_class']]

In [None]:
correlation_matrix = train[['Number_of_casualties','Number_of_vehicles_involved']].corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

there is not much correlation to these variables 

In [None]:
train.Fitness_of_casuality.value_counts()

In [None]:
train.columns

In [None]:
plt.figure(figsize=(10,7))
sns.boxplot(data=df, y='Number_of_vehicles_involved', x='Number_of_casualties')
plt.show()

In [None]:
train.Area_accident_occured.value_counts()

In [None]:
var = ["Age_band_of_casualty","Area_accident_occured", "Light_conditions"]
for v in var:
    sns.set(style="darkgrid")
    sns.countplot(x=v, data=train)
    plt.show()

### Does age have anything to do with casualty?

From these charts I can see that the highest age range for accidents is 18-30

In [None]:
train.Educational_level.value_counts()

In [None]:
grid = sns.FacetGrid(data=df, col='Light_conditions', height=4, aspect=1, sharey=False)
# mapping bar plot and the data on to the grid
grid.map(sns.countplot, 'Number_of_casualties', palette=['black', 'brown', 'orange'])
plt.show()

A majority of accidents happen in the daytime and at night with lighting.

In [None]:
target_count = train['Casualty_class'].value_counts()

target_count.plot(kind='bar', title='Count (target)');

In [None]:
train.head()

#### Testing 

In [None]:
a=train.Casualty_class
b=train.Light_conditions
observed = pd.crosstab(a,b)
observed

Ho - light conditions effects casualty class

Ha - light conditions do not effect it 

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
alpha = 0.05
print(f'chi2 = {chi2:.2f}')
print(f'p value: {p:.4f}')
if p < alpha:
      print('We can reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

Ho - Driving experiance effects casualty class

Ha - Driving experiance does not effect it

In [None]:
a2=train.Casualty_class
b2=train.Driving_experience
observed2 = pd.crosstab(a2,b2)
observed2

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed2)
alpha = 0.05
print(f'chi2 = {chi2:.2f}')
print(f'p value: {p:.4f}')
if p < alpha:
      print('We can reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

In [None]:
df1 = train[train.isna().any(axis=1)]
df1

In [None]:
df2 = train[train['baseline_prediction'] ==  "no"]
df2

In [7]:
# Chi-square feature selection
def feature_chi2(X_train, X_validate, X_test, k = 25):
     
    # Feature selection
    fs = SelectKBest(score_func = chi2, k = k)
    fs.fit(X_train, y_train)
    
    # Selected columns
    cols = fs.get_support(indices = True)
    
    # Output data
    X_train_fs = X_train.iloc[:, cols]
    X_validate_fs = X_validate.iloc[:, cols]
    X_test_fs = X_test.iloc[:, cols]
    
    return X_train_fs, X_validate_fs, X_test_fs

In [9]:
X_train_fs, X_validate_fs, X_test_fs = feature_chi2(X_train, X_validate, X_test, k = 25)

In [10]:
X_train_fs.head()

Unnamed: 0,Age_band_of_driver,Driving_experience,Area_accident_occured,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,Accident_severity,...,Weather_conditions_Raining,Weather_conditions_Raining and Windy,Weather_conditions_Snow,Weather_conditions_Unknown,Weather_conditions_Windy,Sex_of_casualty_Male,Educational_level_Elementary,Educational_level_High School,Educational_level_High school,Educational_level_Junior High
10242,2,2,9,3,3,1,2,3.0,1,1,...,0,0,0,0,0,0,0,0,0,1
2673,2,1,9,3,1,1,2,3.0,1,1,...,0,0,0,0,0,0,0,0,0,0
5201,2,1,1,3,2,1,2,3.0,1,1,...,0,0,0,0,0,0,1,0,0,0
11438,2,3,9,3,2,1,3,3.0,1,1,...,0,0,0,0,0,0,0,0,0,1
11556,2,1,5,3,2,2,2,3.0,1,1,...,1,0,0,0,0,1,0,0,0,1


In [None]:
baseline.head()

In [None]:
train.Casualty_class.value_counts()

In [None]:
train.shape

In [None]:
train[train['Casualty_class'] ==  1].shape

In [None]:
predictions['baseline'] = train[train['Casualty_class'] == 1].shape[0] /train.shape[0]
predictions.head()

In [None]:
predictions = pd.DataFrame({
    'actual': train.Casualty_class
})

In [None]:
predictions.head()

### Random Forest

In [None]:
from sklearn.metrics import classification_report
# assign model
forest1 = RandomForestClassifier(max_depth=3, random_state=123)
#fit model
forest1.fit(X_train_fs, y_train)
y_predictions = forest1.predict(X_train_fs)
predictions['RF'] = forest1.predict(X_train_fs)

report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 4")
pd.DataFrame(report)

In [None]:
from sklearn.metrics import mean_squared_error
# evaluate accuracy on dt
MSE = mean_squared_error(predictions.actual, predictions.RF)
RMSE = math.sqrt(MSE)
predictions['RF'] = RMSE
print("Root Mean Square Error:\n")
print(RMSE)

### Decision Tree 

Max depth 3

In [None]:
# call the model
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)
# fit the model
tree1 = tree1.fit(X_train_fs, y_train)
y_predictions = tree1.predict(X_train_fs)
predictions['DT'] = tree1.predict(X_train_fs)
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 7")
pd.DataFrame(report)

In [None]:
# evaluate accuracy on dt
MSE = mean_squared_error(predictions.actual, predictions.DT)
RMSE = math.sqrt(MSE)
predictions['DT'] = RMSE
print("Root Mean Square Error:\n")
print(RMSE)

### Decision Tree

max depth 5

In [None]:
# call the model
tree2 = DecisionTreeClassifier(max_depth=7, random_state=123)
# fit the model
tree2 = tree1.fit(X_train_fs, y_train)
y_predictions = tree2.predict(X_train_fs)
predictions['DT2'] = tree2.predict(X_train_fs)
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 5")
pd.DataFrame(report)

In [None]:
# evaluate accuracy on dt2
MSE = mean_squared_error(predictions.actual, predictions.DT2)
RMSE = math.sqrt(MSE)
predictions['DT2'] = RMSE
print("Root Mean Square Error:\n")
print(RMSE)

In [None]:
predictions.head()

#### Model on validate

In [None]:
pred_Valid = pd.DataFrame({
    'actual': y_validate.Casualty_class
})

In [None]:
pred_Valid['baseline'] = train[train['Casualty_class'] == 1].shape[0] /train.shape[0]
pred_Valid.head()

In [None]:
# RF on valid and get RMSE
pred_Valid['RF_Valid'] = forest1.predict(X_validate_fs)
MSE = mean_squared_error(pred_Valid.actual, pred_Valid.RF_Valid)
RMSE = math.sqrt(MSE)
pred_Valid['RF_Valid'] = RMSE

In [None]:
# dt on valid and get RMSE
pred_Valid['DT_Valid'] = tree1.predict(X_validate_fs)
MSE = mean_squared_error(pred_Valid.actual, pred_Valid.DT_Valid)
RMSE = math.sqrt(MSE)
pred_Valid['DT_Valid'] = RMSE

In [None]:
# dt2 on valid and get RMSE
pred_Valid['DT2_Valid'] = tree2.predict(X_validate_fs)
MSE = mean_squared_error(pred_Valid.actual, pred_Valid.DT2_Valid)
RMSE = math.sqrt(MSE)
pred_Valid['DT2_Valid'] = RMSE

In [None]:
pred_Valid.head()

### Test 

Random forest model was the best on validate

In [None]:
# df to hold predictions 
pred_Test = pd.DataFrame({
    'actual': y_test.Casualty_class
})
pred_Test['baseline'] = train[train['Casualty_class'] == 1].shape[0] /train.shape[0]
pred_Test.head()

In [None]:
pred_Test['RF_Test'] = forest1.predict(X_test_fs)
MSE = mean_squared_error(pred_Test.actual, pred_Test.RF_Test)
RMSE = math.sqrt(MSE)
pred_Test['RF_Test'] = RMSE
pred_Test.head()