In [1]:
# Mathematical functions
import math
from scipy import stats 
# Data manipulation
import numpy as np
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Missing data imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from io import StringIO 

# Categorical data encoding
from sklearn.preprocessing import LabelEncoder

# Train-test split and k-fold cross validation
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif


# Classification algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Model evaluation
from sklearn import metrics
from sklearn.metrics import f1_score

# Explainable AI
!pip install --quiet shap==0.39.0
import shap

# Warning suppression
import warnings
warnings.filterwarnings('ignore')
import acquire
import prepare 

Acquire CSV

In [2]:
get = acquire.get_data()

Prepare

In [3]:
df = prepare.prep_data(get)
df.head()

Unnamed: 0,Driving_experience,Number_of_vehicles_involved,Number_of_casualties,Casualty_class,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,...,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown,Sex_of_casualty_Male,Accident_severity_Serious Injury,Accident_severity_Slight Injury,Educational_level_Elementary school,Educational_level_High school,Educational_level_Junior high school,Educational_level_Writing & reading
0,2,2,2,Driver or rider,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5,2,2,Driver or rider,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
2,2,2,2,Driver or rider,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
3,4,2,2,Pedestrian,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,3,2,2,Driver or rider,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0


In [None]:
df.Casualty_severity.value_counts()

In [None]:
# number of duplicate rows
duplicate = df[df.duplicated() == True]
print("# of duplicate rows: {}".format(len(duplicate)))

In [None]:
# number of missing values in columns
df.Educational_level.value_counts()

In [None]:
# plot target
plt.figure(figsize = (14, 7))
plt.subplot(1, 2, 1)
sns.countplot(x = df['Casualty_class'])
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
sns.despine(top = True, right = True)

In [None]:
df = df.replace('Unknown or other', 'other')
df = df.replace('Darkness - lights unlit', 'Darkness - no lighting')
df['Age_band_of_casualty'] = df['Age_band_of_casualty'].replace('5', 'Under 18')

In [None]:
df['Work_of_casuality'].isnull().sum()

In [None]:
df.shape

In [None]:
df.Casualty_severity.value_counts()

In [None]:
df['Age_band_of_casualty'].value_counts()

In [None]:
# fill missing values with mode column values
df['Driving_experience'].fillna(df['Driving_experience'].mode()[0], inplace=True)
df['Age_band_of_driver'].fillna(df['Age_band_of_driver'].mode()[0], inplace=True)
df['Age_band_of_casualty'].fillna(df['Age_band_of_casualty'].mode()[0], inplace=True)
df['Type_of_vehicle'].fillna(df['Type_of_vehicle'].mode()[0], inplace=True)
df['Area_accident_occured'].fillna(df['Area_accident_occured'].mode()[0], inplace=True)
df['Road_allignment'].fillna(df['Road_allignment'].mode()[0], inplace=True)
df['Type_of_collision'].fillna(df['Type_of_collision'].mode()[0], inplace=True)
df['Vehicle_movement'].fillna(df['Vehicle_movement'].mode()[0], inplace=True)
df['Lanes_or_Medians'].fillna(df['Lanes_or_Medians'].mode()[0], inplace=True)
df['Types_of_Junction'].fillna(df['Types_of_Junction'].mode()[0], inplace=True)

In [None]:
def prop_imputer(df):
    df_prop = df.copy(deep = True)
    missing_cols = df_prop.isna().sum()[df_prop.isna().sum() != 0].index.tolist()
    for col in missing_cols:
        values_col = df_prop[col].value_counts(normalize = True).index.tolist()
        probabilities_col = df_prop[col].value_counts(normalize = True).values.tolist()
        df_prop[col] = df_prop[col].fillna(pd.Series(np.random.choice(values_col, p = probabilities_col, size = len(df))))
    return df_prop

In [None]:
df_imp = prop_imputer(df)

In [None]:
df = df_imp
df.isnull().sum()

In [None]:
df.Casualty_severity.value_counts()

In [None]:
df.Accident_severity.value_counts()

In [None]:
# encode
#df['Age_band_of_driver'] = df.Age_band_of_driver.map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4, 'Unknown' : 10})
#df['Educational_level'] = df.Educational_level.map({'Illiterate' : 1, 'Writing & reading' : 2, 'Elementary school' : 3, 'Junior high school' : 4, 'High school' : 5, 'Above high school' : 6, 'Unknown' : 10})
df['Driving_experience'] = df.Driving_experience.map({'Below 1yr' : 1, '1-2yr' : 2, '2-5yr' : 3, '5-10yr' : 4, 'Above 10yr' : 5, 'No Licence' : 0, 'unknown' : 10})
#df['Light_conditions'] = df.Light_conditions.map({'Darkness - no lighting' : 1, 'Darkness - lights lit' : 2, 'Daylight' : 3})
#df['Age_band_of_casualty'] = df.Age_band_of_casualty.map({'Under 18' : 1, '18-30' : 2, '31-50' : 3, 'Over 51' : 4, 'na' : 10})
#df['Casualty_severity'] = df.Casualty_severity.map({'1' : 1, '2' : 2, '3' : 3, 'NaN' : 10})
#df['Accident_severity'] = df.Accident_severity.map({'Slight Injury' : 1, 'Serious Injury' : 2, 'Fatal injury' : 3})
df.head(10)

In [None]:
dummy_df = pd.get_dummies(df[['Area_accident_occured', \
                              'Weather_conditions', \
                              'Light_conditions', \
                              'Age_band_of_casualty', \
                              'Age_band_of_driver', \
                              'Sex_of_casualty',\
                              'Casualty_severity',\
                              'Accident_severity',\
                              'Educational_level']], dummy_na=False, \
                              drop_first=True)
    
    # Concat dummy dataframe to original 
df = pd.concat([df, dummy_df], axis=1)

In [None]:
df = df.drop(['Area_accident_occured', 'Accident_severity', 'Educational_level', 'Light_conditions', 'Weather_conditions', 'Age_band_of_casualty', 'Age_band_of_driver', 'Educational_level_Unknown', 'Educational_level_Illiterate', 'Time', 'Lanes_or_Medians', 'Vehicle_movement', 'Types_of_Junction', 'Pedestrian_movement', 'Cause_of_accident', 'Type_of_collision', 'Vehicle_driver_relation', 'Type_of_vehicle', 'Road_surface_type', 'Road_surface_conditions', 'Day_of_week','Road_allignment', 'Fitness_of_casuality', 'Work_of_casuality', 'Sex_of_driver', 'Sex_of_casualty', 'Defect_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle', 'Road_surface_type'], axis=1)



In [None]:
df = df.drop(['Area_accident_occured'], axis=1)

In [None]:
df.columns

In [None]:
df.Educational_level.value_counts()

In [None]:
train.Work_of_casuality.value_counts()

#### Explore

In [4]:
# split data and explore on train
def split(df, stratify_by='Casualty_class'):
    # split df into train_validate 
    train_validate, test = train_test_split(df, test_size=.20, random_state=13)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=13)

    X_train = train.drop(columns=['Casualty_class'])
    y_train = train[['Casualty_class']]

    X_validate = validate.drop(columns=['Casualty_class'])
    y_validate = validate[['Casualty_class']]

    X_test = test.drop(columns=['Casualty_class'])
    y_test = test[['Casualty_class']]

    return train, X_train, X_validate, X_test, y_train, y_validate, y_test

In [5]:
train, X_train, X_validate, X_test, y_train, y_validate, y_test = split(df, stratify_by='Casualty_class')

In [None]:
plt.figure(figsize=[25,15])
sns.heatmap(df.corr(),annot=True)

Not much correlation between varibles 

In [None]:
bins = ['0','0-5','6-10','11-15','16-20','21-25','26-35',
           '36-45', '46-55','56-65','66-75','75+']

plt.figure(figsize=(12,6))
train.Age_band_of_casualty.hist(bins = 11,alpha=0.5,rwidth=0.90, color= 'red',)
plt.title('Age of casualty in accidents', fontsize = 25)
plt.grid(False)
y_pos = np.arange(len(bins))
plt.xticks(y_pos , bins)
plt.ylabel('Accident #' , fontsize = 15)
plt.xlabel('Age of casualty', fontsize = 15,)

In [None]:
train.columns

In [None]:
grid = sns.FacetGrid(data=train, col='Casualty_class', height=4, aspect=1, sharey=False)
# mapping bar plot and the data on to the grid
grid.map(sns.countplot, 'Light_conditions', palette=['black', 'brown', 'orange'])
plt.show()

1 = dark and no lighting, 2 = dark with lighting, 3 = daytime

a majority of accidents happen in the daytime and at night with lighting.

In [None]:
target_count = train['Casualty_class'].value_counts()

target_count.plot(kind='bar', title='Count (target)');

In [None]:
train.columns

In [None]:
plt.figure(figsize=(10,7))
sns.boxplot(data=df, y='Number_of_casualties', x='Residential areas')
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.pie(x=train['Casualty_class'].value_counts().values,
        labels=train['Casualty_class'].value_counts().index,
        autopct='%2.2f%%')
plt.show()

#### Testing 

In [None]:
a=train.Casualty_class
b=train.Light_conditions
observed = pd.crosstab(a,b)
observed

Ho - light conditions effects casualty class

Ha - light conditions do not effect it 

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
alpha = 0.05
print(f'chi2 = {chi2:.2f}')
print(f'p value: {p:.4f}')
if p < alpha:
      print('We can reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

Ho - Driving experiance effects casualty class

Ha - Driving experiance does not effect it

In [None]:
a2=train.Casualty_class
b2=train.Driving_experience
observed2 = pd.crosstab(a2,b2)
observed2

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed2)
alpha = 0.05
print(f'chi2 = {chi2:.2f}')
print(f'p value: {p:.4f}')
if p < alpha:
      print('We can reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

In [None]:
df1 = train[train.isna().any(axis=1)]
df1

In [8]:
# Chi-square feature selection
def feature_chi2(X_train, X_validate, X_test, k = 5):
     
    # Feature selection
    fs = SelectKBest(score_func = chi2, k = k)
    fs.fit(X_train, y_train)
    
    # Selected columns
    cols = fs.get_support(indices = True)
    
    # Output data
    X_train_fs = X_train.iloc[:, cols]
    X_validate_fs = X_validate.iloc[:, cols]
    X_test_fs = X_test.iloc[:, cols]
    
    return X_train_fs, X_validate_fs, X_test_fs


In [9]:
X_train_fs, X_validate_fs, X_test_fs = feature_chi2(X_train, X_validate, X_test, k = 5)

In [10]:
X_train_fs.head()

Unnamed: 0,Area_accident_occured_ Industrial areas,Age_band_of_casualty_31-50,Age_band_of_casualty_Over 51,Age_band_of_casualty_Under 18,Sex_of_casualty_Male
10242,0,0,0,0,0
2673,0,0,0,0,0
5201,0,0,0,0,0
11438,0,1,0,0,0
11556,0,0,0,0,1


In [None]:
train.head()

In [None]:
train.head()