### Introduction
####  Machine Learning model to predict Accident Risk Index by using area/district level accident data.

### Import the necessary packages.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
%matplotlib inline
import warnings
import re
warnings.filterwarnings("ignore")

### Import train and test dataset

In [None]:
# Read the train and test CSV files into DataFrames.
train=pd.read_csv("../input/predict-accident-risk-score-for-unique-postcode/train.csv")
test=pd.read_csv("../input/predict-accident-risk-score-for-unique-postcode/test.csv")

### Convert the column names to lowercase and replace hyphens (-) with underscores (_).

In [None]:
# Normalise the column labels of both frames: hyphens become underscores,
# then the whole label is lower-cased.
train = train.rename(columns=lambda label: label.replace("-", "_").lower())
test = test.rename(columns=lambda label: label.replace("-", "_").lower())

### The target column **accident_risk_index** is computed per postcode as the total number of casualties divided by the number of accidents.

In [None]:
def ari(df):
    """Add the per-postcode accident risk index to ``df``.

    Three columns are written onto ``df`` itself:

    * ``total_casualties``    - sum of ``number_of_casualties`` per postcode
    * ``accident_count``      - count of ``accident_id`` per postcode
    * ``accident_risk_index`` - the ratio of the two, rounded to 2 decimals

    The same (mutated) frame is returned.
    """
    by_postcode = df.groupby('postcode')
    casualties = by_postcode['number_of_casualties'].transform('sum')
    accidents = by_postcode['accident_id'].transform('count')

    df['total_casualties'] = casualties
    df['accident_count'] = accidents
    df['accident_risk_index'] = (casualties / accidents).round(2)
    return df



# Add total_casualties, accident_count and the accident_risk_index target to the training frame.
train=ari(train)


### Let's pre-process the data.

### A postcode is divided into four parts. For instance, in **OX3 9UP**, **OX** is the area, **3** is the district, **9** is the sector, and **UP** is the unit (street, property, organization).

### In this data the postcodes come in various lengths. For instance,

* DL145 8BG ->(length:9)
* BN21 2XR ->(length:8)
* OX3 9UP ->(length:7)
* E5 9QH ->(length:6)
* CB1 4 ->(length:5)
* PE21 ->(length:4)
* GU9 ->(length:3)
* S8 ->(length:2)

### Let's use regular expressions to extract the four parts of the postcode.

### Date column feature engineering

In [None]:
# Month number -> season label used for the `seasons` feature.
_SEASON_BY_MONTH = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
}

# Hour of day -> part-of-day label used for the `timings` feature.
_TIMING_BY_HOUR = {
    **dict.fromkeys(range(5, 12), 'Morning'),
    **dict.fromkeys(range(12, 16), 'Afternoon'),
    **dict.fromkeys(range(16, 20), 'Evening'),
    **dict.fromkeys(range(20, 23), 'Night'),
    **dict.fromkeys([23, 0, 1, 2, 3, 4], 'Midnight'),
}


def _split_postcode(df):
    """Add `area`, `district`, `sector` and `unit` columns parsed from `postcode`."""
    postcode = df['postcode'].str.strip()
    df['postcode'] = postcode
    # Each part falls back to the string "none" when the postcode does not
    # contain it (short postcodes such as "S8" have no sector or unit).
    df['area'] = postcode.str.extract(r"([A-Z]+)", expand=False).fillna("none")
    df['district'] = postcode.str.extract(r"[A-Z]+([0-9]+)", expand=False).fillna("none")
    # Capture only the digits after the space, not the space itself.
    df['sector'] = postcode.str.extract(r"\s([0-9]+)", expand=False).fillna("none")
    df['unit'] = postcode.str.extract(r"\s[0-9]+([A-Z]+)", expand=False).fillna("none")
    return df


def _add_date_features(df):
    """Add calendar features derived from `date` and clock features derived from `time`."""
    stamp = pd.to_datetime(df["date"].astype('str'))
    df['date_time'] = stamp
    df['day'] = stamp.dt.day
    df['day_label'] = stamp.dt.day_name()
    df['day_number'] = stamp.dt.dayofweek
    df['month_number'] = stamp.dt.month
    df['month_label'] = stamp.dt.strftime('%b')
    df['year_quarter'] = stamp.dt.quarter
    # `Series.dt.week` was removed in pandas 2.0.  `isocalendar().week` is its
    # replacement; the cast reproduces the dtype the old accessor returned
    # (int64, or float64 when some dates are missing).
    iso_week = stamp.dt.isocalendar().week
    df['week_of_year'] = iso_week.astype('float64' if iso_week.hasnans else 'int64')
    df['year'] = stamp.dt.year
    # NOTE: despite its name this column holds the number of days in the month.
    df['dayofmonth'] = stamp.dt.daysinmonth
    df['dayofyear'] = stamp.dt.day_of_year
    # Parse the clock time once; unparsable values become NaT.
    clock = pd.to_datetime(df["time"], errors='coerce')
    df['hour'] = clock.dt.hour
    df['minute'] = clock.dt.minute
    return df


def _add_group_stats(df, group_cols, value_cols):
    """Left-merge min/median/mean/max of each value column, computed per group, onto `df`."""
    # A list (not a set) keeps the order of the generated columns deterministic.
    stats = ['min', 'median', 'mean', 'max']
    for value_col in value_cols:
        summary = df.groupby(group_cols)[value_col].agg(stats).reset_index()
        summary = summary.rename(columns={s: f"{value_col}_{s}" for s in stats})
        df = pd.merge(df, summary, on=group_cols, how='left')
    return df


def pre_process(df):
    """Run the feature-engineering pipeline on an accident DataFrame.

    The frame must contain the columns `time`, `date`, `postcode`, `state`,
    `police_force`, `accident_count`, `road_surface_conditions`,
    `special_conditions_at_site` and `local_authority_(highway)`.

    Steps, in the order the columns are appended:

    1. a ``<col>_isnull`` 0/1 indicator for every column that contains nulls;
    2. fixed default values for the three columns filled below;
    3. the leading letters of `local_authority_(highway)`;
    4. postcode parts (`area`, `district`, `sector`, `unit`);
    5. calendar / clock features, `seasons`, weekend and month-boundary flags,
       and `timings` (part of day);
    6. min / median / mean / max of `accident_count` and `police_force` per
       (state, postcode parts, weekday, month) group.

    The feature columns of steps 1-5 are written onto `df` itself; the
    returned frame is the result of the final merges and is a new object.
    """
    # 1. Missing-value indicators (computed before anything is filled).
    for col in df.columns[df.isnull().any()]:
        df[f"{col}_isnull"] = df[col].isnull().astype('int')

    # 2. Fill the known gaps with fixed default values.
    df['time'] = df['time'].fillna('16:00')
    df['road_surface_conditions'] = df['road_surface_conditions'].fillna('Dry')
    df['special_conditions_at_site'] = df['special_conditions_at_site'].fillna('None')

    # 3. Leading upper-case letters of the highway authority code.
    df['local_authority_(highway)_cat'] = (
        df['local_authority_(highway)'].str.extract(r"([A-Z]+)", expand=False)
    )

    # 4. Postcode parts.
    df = _split_postcode(df)

    # 5. Date and time features.
    df = _add_date_features(df)

    # Months outside 1-12 (e.g. missing dates) map to the empty string.
    df['seasons'] = df['month_number'].apply(lambda month: _SEASON_BY_MONTH.get(month, ""))

    df['weekend'] = df['day_label'].isin(['Saturday', 'Sunday']).astype('int')
    df['weekstart'] = df['day_label'].isin(['Monday']).astype('int')
    df['is_month_start'] = (df['date_time'].dt.is_month_start).astype('int')
    df['is_month_end'] = (df['date_time'].dt.is_month_end).astype('int')

    # Hours that are not 0-23 (e.g. unparsable times) are labelled 'X'.
    df['timings'] = df['hour'].apply(lambda hour: _TIMING_BY_HOUR.get(hour, 'X'))

    # 6. Group-level statistics of the numeric columns.
    grpcol = ['state', 'area', 'district', 'sector', 'unit', 'day_number', 'month_number']
    valcols = ['accident_count', 'police_force']
    df = _add_group_stats(df, grpcol, valcols)

    return df

### Apply the pre-process steps to train data.

In [None]:
# Run the feature-engineering pipeline on the training frame
# (it needs the accident_count column that ari() added).
train=pre_process(train)

### Import Necessary Packages

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures

### Split predictor and target variables.

In [None]:
# Print "<position>_<name>" for every column of the training frame.
for i,col in enumerate(train.columns):
  print(f"{i}_{col}")


In [None]:
# Select the predictor columns by position and the target by name.
# NOTE(review): the positions are hard-coded; they only stay valid while
# `train` keeps exactly the same column layout - confirm after any change
# to the feature engineering.
# The explicit copy makes X independent of `train`, because X's columns are
# overwritten in place when the categoricals are label-encoded.
X = train.iloc[:, np.r_[1:3, 7:25, 28, 30:38, 39, 41, 42, 44, 45, 47:65]].copy()
y = train['accident_risk_index']

### Create label encoder

In [None]:
# Single encoder instance; it is re-fitted for every categorical column.
lbl=LabelEncoder()

### Encode the categorical columns using label encoder.

In [None]:
# Replace every object-typed column of X with integer codes.
# LabelEncoder expects a 1-D array of labels, so the column is passed as a
# Series (X[i]) rather than as a one-column DataFrame (X[[i]]).
# `lbl` is re-fitted on each column, so after the loop it only remembers
# the classes of the last column.
for i in X.select_dtypes(include='object').columns.values:
    X[i] = lbl.fit_transform(X[i])

### Install pycaret

In [None]:
!pip install -q pycaret[full]
!pip install imbalanced-learn==0.7.0

### Import regression from pycaret

In [None]:
from pycaret.regression import *

### Create a new data frame 

In [None]:
# Put the encoded predictors and the target side by side in one frame.
train2=pd.concat([X,y],axis=1)

### Setup the data for model

In [None]:
# Initialise the PyCaret regression experiment on train2 with
# accident_risk_index as the target.  Cross-validation uses 5 group-k-fold
# splits, with the groups taken from the day_number column.
ari_reg1 = setup(
    data=train2,
    target='accident_risk_index',
    session_id=123,
    # use_gpu is left disabled
    fold=5,
    data_split_shuffle=True,
    create_clusters=False,
    fold_strategy='groupkfold',
    fold_groups='day_number',
    n_jobs=-1,
)

### Add the custom evaluation metric.

In [None]:
# Imported here because nothing earlier in the notebook brings
# `mean_squared_error` into scope by name.
from sklearn.metrics import mean_squared_error

# Register the metric with PyCaret.  `squared=False` is forwarded to the
# scoring function, so the value reported under this name is the root of the
# mean squared error.  It is an error measure, hence lower is better.
add_metric('mean_squared_error', 'mean_squared_error', mean_squared_error,
           greater_is_better=False, squared=False)

### Compare several regression models and keep the top 3.

In [None]:
# Train and rank the candidate regressors, skipping the excluded model ids,
# and keep the three best ones.
top3 = compare_models(
    n_select=3,
    exclude=['lar', 'rf', 'et', 'gbr', 'xgboost'],
)

### Blend the top 3 models.

In [None]:
# Combine the three selected models into one blended estimator.
blender = blend_models(top3)

### Finalize the model for prediction.

In [None]:
# Finalize the blended model; `final` is the estimator used for every later plot and prediction.
final = finalize_model(blender)

### Residual Plot

In [None]:
# Residuals plot of the finalized blended model.
plot_model(final, plot = 'residuals')

### Prediction Error Plot

In [None]:
# Prediction-error plot of the finalized blended model.
plot_model(final, plot = 'error')

### Feature Importance Plot

In [None]:
# Feature-importance plot of the blend member registered under 'catboost'.
# NOTE(review): assumes one of the blended models is stored under the key
# 'catboost'; this raises a KeyError otherwise - confirm against `top3`.
plot_model(final.named_estimators_['catboost'], plot = 'feature_all')

### Save the model

In [None]:
# Persist the finalized model under the name 'final_blend_pycaret'.
save_model(final,'final_blend_pycaret')

### Import shap explainer

In [None]:
import shap

### Load the saved model

In [None]:
# Reload the model that was saved above.
saved_model=load_model('./final_blend_pycaret')

### Transform the data by using saved model pre-process pipeline

In [None]:
# Apply every step of the saved object except the last one to train2.
# NOTE(review): presumably `saved_model` is a pipeline whose last step is the
# estimator and whose earlier steps are preprocessing - confirm.
train_pipe = saved_model[:-1].transform(train2)

### By using shap tree explainer get the shap values.

In [None]:
# Build a SHAP tree explainer for the 'catboost' member of the blended model
# (taken from the pipeline step named "trained_model") and compute SHAP
# values for the transformed training data.
# NOTE(review): assumes the blend contains an estimator keyed 'catboost'.
explainer = shap.TreeExplainer(saved_model.named_steps["trained_model"].named_estimators_['catboost'])
shap_values = explainer.shap_values(train_pipe)

In [None]:
# Initialise SHAP's JavaScript support in the notebook.
shap.initjs()

### SHAP Feature Importance

In [None]:
# SHAP summary plot in bar form.
shap.summary_plot(shap_values, train_pipe,plot_type='bar')

### Which features impact the model

In [None]:
# SHAP summary plot with the default plot type.
shap.summary_plot(shap_values, train_pipe)

### Feature impact on single observation

In [None]:
# Force plot for one observation: the row at position `idx` of the
# transformed training data together with its SHAP values.
idx = 10
shap.force_plot(
    explainer.expected_value,
    shap_values[idx, :],
    train_pipe.iloc[idx, :],
    matplotlib=True,
)

### Apply the pre-process steps to test data.

In [None]:
# pre_process() aggregates the accident_count column, so it has to be
# created on the test frame first: the number of accident_id values per postcode.
test['accident_count']=test.groupby('postcode')['accident_id'].transform('count')

In [None]:
# Run the same feature-engineering pipeline on the test frame.
test=pre_process(test)

### Prediction using final model

In [None]:
# Build the test feature matrix with the same columns as X and encode its
# categorical columns the same way X was encoded.  The model was trained on
# label-encoded integers, so passing the raw strings of the test frame would
# not match the training data.
test_features = test.loc[:, X.columns].copy()
for column in test_features.select_dtypes(include='object').columns:
    # LabelEncoder numbers the sorted unique labels 0..n-1, so fitting a fresh
    # encoder on the raw training column reproduces the codes stored in X.
    code_by_label = {
        label: code
        for code, label in enumerate(LabelEncoder().fit(train[column]).classes_)
    }
    # Labels that never occurred in training get the code -1.
    test_features[column] = (
        test_features[column].map(code_by_label).fillna(-1).astype(int)
    )

# NOTE(review): the predictions are rounded to whole numbers although the
# training target is rounded to two decimals - confirm this is intended.
pred = (predict_model(final, data=test_features)['Label']).round()

### Result dataframe.

In [None]:
# One row per test record: its postcode and the predicted risk index.
res=pd.DataFrame({'postcode':test['postcode'],'Accident_risk_index':pred})

### Mean Accident_risk_index grouped by postcode.

In [None]:
# Average the row-level predictions so that each postcode appears once.
res1 = res.groupby('postcode', as_index=False)['Accident_risk_index'].mean()

### Write the CSV file

In [None]:
# Write the per-postcode predictions to Pycaret_Blend.csv without the index column.
res1.to_csv('Pycaret_Blend.csv',index=False)