# Predictive Model

#### Packages

In [None]:
# file manipualtion
from pathlib import Path
import numpy as np
import pandas as pd
import pickle

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

# balancing dataset
from imblearn.over_sampling import SMOTE

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

# visualistaion
import matplotlib.pyplot as plt

# warning supression
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

#### Directory  creation and file management

In [2]:
# defining the directory to original data
cwd = Path('./')
root_dir = cwd.resolve().parent

original_data_dir = root_dir / 'data'
additional_data_dir = root_dir / 'additional_data'

# list the .csv files for the project
for file in original_data_dir.glob('*.csv'):
    print(file)

/home/jake/Documents/TrafficAccidents/data/vehicles2019.csv
/home/jake/Documents/TrafficAccidents/data/accidents2019.csv
/home/jake/Documents/TrafficAccidents/data/casualties2019.csv


#### Testing classification models

In [3]:
def get_models():
    models = {}
    models['decision_tree'] = DecisionTreeClassifier()
    models['knn'] = KNeighborsClassifier()
    models['logistic_regression'] = LogisticRegression()
    models['bayes'] = MultinomialNB()
    models['rf'] = RandomForestClassifier()
    return models

models = get_models()

models

{'decision_tree': DecisionTreeClassifier(),
 'knn': KNeighborsClassifier(),
 'logistic_regression': LogisticRegression(),
 'bayes': MultinomialNB(),
 'rf': RandomForestClassifier()}

In [4]:
# import cleaned accidents data frame from the pickle file
accidents = pd.read_pickle(additional_data_dir / 'accidents_cleaned.pkl')
vehicles = pd.read_csv(original_data_dir / 'vehicles2019.csv')
casualties = pd.read_csv(original_data_dir / 'casualties2019.csv')

# convert column names to lowercase for ease of indexing
def lower_columns(df):
    """
    Defintion:
        convert column names to lower case
    """
    df.columns = map(str.lower, df.columns)
    
# converting all column names to lower case
lower_columns(vehicles)
lower_columns(casualties)

accidents = pd.merge(accidents, vehicles, on='accident_index')
accidents = pd.merge(accidents, casualties, on='accident_index')
accidents.columns

  vehicles = pd.read_csv(original_data_dir / 'vehicles2019.csv')
  casualties = pd.read_csv(original_data_dir / 'casualties2019.csv')


Index(['accident_index', 'longitude', 'latitude', 'police_force',
       'accident_severity', 'number_of_vehicles', 'number_of_casualties',
       'day_of_week', 'local_authority_(district)',
       'local_authority_(highway)', '1st_road_class', '1st_road_number',
       'road_type', 'speed_limit', 'junction_detail', 'junction_control',
       '2nd_road_class', '2nd_road_number',
       'pedestrian_crossing-human_control',
       'pedestrian_crossing-physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards',
       'urban_or_rural_area', 'did_police_officer_attend_scene_of_accident',
       'lsoa_of_accident_location', 'district', 'converted_date',
       'converted_time', 'datetime', 'decimal_time', 'day_of_year',
       'vehicle_reference_x', 'vehicle_type', 'towing_and_articulation',
       'vehicle_manoeuvre', 'vehicle_location-restricted_lane',
       'junction_location', 'skidding_a

# Feature Selection

### Test for columns with many values less than 0

Any column with more than 10% of values being less than 0 is dropped.

In [5]:
# exclude columns with non-numerical data types
test = accidents.select_dtypes(exclude=['object', 'datetime64'])

ratios = ((test < 0).sum() / test.sum()).to_dict()

to_delete = []
for k, v in ratios.items():
    if v > 0.10:
        to_delete.append(k)

print(to_delete)

['junction_control', '2nd_road_class', 'towing_and_articulation', 'propulsion_code', 'driver_home_area_type', 'casualty_home_area_type']


In [6]:
# manually determine data type of features
numeric = ['number_of_vehicles', 'number_of_casualties', 'age_of_driver',
           'engine_capacity_(cc)', 'age_of_vehicle', 'age_of_casualty']

binary = ['was_vehicle_left_hand_drive?', 'sex_of_driver', 'sex_of_casualty']

nominal = ['1st_road_class', 'road_type', 'junction_detail', 'light_conditions',
           'weather_conditions', 'road_surface_conditions',
           'urban_or_rural_area',
           'vehicle_type', 'vehicle_manoeuvre',
           'junction_location', 'journey_purpose_of_driver', 'casualty_type']

ordinal = ['speed_limit', 'day_of_week', 'day_of_year', 'age_band_of_driver', 'age_band_of_casualty']

target = 'accident_severity'

In [7]:
class FeatureSelection:

    def __init__(self, df, cat_features, numeric_features, target):
        self.df = df
        self.cat_features = cat_features
        self.numeric_features = numeric_features
        self.target = target

    def remove_negatives(self):
        features = self.numeric_features + self.cat_features
        filtered = self.df.filter(features + [self.target], axis=1)
        filtered = filtered[(filtered > 0).all(1)]
        return filtered

    def get_feature_scores(self, dtype='cat', k='all'):
        filtered_dataset = self.remove_negatives()
        if dtype == 'cat':
            features = self.cat_features
            selector = SelectKBest(f_classif, k=k)
        if dtype == 'num':
            features = self.numeric_features
            selector = SelectKBest(f_classif, k=k)
        selector.fit(filtered_dataset[features], filtered_dataset[self.target])

        cols = selector.get_support(indices=True)
        p_values = selector.pvalues_

        p_values = pd.Series(p_values, dtype='object')
        scores = -np.log10(selector.pvalues_)
        return cols, scores

    def plot_features(self):
        cat_features, cat_scores = self.get_feature_scores(dtype='cat', k='all')
        num_features, num_scores = self.get_feature_scores(dtype='num', k='all')
        features = cat_features + num_features
        scores = list(cat_scores) + list(num_scores)
        plt.bar(range(len(features)), scores)
        plt.xticks(range(len(features)), features, rotation='vertical')
        plt.show()
    
    def get_new_X(self):
        filtered_dataset = self.remove_negatives()
        cat, _ = self.get_feature_scores(dtype='cat', k=12)
        numeric, _ = self.get_feature_scores(dtype='num', k=2)
        target = self.target
        cols = np.concatenate((cat, numeric))
        target_i = filtered_dataset.columns.get_loc(self.target)
        cols = np.append(cols, target_i)
        new_df = filtered_dataset.iloc[:, cols]
        
        return new_df

fs = FeatureSelection(accidents, nominal+ordinal+binary, numeric, target)

filtered = fs.get_new_X()


# Constructing the model

We see that the model is heavily imbalanced towards accident_severity label 3. 

Hence, SMOTE oversampling should be used to balance the training data for more accurate training.

In [8]:
filtered.accident_severity.value_counts()

3    31739
2     7121
1      602
Name: accident_severity, dtype: int64

In [9]:
X_data = filtered.iloc[:, :-1]
y_data = filtered.iloc[:, -1]

# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25)

# data set shapes
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (29596, 14)
y_train shape: (29596,)
X_test shape: (9866, 14)
y_test shape: (9866,)


In [10]:
print("Shapes prior to over-sampling: \n")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

print("\n\nShapes after over-sampling: \n")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

Shapes prior to over-sampling: 

X_train shape: (29596, 14)
y_train shape: (29596,)


Shapes after over-sampling: 

X_train shape: (71460, 14)
y_train shape: (71460,)


In [11]:
print("Label ratio after over-sampling")
y_train.value_counts()

Label ratio after over-sampling


3    23820
2    23820
1    23820
Name: accident_severity, dtype: int64

#### Evaluate model

Here we use a repeated stratified k-fold cross-validation method, so that the training data is split into k folds with equal proportions per label. Each k-1 folds are evaluated using the final fold as validation, and the process is repeated three times, hence repeated stratified k-fold.

Defining a stacked model and appending to the list of individual models

In [12]:
def get_stacking():
    level0 = []
    for k, v in models.items():
        level0.append((k, v))
    level1 = LogisticRegression()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

models['stack'] = get_stacking()

get_stacking()

StackingClassifier(cv=5,
                   estimators=[('decision_tree', DecisionTreeClassifier()),
                               ('knn', KNeighborsClassifier()),
                               ('logistic_regression', LogisticRegression()),
                               ('bayes', MultinomialNB()),
                               ('rf', RandomForestClassifier()),
                               ('stack',
                                StackingClassifier(cv=5,
                                                   estimators=[('decision_tree',
                                                                DecisionTreeClassifier()),
                                                               ('knn',
                                                                KNeighborsClassifier()),
                                                               ('logistic_regression',
                                                                LogisticRegression()),
                             

### Training all models in the models dictionary

In [13]:
models

{'decision_tree': DecisionTreeClassifier(),
 'knn': KNeighborsClassifier(),
 'logistic_regression': LogisticRegression(),
 'bayes': MultinomialNB(),
 'rf': RandomForestClassifier(),
 'stack': StackingClassifier(cv=5,
                    estimators=[('decision_tree', DecisionTreeClassifier()),
                                ('knn', KNeighborsClassifier()),
                                ('logistic_regression', LogisticRegression()),
                                ('bayes', MultinomialNB()),
                                ('rf', RandomForestClassifier())],
                    final_estimator=LogisticRegression())}

In [14]:
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10,
                                 n_repeats=3,
                                 random_state=1)
    scores = cross_val_score(model, X, y,
                             scoring='accuracy',
                             cv=cv,
                             n_jobs=-1,
                             error_score='raise')
    return scores

In [15]:
results, names, summary_results = [], [], []
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    summary_results.append({name: {'mean': np.mean(scores), 'std': np.std(scores)}})
    print(f"{name} {np.mean(scores):.4f}, {np.std(scores):.4f}")

decision_tree 0.8986, 0.0036
knn 0.8783, 0.0047
logistic_regression 0.4882, 0.0059
bayes 0.3960, 0.0067
rf 0.9374, 0.0028


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

stack 0.9519, 0.0029


A stacked model has reached the highest cross-validated score, reaching 95.2%

## Fitting the model

In [16]:
stacked_model = models['rf']

stacked_model.fit(X_train, y_train)

StackingClassifier(cv=5,
                   estimators=[('decision_tree', DecisionTreeClassifier()),
                               ('knn', KNeighborsClassifier()),
                               ('logistic_regression', LogisticRegression()),
                               ('bayes', MultinomialNB()),
                               ('rf', RandomForestClassifier())],
                   final_estimator=LogisticRegression())

In [17]:
stacked_model.score(X_test, y_test)

0.8702615041556861

## Saving model for later use

In [23]:
filename = 'prediction_model.pkl'
pickle.dump(stacked_model, open(filename, 'wb'))

Reopening model using pickle package.

In [24]:
with open('prediction_model.pkl', 'rb') as model:
    prediction_model = pickle.load(model)