import all needed packages

In [26]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linear_model
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from datetime import datetime

import dataset and clean

In [27]:
data_path = 'E:/Research/CE/ML/newdata.xlsx'
#read data from csv file
original_data = pd.read_excel(data_path)
#remove all parentheses, replace all spaces by underscore
original_data.columns = original_data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
#remove all rows where crash_severity (target) is unknown
original_data = original_data[original_data.crash_severity != '99 - UNKNOWN']
#remove some NA rows
original_data = original_data[original_data.adjusted_average_daily_traffic_amount != 'No Data']
#display column titles
original_data.columns 

Index(['crash_id', 'active_school_zone_flag',
       'adjusted_average_daily_traffic_amount',
       'adjusted_percentage_of_average_daily_traffic_for_trucks',
       'at_intersection_flag', 'commercial_motor_vehicle_flag',
       'construction_zone_flag', 'county', 'crash_severity', 'crash_time',
       'crash_year', 'curve_degrees', 'curve_length', 'curve_type',
       'day_of_week', 'first_harmful_event', 'highway_lane_design',
       'inside_shoulder_width_on_divided_highway', 'intersection_related',
       'latitude', 'left_shoulder_type', 'left_shoulder_use',
       'left_shoulder_width', 'light_condition', 'longitude',
       'manner_of_collision', 'median_type', 'median_width', 'number_of_lanes',
       'object_struck', 'right_shoulder_type', 'right_shoulder_use',
       'right_shoulder_width', 'road_class', 'roadway_alignment',
       'roadway_function', 'roadway_part', 'roadway_type', 'rural_flag',
       'rural_urban_type', 'speed_limit', 'surface_type', 'surface_width',
   

In [28]:
#continue to remove some NA rows
original_data = original_data[original_data.rural_urban_type != 'No Data']
#see general information about the dataset
print(original_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8859 entries, 0 to 13947
Data columns (total 58 columns):
crash_id                                                   8859 non-null int64
active_school_zone_flag                                    8859 non-null object
adjusted_average_daily_traffic_amount                      8859 non-null object
adjusted_percentage_of_average_daily_traffic_for_trucks    8859 non-null object
at_intersection_flag                                       8859 non-null bool
commercial_motor_vehicle_flag                              8859 non-null object
construction_zone_flag                                     8859 non-null object
county                                                     8859 non-null object
crash_severity                                             8859 non-null object
crash_time                                                 8859 non-null int64
crash_year                                                 8859 non-null int64
curve_degrees    

In [29]:
#some numerical features were read as object, convert it back to numerical
original_data['adjusted_average_daily_traffic_amount']=original_data['adjusted_average_daily_traffic_amount'].apply(pd.to_numeric, errors='coerce')
original_data['adjusted_percentage_of_average_daily_traffic_for_trucks']=original_data['adjusted_percentage_of_average_daily_traffic_for_trucks'].apply(pd.to_numeric, errors='coerce')
original_data['longitude']=original_data['longitude'].apply(pd.to_numeric, errors='coerce')
original_data['latitude']=original_data['latitude'].apply(pd.to_numeric, errors='coerce')
original_data['median_width']=original_data['median_width'].apply(pd.to_numeric, errors='coerce')
original_data['number_of_lanes']=original_data['number_of_lanes'].apply(pd.to_numeric, errors='coerce')
original_data['right_shoulder_width']=original_data['right_shoulder_width'].apply(pd.to_numeric, errors='coerce')
#check results
print(original_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8859 entries, 0 to 13947
Data columns (total 58 columns):
crash_id                                                   8859 non-null int64
active_school_zone_flag                                    8859 non-null object
adjusted_average_daily_traffic_amount                      8859 non-null int64
adjusted_percentage_of_average_daily_traffic_for_trucks    8859 non-null float64
at_intersection_flag                                       8859 non-null bool
commercial_motor_vehicle_flag                              8859 non-null object
construction_zone_flag                                     8859 non-null object
county                                                     8859 non-null object
crash_severity                                             8859 non-null object
crash_time                                                 8859 non-null int64
crash_year                                                 8859 non-null int64
curve_degrees    

In [30]:
#all condidered features, removed some due to large portion of NA
crash_features = [ 'active_school_zone_flag',
       'adjusted_average_daily_traffic_amount',
       'adjusted_percentage_of_average_daily_traffic_for_trucks',
       'at_intersection_flag', 'commercial_motor_vehicle_flag',
       'construction_zone_flag', 'county', 'crash_severity', 'crash_time',
       'crash_year', 
       'day_of_week', 'first_harmful_event', 'highway_lane_design',
       'inside_shoulder_width_on_divided_highway', 'intersection_related',
       'latitude', 'left_shoulder_type', 'left_shoulder_use',
       'left_shoulder_width', 'light_condition', 'longitude',
       'manner_of_collision', 'median_type', 'median_width', 'number_of_lanes',
       'right_shoulder_type', 'right_shoulder_use',
       'right_shoulder_width', 'road_class', 'roadway_alignment',
       'roadway_function', 'roadway_part', 'roadway_type', 
       'speed_limit', 'surface_type', 
       'traffic_control_type', 'weather_condition', 'contributing_factor_1',
       'driver_license_class', 'driver_license_type', 'vehicle_body_style',
       'vehicle_travel_direction', 'person_age', 'person_ethnicity',
       'person_gender', 'person_restraint_used']

X = original_data[crash_features]
X.head() #display top few rows 

Unnamed: 0,active_school_zone_flag,adjusted_average_daily_traffic_amount,adjusted_percentage_of_average_daily_traffic_for_trucks,at_intersection_flag,commercial_motor_vehicle_flag,construction_zone_flag,county,crash_severity,crash_time,crash_year,...,weather_condition,contributing_factor_1,driver_license_class,driver_license_type,vehicle_body_style,vehicle_travel_direction,person_age,person_ethnicity,person_gender,person_restraint_used
0,No,23336,35.0,False,No,No,Callahan,N - NOT INJURED,929,2010,...,Rain,,Class C,Driver License,"Passenger Car, 2-Door",East,16,White,Female,Shoulder & Lap Belt
1,No,28778,31.4,False,Yes,No,Callahan,N - NOT INJURED,2025,2010,...,Cloudy,Failed To Control Speed,Class C,Driver License,"Passenger Car, 2-Door",West,19,White,Male,Shoulder & Lap Belt
2,No,19832,38.2,False,No,No,Callahan,B - NON-INCAPACITATING INJURY,1318,2010,...,Fog,Failed To Drive In Single Lane,Class C,Driver License,"Passenger Car, 4-Door",East,18,Black,Female,Shoulder & Lap Belt
3,No,28778,31.4,False,No,No,Callahan,N - NOT INJURED,850,2010,...,Snow,Other (Explain In Narrative),Class C,Driver License,Sport Utility Vehicle,West,20,White,Male,Shoulder & Lap Belt
4,No,23336,35.0,False,No,No,Callahan,N - NOT INJURED,1854,2010,...,Clear,,Class C,Driver License,"Passenger Car, 2-Door",West,18,Hispanic,Female,Shoulder & Lap Belt


In [31]:
features = X.copy() #make a copy
print(features.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8859 entries, 0 to 13947
Data columns (total 46 columns):
active_school_zone_flag                                    8859 non-null object
adjusted_average_daily_traffic_amount                      8859 non-null int64
adjusted_percentage_of_average_daily_traffic_for_trucks    8859 non-null float64
at_intersection_flag                                       8859 non-null bool
commercial_motor_vehicle_flag                              8859 non-null object
construction_zone_flag                                     8859 non-null object
county                                                     8859 non-null object
crash_severity                                             8859 non-null object
crash_time                                                 8859 non-null int64
crash_year                                                 8859 non-null int64
day_of_week                                                8859 non-null object
first_harmful_ev

In [32]:
#function for timer, return the running time
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

Use Label Encoder to encode categorical data

In [33]:
#label encode crash_severity (target) manually
cleanup = {"crash_severity": {"N - NOT INJURED": 0, "B - NON-INCAPACITATING INJURY": 1, "C - POSSIBLE INJURY": 2, 
                             "A - SUSPECTED SERIOUS INJURY": 3, "K - KILLED": 4}}
features.replace(cleanup, inplace=True)
target = features["crash_severity"] 
features.drop(["crash_severity"], inplace=True, axis=1)
feature_names = features.columns.tolist()
#divide columns into numerical/categorical
categorical_subset = features.select_dtypes("object")
#label encode other categorical features
label_fields = list(categorical_subset.columns)
label_encoder = preprocessing.LabelEncoder() 
features[label_fields] = label_encoder.fit_transform(label_fields)
features.head()

Unnamed: 0,active_school_zone_flag,adjusted_average_daily_traffic_amount,adjusted_percentage_of_average_daily_traffic_for_trucks,at_intersection_flag,commercial_motor_vehicle_flag,construction_zone_flag,county,crash_time,crash_year,day_of_week,...,weather_condition,contributing_factor_1,driver_license_class,driver_license_type,vehicle_body_style,vehicle_travel_direction,person_age,person_ethnicity,person_gender,person_restraint_used
0,0,23336,35.0,False,1,2,4,929,2010,5,...,32,3,6,7,30,31,16,18,19,20
1,0,28778,31.4,False,1,2,4,2025,2010,5,...,32,3,6,7,30,31,19,18,19,20
2,0,19832,38.2,False,1,2,4,1318,2010,5,...,32,3,6,7,30,31,18,18,19,20
3,0,28778,31.4,False,1,2,4,850,2010,5,...,32,3,6,7,30,31,20,18,19,20
4,0,23336,35.0,False,1,2,4,1854,2010,5,...,32,3,6,7,30,31,18,18,19,20


In [34]:
float_features = features.xs(feature_names,axis=1).values
#standardize the continuous features
scaler = StandardScaler()
float_scaled = scaler.fit_transform(float_features)
features[feature_names] = float_scaled
features.head()



Unnamed: 0,active_school_zone_flag,adjusted_average_daily_traffic_amount,adjusted_percentage_of_average_daily_traffic_for_trucks,at_intersection_flag,commercial_motor_vehicle_flag,construction_zone_flag,county,crash_time,crash_year,day_of_week,...,weather_condition,contributing_factor_1,driver_license_class,driver_license_type,vehicle_body_style,vehicle_travel_direction,person_age,person_ethnicity,person_gender,person_restraint_used
0,0.0,0.700426,1.64815,-0.624392,0.0,0.0,0.0,-0.765932,-1.665485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.68639,0.0,0.0,0.0
1,0.0,1.066361,1.338284,-0.624392,0.0,0.0,0.0,1.151681,-1.665485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.526138,0.0,0.0,0.0
2,0.0,0.464807,1.923587,-0.624392,0.0,0.0,0.0,-0.085319,-1.665485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.211371,0.0,0.0,0.0
3,0.0,1.066361,1.338284,-0.624392,0.0,0.0,0.0,-0.904154,-1.665485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.263648,0.0,0.0,0.0
4,0.0,0.700426,1.64815,-0.624392,0.0,0.0,0.0,0.852492,-1.665485,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.211371,0.0,0.0,0.0


In [35]:
y = target.values
X = features.values
#split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

train model

In [36]:
# train linear regression model
cls = linear_model.LinearRegression()
start_time = timer(None) 
cls.fit(X_train, y_train)
timer(start_time) 
y_pred = cls.predict(X_test)


 Time taken: 0 hours 0 minutes and 0.06 seconds.


In [37]:
#calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

#takes in a model, trains the model, and evaluates the model on the test set
def fit_and_evaluate(model):
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions and evalute
    model_pred = model.predict(X_test)
    model_mae = mae(y_test, model_pred)
    
    # Return the performance metric
    return model_mae


print(fit_and_evaluate(cls))

0.7904534432488997


In [38]:
start_time = timer(None)
#random forest method
random_forest = RandomForestRegressor(random_state=60)
fit_and_evaluate(random_forest)
timer(start_time)





 Time taken: 0 hours 0 minutes and 0.53 seconds.


In [39]:
print(fit_and_evaluate(random_forest))

0.7865462753950337
