In [1]:
import pandas as pd
import prepare
import explore

import numpy as np
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw crash export from the working directory; judging by the preview, each row
# appears to describe one person involved in a crash.
df = pd.read_csv(filepath_or_buffer='texasma.csv')
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,Crash ID,Average Daily Traffic Amount,Average Daily Traffic Year,City,County,Crash Date,Crash Death Count,Crash Month,Crash Severity,Crash Time,...,Person Blood Alcohol Content Test Result,Person Death Count,Person Drug Test Result,Person Ethnicity,Person Gender,Person Helmet,Person Injury Severity,Person Not Injured Count,Person Total Injury Count,Person Type
0,11825679,21549,2018,SAN ANTONIO,BEXAR,1/1/11,0,1,B - SUSPECTED MINOR INJURY,2230,...,No Data,0,97 - NOT APPLICABLE,H - HISPANIC,1 - MALE,1 - NOT WORN,B - SUSPECTED MINOR INJURY,0,1,5 - DRIVER OF MOTORCYCLE TYPE VEHICLE
1,11825679,21549,2018,SAN ANTONIO,BEXAR,1/1/11,0,1,B - SUSPECTED MINOR INJURY,2230,...,No Data,0,No Data,H - HISPANIC,2 - FEMALE,1 - NOT WORN,C - POSSIBLE INJURY,0,1,6 - PASSENGER/OCCUPANT ON MOTORCYCLE TYPE VEHICLE


In [3]:
# Run the project's cleaning routine (prepare.cleaned) on the raw crash records.
# NOTE: this overwrites `df`, so re-running this cell passes the already-cleaned frame
# back into prepare.cleaned; reload the CSV first if the cell has to be re-run.
df = prepare.cleaned(df)
df.head()

Unnamed: 0_level_0,city,county,deceased,injured,day,latitude,longitude,age,driver,helmet,male
crash_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 22:30:00,SAN ANTONIO,BEXAR,0,1,SATURDAY,29.358915,-98.566314,27,0,0,1
2011-01-01 22:30:00,SAN ANTONIO,BEXAR,0,1,SATURDAY,29.358915,-98.566314,28,1,0,0
2011-01-03 12:15:00,SAN ANTONIO,BEXAR,1,0,MONDAY,29.537465,-98.424572,53,0,0,1
2011-01-05 21:15:00,SAN ANTONIO,BEXAR,0,1,WEDNESDAY,29.513015,-98.535152,47,0,0,1
2011-01-06 12:10:00,AUSTIN,WILLIAMSON,0,1,THURSDAY,30.476422,-97.76481,19,0,1,1


In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

##### Data Visualization

In [4]:
# Initial hypothesis tests and modeling were done on this split.
# 'deceased' is the target column; presumably the helper stratifies on it — confirm in prepare.py.
# The fixed seed keeps the split reproducible between runs.
train, validate, test = prepare.train_validate_test_split(df, 'deceased', seed=123)

In [None]:
### Classification
# Whether or not a person survives (helmet, driver)

### Clustering
# Which age groups die in accidents (age)
# Where are most accidents occurring (lat/long)

### Timeseries
# predict the number of motorcycle accidents over the next two years (timeseries)
# How likely are you to be killed in a motorcycle accident (injured)

### Univariate Stats

In [None]:
# Column groups handed to the explore helpers: the 0/1 flag columns are treated as
# categorical, age is the only quantitative variable.
cat_vars = [
    'injured',
    'driver',
    'helmet',
    'male',
]
quant_vars = [
    'age',
]

In [None]:
# Univariate exploration of the training split only, using the column groups defined above.
explore.explore_univariate(train, cat_vars, quant_vars)

### Bivariate Stats

In [None]:
# Bivariate exploration on the training split: each variable against the 'deceased' target.
explore.explore_bivariate(train, 'deceased', cat_vars, quant_vars)

### Multivariate

In [None]:
# Multivariate exploration on the training split, with 'deceased' as the target.
explore.explore_multivariate(train, 'deceased', cat_vars, quant_vars)

In [5]:
# Features are limited to the 'driver' and 'helmet' flags; the target is 'deceased'.
# The same two columns are selected from every split so the model sees identical inputs.
X_train, y_train = train[['driver', 'helmet']], train['deceased']
X_validate, y_validate = validate[['driver', 'helmet']], validate['deceased']
X_test, y_test = test[['driver', 'helmet']], test['deceased']

In [None]:
# Print the (X, y) shapes of each split, in train / validate / test order.
for X_part, y_part in ((X_train, y_train), (X_validate, y_validate), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

## Baseline

In [6]:
# Baseline model: always predict the most frequent class of the training target.
# .mode() returns a Series (there can be ties), so take its first entry as the prediction.
baseline = y_train.mode().iloc[0]
# Compare against the computed baseline class instead of a hard-coded 0, so the
# accuracy stays correct if the majority class ever changes.
matches_baseline_prediction = y_train == baseline
baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.96


# Model

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
#prediction, accuracy and class report evaluation function used by the model cells below
def get_metrics_bin(clf, X, y):
    '''
    get_metrics_bin will take in a fitted sklearn classifier model, an X and a y variable and
    utilize the model to make a prediction and then gather accuracy, class report evaluations.
    It prints the accuracy together with the true/false positive and negative rates, where
    row/column index 1 of the confusion matrix is treated as the positive class.
    Credit to @madeleine-capper

    clf: fitted classifier exposing .predict and .score
    X: features to predict on
    y: true labels for X
    return:  a classification report as a pandas DataFrame
    '''
    y_pred = clf.predict(X)
    accuracy = clf.score(X, y)
    # Pin the label order to the classes the model was fit on (when available) so the
    # matrix keeps one row/column per class even if this split and its predictions only
    # contain a single class; otherwise the matrix collapses to 1x1 and conf[1] fails.
    conf = confusion_matrix(y, y_pred, labels=getattr(clf, 'classes_', None))
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
    # Rows are actual classes: conf[0] = actual negatives, conf[1] = actual positives.
    actual_neg = conf[0].sum()
    actual_pos = conf[1].sum()
    # A class with no actual members has undefined rates; report nan explicitly
    # rather than relying on a 0/0 division.
    nan = float('nan')
    tpr = conf[1][1] / actual_pos if actual_pos else nan
    fpr = conf[0][1] / actual_neg if actual_neg else nan
    tnr = conf[0][0] / actual_neg if actual_neg else nan
    fnr = conf[1][0] / actual_pos if actual_pos else nan
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [17]:
### Decision Tree
# Create the model: a shallow tree (max_depth=3) with a fixed random_state
dt = DecisionTreeClassifier(max_depth=3, random_state=123)
# Fit the model on the training split only
dt.fit(X_train, y_train)
# Use and evaluate the model: get_metrics_bin runs the prediction itself, so the separate
# dt.predict(X_train) call whose result was discarded is not needed.
# It prints accuracy and rates, and the returned class report is the cell output.
get_metrics_bin(dt, X_train, y_train)


    The accuracy for our model is 0.9617
    The True Positive Rate is 0.0, The False Positive Rate is 0.0,
    The True Negative Rate is 1.0, and the False Negative Rate is 1.0
    


Unnamed: 0,precision,recall,f1-score,support
0,0.961736,1.0,0.980495,4876.0
1,0.0,0.0,0.0,194.0
accuracy,0.961736,0.961736,0.961736,0.961736
macro avg,0.480868,0.5,0.490247,5070.0
weighted avg,0.924936,0.961736,0.942977,5070.0


In [18]:
# Same decision tree, scored on the validate split.
get_metrics_bin(clf=dt, X=X_validate, y=y_validate)


    The accuracy for our model is 0.9618
    The True Positive Rate is 0.0, The False Positive Rate is 0.0,
    The True Negative Rate is 1.0, and the False Negative Rate is 1.0
    


Unnamed: 0,precision,recall,f1-score,support
0,0.961822,1.0,0.980539,2091.0
1,0.0,0.0,0.0,83.0
accuracy,0.961822,0.961822,0.961822,0.961822
macro avg,0.480911,0.5,0.49027,2174.0
weighted avg,0.925101,0.961822,0.943104,2174.0


In [19]:
# Same decision tree, scored on the held-out test split.
get_metrics_bin(clf=dt, X=X_test, y=y_test)


    The accuracy for our model is 0.9619
    The True Positive Rate is 0.0, The False Positive Rate is 0.0,
    The True Negative Rate is 1.0, and the False Negative Rate is 1.0
    


Unnamed: 0,precision,recall,f1-score,support
0,0.9619,1.0,0.98058,1742.0
1,0.0,0.0,0.0,69.0
accuracy,0.9619,0.9619,0.9619,0.9619
macro avg,0.48095,0.5,0.49029,1811.0
weighted avg,0.925251,0.9619,0.943219,1811.0


In [None]:
# NOTE(review): this cell and the cleaning cells after it use the raw CSV column names,
# but `df` was already replaced by prepare.cleaned(df) near the top of the notebook.
# Presumably this is the scratch work behind prepare.cleaned and expects a freshly
# loaded raw frame — confirm before running top to bottom.
# Drop raw columns that are not used in the analysis.
df = df.drop(
    [
        'Crash ID', 'Average Daily Traffic Amount', 'Average Daily Traffic Year',
        'Highway Number', 'Surface Condition', 'Surface Type',
        'Vehicle Hit and Run Flag', 'Person Blood Alcohol Content Test Result',
        'Person Drug Test Result', 'Crash Month', 'Crash Year', 'Number of Lanes',
        'License Plate State', 'Driver License Type', 'Crash Severity',
        'Unit Death Count', 'Unit Not Injured Count', 'Person Death Count',
        'Person Injury Severity', 'Person Not Injured Count',
        'Person Total Injury Count',
    ],
    axis='columns',
)

In [None]:
# Lower-case every column name so the rename mapping below can use lower-case keys.
df = df.rename(columns=str.lower)

In [None]:
# Shorten the (already lower-cased) raw column names to the names used in the analysis.
df = df.rename(columns={
    'crash date': 'date',
    'day of week': 'day',
    'weather condition': 'weather',
    'vehicle color': 'color',
    'vehicle make': 'make',
    'person age': 'age',
    'person ethnicity': 'ethnicity',
    'crash death count': 'deceased',
    'crash time': 'time',
    'crash total injury count': 'injured',
    'driver license state': 'dl_state',
    'person gender': 'gender',
    'person helmet': 'helmet',
    'person type': 'driver',
})

In [None]:
# Map the raw person-type codes to short labels. An explicit dict pins each code to its
# label; passing two sets pairs them by set iteration order, which is not guaranteed to
# line up, so 'driver' and 'passenger' could end up swapped.
df = df.replace(to_replace = {'5 - DRIVER OF MOTORCYCLE TYPE VEHICLE': 'driver',
                              '6 - PASSENGER/OCCUPANT ON MOTORCYCLE TYPE VEHICLE': 'passenger'})

In [None]:
# Collapse the helmet codes for "not worn" and "unknown if worn" into 'not worn'
# (applied to matching values anywhere in the frame).
df = df.replace(['1 - NOT WORN', '99 - UNKNOWN IF WORN'], 'not worn')

In [None]:
# Collapse the three "worn" helmet codes (damaged / not damaged / unknown damage) into 'worn'.
df = df.replace(['2 - WORN, DAMAGED', '3 - WORN, NOT DAMAGED', '4 - WORN, UNK DAMAGE'], 'worn')

In [None]:
# Shorten the raw gender code to 'female'.
df = df.replace(['2 - FEMALE'], 'female')

In [None]:
# Shorten the raw gender code to 'male'.
df = df.replace(['1 - MALE'], 'male')

In [None]:
# Unknown or missing gender is imputed as 'male' (presumably the most common value — confirm).
df['gender'] = df['gender'].replace(to_replace={'99 - UNKNOWN': 'male', 'No Data': 'male'})

In [None]:
# Missing ethnicity is folded into the existing '98 - OTHER' code.
df['ethnicity'] = df['ethnicity'].replace(to_replace={'No Data': '98 - OTHER'})

In [None]:
# Missing age is filled with 37 (presumably a typical age such as the mean/median — confirm).
df['age'] = df['age'].replace(to_replace={'No Data': 37})

In [None]:
# Use a single lower-case 'unknown' label for missing or unknown vehicle makes.
df['make'] = df['make'].replace(to_replace={'No Data': 'unknown', 'UNKNOWN': 'unknown'})

In [None]:
# Missing vehicle color is mapped to the '99 - UNKNOWN' code.
df['color'] = df['color'].replace(to_replace={'No Data': '99 - UNKNOWN'})

In [None]:
# Missing driver-license state is mapped to the 'UN - UNKNOWN' code.
df['dl_state'] = df['dl_state'].replace(to_replace={'No Data': 'UN - UNKNOWN'})

In [None]:
# Collapse injury counts 2 through 7 to 1 so the column works as a 0/1 "anyone injured" flag.
# Only the listed values are mapped; a count above 7 would be left unchanged.
df['injured'] = df['injured'].replace({2:1, 3:1, 4:1, 5:1, 6:1, 7:1})

In [None]:
# NOTE(review): this turns a crash death count of 2 into 0 (not 1), so a crash with two
# deaths is labelled the same as a crash with none — confirm this is intended.
df['deceased'] = df['deceased'].replace({2:0})

In [None]:
# Missing latitude is replaced with 0 so the column can later be cast to float.
# NOTE(review): 0 is a real coordinate value, so these rows will look like valid
# locations in any lat/long analysis — confirm they are filtered out downstream.
df['latitude'] = df['latitude'].replace({'No Data':0})

In [None]:
# Missing longitude is replaced with 0 so the column can later be cast to float.
# NOTE(review): 0 is a real coordinate value, so these rows will look like valid
# locations in any lat/long analysis — confirm they are filtered out downstream.
df['longitude'] = df['longitude'].replace({'No Data':0})

In [None]:
# Drop the first four characters (the leading code and separator) of each weather value.
df['weather'] = df['weather'].str.slice(start=4)

In [None]:
# Trim leftover leading/trailing whitespace from the weather labels.
df['weather'] = df['weather'].str.strip()

In [None]:
# Drop the first five characters (e.g. the 'UN - ' prefix) of each license-state value.
df['dl_state'] = df['dl_state'].str.slice(start=5)

In [None]:
# Keep only the label that follows the "<code> - " prefix. Splitting on the separator
# works for codes of any length; a fixed [6:] slice turns the '99 - UNKNOWN' placeholder
# (used for missing colors) into 'NKNOWN'. Values without a separator are kept as-is.
df['color'] = df['color'].str.split(' - ', n=1).str[-1]

In [None]:
# Drop the first four characters (the leading code and separator) of each ethnicity value.
df['ethnicity'] = df['ethnicity'].str.slice(start=4)

In [None]:
# Trim leftover leading/trailing whitespace from the ethnicity labels.
df['ethnicity'] = df['ethnicity'].str.strip()

In [None]:
# Turn the crash time (e.g. 2230, or 5 for 00:05) into an 'HH:MM' string:
# cast to text, left-pad with zeros to four characters, then insert the colon.
df['time'] = df['time'].astype(str).str.zfill(4)
df['time'] = df['time'].str.slice(stop=2) + ':' + df['time'].str.slice(start=-2)

In [None]:
# Combine the date string and the 'HH:MM' time string into one timestamp string.
df['crash_date'] = df['date'] +' '+ df['time']

In [None]:
# Parse the combined string into datetimes. No explicit format is given, so pandas infers it.
# NOTE(review): the preview shows dates like '1/1/11' (looks like month/day/2-digit year) —
# consider passing an explicit format once the layout is confirmed.
df['crash_date'] = pd.to_datetime(df['crash_date'])

In [None]:
# Use the crash timestamp as the index and order rows chronologically.
df = df.set_index('crash_date').sort_index()

In [None]:
# Cast the count/age columns to int and the coordinates to float.
convert_dict_int = dict(age=int, deceased=int, injured=int, latitude=float, longitude=float)
df = df.astype(dtype=convert_dict_int)

In [None]:
# get_dummies creates a separate df of indicator columns for the identified columns below.
# Cleaning for the decision tree.
# drop_first takes a single bool that applies to every encoded column (the first category
# of each is dropped); it does not accept a per-column list.
dummy_df = pd.get_dummies(df[['driver', 'helmet', 'gender']], dummy_na=False, drop_first=True)

In [None]:
# Remove the text columns that were dummy-encoded, plus the date/time parts that are
# now represented by the crash_date index.
df = df.drop(['driver', 'helmet', 'gender', 'date', 'time'], axis='columns')

In [None]:
# Attach the indicator columns side by side with the remaining columns.
df = pd.concat((df, dummy_df), axis='columns')

In [None]:
# Give the indicator columns short names.
# NOTE(review): 'driver_passenger' is the indicator for the 'passenger' label, so after this
# rename `driver == 1` appears to mean the person was a passenger — confirm the intended meaning.
df = df.rename(columns = {'driver_passenger':'driver', 'helmet_worn':'helmet', 'gender_male':'male'})

In [None]:
# Store the three indicator columns as integer 0/1 flags.
convert_dict_int = dict(driver=int, helmet=int, male=int)
df = df.astype(dtype=convert_dict_int)

In [None]:
# Preview the first rows after the cleaning steps.
df.head()

In [None]:
# Check dtypes and non-null counts after the casts above.
df.info()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().transpose()

In [None]:
# Print the value frequencies of every column, each under a small header line.
for col_name in df.columns:
    print(f"---- {col_name} ---")
    print(df[col_name].value_counts())

In [None]:
# Location-only view of the data: place names plus coordinates.
df1 = df.loc[:, ['city', 'county', 'latitude', 'longitude']]
df1.head()

In [None]:
# Look for rows whose latitude is still the raw 'No Data' marker.
# NOTE(review): earlier cells replace 'No Data' with 0 and cast latitude to float, so when
# the notebook is run in order this filter cannot match anything — presumably a leftover
# check from before that replacement; confirm whether it should test for 0 instead.
temp_df = df.loc[(df['latitude'] == 'No Data')]
temp_df