In [1]:
# importing libraries pandas, matplotlib and seaborn to load dataframe and visualization of data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing “train_test_split” from “sklearn.model_selection”. 
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Importing tree, Pipeline, GridSearchCV from sklearn
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.metrics import make_scorer,f1_score, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [7]:
%matplotlib inline

In [8]:
# from google.colab import drive

In [9]:
# drive.mount('/content/drive')

In [10]:
%%time
# This is a countrywide car accident dataset, which covers 49 states of the USA. 
# The accident data are collected from February 2016 to June 2020, 
US_Accidents_Feb16_to_June20_data =pd.read_csv('../FinalProject/data/US_Accidents_June20.csv')
US_Accidents_Feb16_to_June20_data.shape

Wall time: 25.9 s


(3513617, 49)

###### County level demographics - Population distribution among different age groups at county level. 

In [120]:
# Demographic data for entire US
US_demographic_data =pd.read_csv('../FinalProject/data/cc-est2019-alldata.csv',encoding = "ISO-8859-1", engine='python')
US_demographic_data.shape

(716376, 80)

In [12]:
US_demographic_data['STNAME'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [13]:
# Function to filter dataframe by state
def filter_dataframe_by_state(df,state):
    updated_df = df[df['STNAME']==state]
    return updated_df

In [14]:
# States_records =['California']
# States_records =['Virginia']
# States_records =['Oklahoma']
# States_records =['Missouri']
# States_records =['Kansas']

In [15]:
US_demographic_data_by_state = filter_dataframe_by_state(US_demographic_data,'California')
US_demographic_data_by_state.shape

(13224, 80)

######  US Accidents Feb16 to June20_data-  by states. 


######   Concat US Accidents Feb16 to June20_data-  by states. 

In [16]:
US_Accidents_Feb16_to_June20_data.State.unique()

array(['OH', 'WV', 'CA', 'FL', 'GA', 'SC', 'NE', 'IA', 'IL', 'MO', 'WI',
       'IN', 'MI', 'NJ', 'NY', 'CT', 'MA', 'RI', 'NH', 'PA', 'KY', 'MD',
       'VA', 'DC', 'DE', 'TX', 'WA', 'OR', 'AL', 'TN', 'NC', 'KS', 'LA',
       'OK', 'CO', 'UT', 'AZ', 'MN', 'MS', 'NV', 'ME', 'AR', 'ID', 'VT',
       'NM', 'ND', 'WY', 'SD', 'MT'], dtype=object)

In [17]:
States_records =['CA']
# States_records =['VA']
# States_records =['OK']
# States_records =['MO']
# States_records =['KS']


In [18]:
# Function to filter dataframe by state
def filter_dataframe_by_state_name(df,state):
    updated_df = df[df['State']==state]
    return updated_df

In [19]:
# function filter US accidents by States
def concat_dataframe_by_states_US_Accidents(dataframe,States_records):
    data = []
    for state in States_records:
        data.append(filter_dataframe_by_state_name(dataframe,state))

    merged_t = pd.concat(data, axis=0)
    return merged_t
    

In [20]:
US_Accidents_Feb16_to_June20_data_by_states =concat_dataframe_by_states_US_Accidents(US_Accidents_Feb16_to_June20_data,States_records)
US_Accidents_Feb16_to_June20_data_by_states.shape

(816825, 49)

####  Feature Reduction for US_Accidents_Feb16_to_June20_data_by_states

In [18]:
# For US_Accidents_Feb16_to_June20_data_by_states dataset
#US_Accidents_Feb16_to_June20_data_by_states.shape

In [19]:
# del(US_Accidents_Feb16_to_June20_data)
# del(US_demographic_data)

In [21]:
# Retrieving records that has 70% of total records as null values
US_Accidents_Feb16_to_June20_data_by_states.isnull().sum().sort_values().tail(20)

Astronomical_Twilight         8
Nautical_Twilight             8
City                          8
Timezone                    307
Zipcode                     307
Airport_Code                328
Weather_Timestamp         12535
Pressure(in)              14783
Weather_Condition         16391
Visibility(mi)            16594
Wind_Direction            16729
Temperature(F)            19634
Humidity(%)               21010
Wind_Speed(mph)          133385
TMC                      330882
Wind_Chill(F)            477290
Precipitation(in)        485513
End_Lat                  485943
End_Lng                  485943
Number                   641322
dtype: int64

In [22]:
# Dropping features because 70% of data is null
US_Accidents_Feb16_to_June20_data_by_states.drop(['Precipitation(in)','Number', 'End_Lng', 'End_Lat','Wind_Chill(F)','Wind_Speed(mph)','TMC'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape


(816825, 42)

In [23]:
# Dropping features which doesn't add much value for model creation
US_Accidents_Feb16_to_June20_data_by_states.drop(['Country','Zipcode','ID','Source','Description','Street','Weather_Timestamp','Turning_Loop','City'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape
# Traffic_Signal, Turning_Loop


(816825, 33)

In [24]:
US_Accidents_Feb16_to_June20_data_by_states['Start_Time'] = pd.to_datetime(US_Accidents_Feb16_to_June20_data_by_states['Start_Time'])
US_Accidents_Feb16_to_June20_data_by_states['Start_Time_Year'] = US_Accidents_Feb16_to_June20_data_by_states['Start_Time'].dt.year
US_Accidents_Feb16_to_June20_data_by_states['Start_Time_Year'].unique()

array([2016, 2017, 2020, 2019, 2018], dtype=int64)

In [25]:
# Dropping features 'Start_Time' and 'End_Time'
US_Accidents_Feb16_to_June20_data_by_states.drop(['Start_Time','End_Time'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape

(816825, 32)

In [26]:
# Dropping features 'Start_Lat' and 'Start_Lng'
US_Accidents_Feb16_to_June20_data_by_states.drop(['Start_Lat','Start_Lng'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape

(816825, 30)

In [27]:

# Dropping features where 99% of data belongs to only one category there is no variance in data
# 'Turning_Loop', 'Traffic_Calming', 'Stop', 'Station', 'Roundabout', 'Railway','No_Exit','Give_Way','Crossing', 'Bump', 'Amenity'
US_Accidents_Feb16_to_June20_data_by_states.drop(['Traffic_Calming', 'Stop', 'Station', 'Roundabout', 'Railway','No_Exit','Give_Way','Crossing', 'Bump', 'Amenity'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape


(816825, 20)

In [28]:
# Dropping features 'Timezone', 'Airport_Code'
# Dropping feature Wind_Direction - can be used for SVM later
US_Accidents_Feb16_to_June20_data_by_states.drop(['Wind_Direction','Airport_Code'], axis=1, inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape

(816825, 18)

### Feature transformation 

In [29]:
US_Accidents_Feb16_to_June20_data_by_states.columns

Index(['Severity', 'Distance(mi)', 'Side', 'County', 'State', 'Timezone',
       'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Weather_Condition', 'Junction', 'Traffic_Signal', 'Sunrise_Sunset',
       'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
       'Start_Time_Year'],
      dtype='object')

#### Converting categorical values with two outcomes into binary value 

In [30]:
day_night_mapping = {'Day':0,'Night':1}

In [31]:
# mapping categorical values Day = 0 and night= 1
# 'Sunrise_Sunset', 'Civil_Twilight','Nautical_Twilight', 'Astronomical_Twilight'
US_Accidents_Feb16_to_June20_data_by_states.Sunrise_Sunset.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.Civil_Twilight.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.Nautical_Twilight.replace(dict(day_night_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.Astronomical_Twilight.replace(dict(day_night_mapping), inplace=True)


In [32]:
boolean_mapping = {True:1,False:0}
data_mapping = {'R':0,'L':1}

In [33]:
US_Accidents_Feb16_to_June20_data_by_states.Traffic_Signal.unique()

array([False,  True])

In [34]:
US_Accidents_Feb16_to_June20_data_by_states.Traffic_Signal.replace(dict(boolean_mapping),inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.Junction.replace(dict(boolean_mapping), inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.Side.replace(dict(data_mapping), inplace=True)

In [35]:
# Finding out columns with categorical values
US_Accidents_Feb16_to_June20_data_by_states.dtypes.unique() # unique data types 
US_Accidents_Feb16_to_June20_data_by_states.select_dtypes(exclude=['int','float']).columns #  Fetching columns with categorical values

Index(['Severity', 'Side', 'County', 'State', 'Timezone', 'Weather_Condition',
       'Junction', 'Traffic_Signal', 'Start_Time_Year'],
      dtype='object')

In [36]:
# Finding out columns with numerical values
US_Accidents_Feb16_to_June20_data_by_states.select_dtypes(include=['int','float']).columns

Index(['Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight'],
      dtype='object')

In [37]:
US_Accidents_Feb16_to_June20_data_by_states.shape

(816825, 18)

In [38]:
# Dropping null values from the dataframe
US_Accidents_Feb16_to_June20_data_by_states.dropna(inplace=True)
US_Accidents_Feb16_to_June20_data_by_states.shape

(790455, 18)

### Transformation and cleaning for demographic dataset

In [39]:
US_demographic_data_by_state.shape, US_demographic_data_by_state.columns

((13224, 80),
 Index(['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP',
        'TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE',
        'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
        'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE', 'WAC_MALE', 'WAC_FEMALE',
        'BAC_MALE', 'BAC_FEMALE', 'IAC_MALE', 'IAC_FEMALE', 'AAC_MALE',
        'AAC_FEMALE', 'NAC_MALE', 'NAC_FEMALE', 'NH_MALE', 'NH_FEMALE',
        'NHWA_MALE', 'NHWA_FEMALE', 'NHBA_MALE', 'NHBA_FEMALE', 'NHIA_MALE',
        'NHIA_FEMALE', 'NHAA_MALE', 'NHAA_FEMALE', 'NHNA_MALE', 'NHNA_FEMALE',
        'NHTOM_MALE', 'NHTOM_FEMALE', 'NHWAC_MALE', 'NHWAC_FEMALE',
        'NHBAC_MALE', 'NHBAC_FEMALE', 'NHIAC_MALE', 'NHIAC_FEMALE',
        'NHAAC_MALE', 'NHAAC_FEMALE', 'NHNAC_MALE', 'NHNAC_FEMALE', 'H_MALE',
        'H_FEMALE', 'HWA_MALE', 'HWA_FEMALE', 'HBA_MALE', 'HBA_FEMALE',
        'HIA_MALE', 'HIA_FEMALE', 'HAA_MALE', 'HAA_FEMALE', 'HNA_MALE',
        'HNA_FEMALE

In [40]:
# Index for columns  'WAC_MALE'  'HNAC_FEMALE'
US_demographic_data_by_state.columns.get_indexer(['WAC_MALE','HNAC_FEMALE'])


array([22, 79], dtype=int64)

In [41]:
# We are interested in analysis based on Origin  not on subcategory  of each cultural origin group
# Dropping columns from 'WAC_MALE' to 'HNAC_FEMALE'
columns_drop = US_demographic_data_by_state.columns[22:80].values.tolist()
US_demographic_data_by_state.drop(columns_drop,1,inplace=True)
US_demographic_data_by_state.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(13224, 22)

In [42]:
# Drop demographic records with YEAR column from 1 to 7 representing 2010 to 2015
US_demographic_data_by_state = US_demographic_data_by_state[~(US_demographic_data_by_state["YEAR"].isin(['1','2','3','4','5','6','7','8']))]
US_demographic_data_by_state.shape

(4408, 22)

In [43]:
# Replace YEAR column with values  9 to 12 representing year to scale 2016 to 2020
US_demographic_data_by_state.YEAR.replace(dict(zip(range(9, 13), range(2016, 2020))), inplace=True)
US_demographic_data_by_state.shape

(4408, 22)

In [44]:
# Summation of following features by age group
Variable_Summation = ['TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE',
'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE']

In [45]:
# Grouping by State, County, Year and Agegroup
US_demographic_data_by_state = US_demographic_data_by_state.groupby(['STNAME','CTYNAME','YEAR','AGEGRP'])[Variable_Summation].sum().reset_index()
US_demographic_data_by_state.shape

(4408, 19)

In [46]:
US_demographic_data_by_state = pd.pivot_table(US_demographic_data_by_state,index=['STNAME','CTYNAME','YEAR'], columns=['AGEGRP'], aggfunc=np.sum).reset_index()
US_demographic_data_by_state.shape

(232, 288)

In [47]:
US_demographic_data_by_state.columns = [level_0 + '_' + str(level_1) for level_0, level_1 in zip(US_demographic_data_by_state.columns.get_level_values(0), US_demographic_data_by_state.columns.get_level_values(1))]
US_demographic_data_by_state.shape

(232, 288)

In [48]:
US_demographic_data_by_state.head(1)

Unnamed: 0,STNAME_,CTYNAME_,YEAR_,AA_FEMALE_0,AA_FEMALE_1,AA_FEMALE_2,AA_FEMALE_3,AA_FEMALE_4,AA_FEMALE_5,AA_FEMALE_6,...,WA_MALE_9,WA_MALE_10,WA_MALE_11,WA_MALE_12,WA_MALE_13,WA_MALE_14,WA_MALE_15,WA_MALE_16,WA_MALE_17,WA_MALE_18
0,California,Alameda County,2016,260648,12899,14190,13394,12936,14450,22457,...,28692,30933,30161,29280,24765,20012,13565,8406,5736,5955


### Bring both the dataframes ( US_demographic_data and US_Accidents_Feb16_to_June20_data_by_states) on same scale for easy merging

#### US_Accidents 'Start_Time_Year' and US_demographic 'Year'

In [49]:

US_Accidents_Feb16_to_June20_data_by_states['Start_Time_Year'].unique(), US_demographic_data_by_state.YEAR_.unique()

(array([2016, 2017, 2020, 2019, 2018], dtype=int64),
 array([2016, 2017, 2018, 2019], dtype=int64))

#### US_Accidents 'states' and US_demographic 'STNAME_'

In [50]:
US_demographic_data_by_state.STNAME_.unique(), US_Accidents_Feb16_to_June20_data_by_states.State.unique()

(array(['California'], dtype=object), array(['CA'], dtype=object))

In [51]:
state_mapping = {'Alabama':'AL', 'Alaska':'AK', 'Arizona':'AZ', 'Arkansas':'AR', 'California':'CA',
        'Colorado':'CO', 'Connecticut':'CT', 'Delaware':'DE', 'District of Columbia':'DC',
        'Florida':'FL', 'Georgia':'GA', 'Hawaii':'HI', 'Idaho':'ID', 'Illinois':'IL', 'Indiana':'IN',
        'Iowa':'IA', 'Kansas':'KS', 'Kentucky':'KY', 'Louisiana':'LA', 'Maine':'ME', 'Maryland':'MD',
        'Massachusetts':'MA', 'Michigan':'MI', 'Minnesota':'MN', 'Mississippi':'MS',
        'Missouri':'MO', 'Montana':'MT', 'Nebraska':'NE', 'Nevada':'NV', 'New Hampshire':'NH',
        'New Jersey':'NJ', 'New Mexico':'NM', 'New York':'NY', 'North Carolina':'NC',
        'North Dakota':'ND', 'Ohio':'OH', 'Oklahoma':'OK', 'Oregon':'OR', 'Pennsylvania':'PA',
        'Rhode Island':'RI', 'South Carolina':'SC', 'South Dakota':'SD', 'Tennessee':'TN',
        'Texas':'TX', 'Utah':'UT', 'Vermont':'VT', 'Virginia':'VA', 'Washington':'WA',
        'West Virginia':'WV', 'Wisconsin':'WI', 'Wyoming':'WY'}

In [52]:
US_demographic_data_by_state.STNAME_.replace(dict(state_mapping), inplace=True)
US_demographic_data_by_state.shape, US_demographic_data_by_state.STNAME_.unique(), US_Accidents_Feb16_to_June20_data_by_states.State.unique()

((232, 288), array(['CA'], dtype=object), array(['CA'], dtype=object))

#### US_Accidents 'County' and US_demographic 'CTYNAME'

In [53]:
US_demographic_data_by_state.CTYNAME_.value_counts().head(), US_Accidents_Feb16_to_June20_data_by_states.County.value_counts().head()

(San Mateo County     4
 San Benito County    4
 Kings County         4
 Napa County          4
 Modoc County         4
 Name: CTYNAME_, dtype: int64,
 Los Angeles    262283
 Alameda         53136
 Orange          50685
 San Diego       48617
 Sacramento      45272
 Name: County, dtype: int64)

In [53]:
# Remove extra whitespace in both the datasets

In [54]:
# Replace whitespace with ""
US_demographic_data_by_state = US_demographic_data_by_state.assign(CTYNAME_= US_demographic_data_by_state['CTYNAME_'].str.replace(" ",""))

In [55]:
# Replace whitespace with ""
US_Accidents_Feb16_to_June20_data_by_states = US_Accidents_Feb16_to_June20_data_by_states.assign(County= US_Accidents_Feb16_to_June20_data_by_states['County'].str.replace(" ",""))

In [56]:
# Replace County with ""
US_demographic_data_by_state = US_demographic_data_by_state.assign(CTYNAME_= US_demographic_data_by_state['CTYNAME_'].str.replace("County",""))

In [57]:
US_demographic_data_by_state.dropna()
US_demographic_data_by_state.shape

(232, 288)

In [58]:
# # Creating backup files 
# US_demographic_data_by_state.to_csv('US_demographic_data_KS_optimized.csv',index=False)
# US_Accidents_Feb16_to_June20_data_by_states.to_csv('US_Accidents_Feb16_to_June20_data_by_states_KS_optimized.csv',index=False)


# Optimized datasets after feature extraction and cleaning

In [59]:
# US_Accidents_Feb16_to_June20_data_by_states = pd.read_csv('US_Accidents_Feb16_to_June20_data_by_states.csv')
# US_Accidents_Feb16_to_June20_data_by_states.shape

In [60]:
# US_demographic_data =pd.read_csv('US_demographic_data.csv')
# US_demographic_data.shape

In [61]:
#############

### Merge the datasets US accidents and demographic data based on YEAR, STATE, COUNTY

In [62]:
%%time
US_Accidents_plus_Demographic_merged = US_Accidents_Feb16_to_June20_data_by_states.merge(US_demographic_data_by_state, how='left', left_on=["State","County","Start_Time_Year"], right_on=["STNAME_","CTYNAME_","YEAR_"])
US_Accidents_plus_Demographic_merged.shape

Wall time: 1.05 s


(790455, 306)

In [68]:
del(US_Accidents_Feb16_to_June20_data_by_states)
del(US_demographic_data)

In [64]:
# %%time
# # Creating backup files 
# US_Accidents_plus_Demographic_merged.to_csv('/content/drive/My Drive/Colab Notebooks/data/US_Accidents_plus_Demographic_merged.csv',index=False)

In [65]:
# US_Accidents_plus_Demographic_merged =pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/US_demographic_data.csv')
# US_Accidents_plus_Demographic_merged.shape

In [66]:
US_Accidents_plus_Demographic_merged.shape

(790455, 306)

In [67]:
# Droping duplicate features STNAME_, CTYNAME_, YEAR_
US_Accidents_plus_Demographic_merged.drop(['STNAME_', 'CTYNAME_', 'YEAR_','State'], axis=1, inplace=True)
US_Accidents_plus_Demographic_merged.shape

(790455, 302)

### Converting categorical values into numerical values using get_dummies 

In [69]:
# Categorical data types
US_Accidents_plus_Demographic_merged.select_dtypes(exclude=['int','float']).columns 

Index(['Severity', 'Side', 'County', 'Timezone', 'Weather_Condition',
       'Junction', 'Traffic_Signal', 'Start_Time_Year'],
      dtype='object')

In [70]:
US_Accidents_plus_Demographic_merged['Junction'].unique()

array([0, 1], dtype=int64)

In [71]:
margin = len(US_Accidents_plus_Demographic_merged)/ 100
margin

7904.55

In [72]:
# Get the count of each value
value_counts = US_Accidents_plus_Demographic_merged.Weather_Condition.value_counts()

# Select the values where the count is less than 1% of the data
to_remove = value_counts[value_counts <= margin].index

# Keep rows where the Weather_Condition column is not in to_remove
US_Accidents_plus_Demographic_merged = US_Accidents_plus_Demographic_merged[~US_Accidents_plus_Demographic_merged.Weather_Condition.isin(to_remove)]

In [73]:

Feature_list = ['Timezone','Weather_Condition']
US_Accidents_plus_Demographic_merged.shape

(771760, 302)

In [74]:
subset_dataframe_featurelist=[]
for variable in Feature_list:
    subset_dataframe_featurelist.append(pd.get_dummies(US_Accidents_plus_Demographic_merged[variable],prefix=variable, dummy_na=False))

In [75]:
merged_subset_dataframe_featurelist = pd.concat(subset_dataframe_featurelist, axis=1)
merged_subset_dataframe_featurelist.shape

(771760, 12)

In [76]:
%%time
US_Accidents_plus_Demographic_merged = pd.concat([merged_subset_dataframe_featurelist,US_Accidents_plus_Demographic_merged], axis=1)
US_Accidents_plus_Demographic_merged.shape

Wall time: 899 ms


(771760, 314)

In [77]:
# 'Side', 'State', 'Timezone','Weather_Condition', 
US_Accidents_plus_Demographic_merged.drop(['Side', 'Timezone','Weather_Condition'],axis=1, inplace=True)
US_Accidents_plus_Demographic_merged.shape

(771760, 311)

In [78]:
US_Accidents_plus_Demographic_merged.dropna(inplace = True)
US_Accidents_plus_Demographic_merged.shape

(629598, 311)

##### Severity as outcome binary values 0 or 1

In [79]:
US_Accidents_plus_Demographic_merged.Severity.unique()

array([3, 2, 1, 4], dtype=int64)

In [80]:
# Binary output
Severity_mapping = {1:0,2:0,3:1,4:1}
US_Accidents_plus_Demographic_merged.Severity.replace(dict(Severity_mapping), inplace=True)
US_Accidents_plus_Demographic_merged.shape

(629598, 311)

In [81]:
US_Accidents_plus_Demographic_merged.Severity.unique()

array([1, 0], dtype=int64)

In [82]:
# Split the training data into training and cross validation sets. 

In [83]:
# US_Accidents_plus_Demographic_merged.to_csv('US_Accidents_plus_Demographic_merged_without_na.csv',index=False)

In [84]:
y = pd.Series(US_Accidents_plus_Demographic_merged['Severity'])
y.shape

(629598,)

In [85]:
US_Accidents_plus_Demographic_merged.drop(['Severity'],axis=1, inplace=True)
US_Accidents_plus_Demographic_merged.shape

(629598, 310)

In [86]:
y_County = pd.Series(US_Accidents_plus_Demographic_merged.County)
y_County.shape

(629598,)

In [87]:
US_Accidents_plus_Demographic_merged.drop(['County'],axis=1, inplace=True)
US_Accidents_plus_Demographic_merged.shape

(629598, 309)

In [88]:
X= pd.DataFrame(US_Accidents_plus_Demographic_merged)
print(X.shape,y.shape)

(629598, 309) (629598,)


In [89]:
# Using train_test_split() to randomly split the given training dataset into training and test
# data by 70:30 percent ratio respectively. 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
print(X_train.shape,X_test.shape)

(440718, 309) (188880, 309)


In [90]:
y_test.value_counts()

0    126616
1     62264
Name: Severity, dtype: int64

###  Creating model using random forest

In [114]:
pipe = Pipeline([
    ('rf', RandomForestClassifier())
])

In [115]:
clfRandom = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [116]:
parameters = {
    'rf__n_estimators': [50,100],
    'rf__max_depth': [20,50],
    'rf__min_samples_split': [2,4],
}

In [108]:
parameters_2 = {
    'rf__n_estimators': [50,100,150],
    'rf__max_depth': [20,30,40,50],
    'rf__min_samples_split': [2,4,6],
}

In [117]:
# For state - California
clf_GS_Model_1 = GridSearchCV(pipe, parameters, scoring=make_scorer(f1_score))

In [119]:

%time clf_GS_Model_1.fit(X_train, y_train)

In [94]:
pd.DataFrame(clf_GS_Model_1.cv_results_).sort_values("rank_test_score").head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_depth,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,14.834043,0.139576,0.33182,0.0021,50,4,200,"{'rf__max_depth': 50, 'rf__min_samples_split':...",0.710208,0.702973,0.711693,0.715196,0.713496,0.710713,0.004218,1
5,22.323524,0.211414,0.499326,0.012138,50,4,300,"{'rf__max_depth': 50, 'rf__min_samples_split':...",0.708467,0.7066,0.707826,0.71409,0.714976,0.710392,0.003445,2
17,22.267755,0.119601,0.499916,0.008282,500,4,300,"{'rf__max_depth': 500, 'rf__min_samples_split'...",0.708094,0.705776,0.708954,0.710695,0.714909,0.709686,0.003055,3
15,7.373949,0.025912,0.17031,0.001025,500,4,100,"{'rf__max_depth': 500, 'rf__min_samples_split'...",0.704744,0.711543,0.706045,0.711305,0.713364,0.7094,0.003372,4
11,22.207189,0.177436,0.495126,0.004916,100,4,300,"{'rf__max_depth': 100, 'rf__min_samples_split'...",0.706151,0.706652,0.706449,0.713012,0.71188,0.708829,0.002979,5


##### Model- state - Virginia

In [111]:
clf_GS_Model_2 = GridSearchCV(pipe, parameters_2, scoring=make_scorer(f1_score))

In [113]:
# %time clf_GS_Model_2.fit(X_train, y_train)

In [None]:
# For state- Virginia
# Viewing The Best Parameters
print('Best n estimator:', clf_GS_Model_2.best_estimator_.get_params()['rf__n_estimators'])
print('Best max_depth:', clf_GS_Model_2.best_estimator_.get_params()['rf__max_depth'])
print('Best min_samples_split:', clf_GS_Model_2.best_estimator_.get_params()['rf__min_samples_split'])

In [None]:
y_train_pred = clf_GS_Model_2.best_estimator_.predict(X_train)
# y_train_pred = clfRandom.predict(X_train)

f1_score(y_train, y_train_pred)

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
accuracy_score(y_train, y_train_pred)

In [None]:
y_test_pred= clf_GS_Model_2.best_estimator_.predict(X_test)
# y_cross_validation_pred= clfRandom.predict(X_cross_validation)
f1_score(y_test, y_test_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
accuracy_score(y_train, y_train_pred)

In [103]:
# Fore state -Virginia
clf_GS_Model_2 = GridSearchCV(pipe, parameters_2, scoring=make_scorer(f1_score))

In [104]:
%time clf_GS_Model_2.fit(X_train, y_train)

Wall time: 34min 25s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [105]:
# For state- Virginia
# Viewing The Best Parameters
print('Best n estimator:', clf_GS_Model_2.best_estimator_.get_params()['rf__n_estimators'])
print('Best max_depth:', clf_GS_Model_2.best_estimator_.get_params()['rf__max_depth'])
print('Best min_samples_split:', clf_GS_Model_2.best_estimator_.get_params()['rf__min_samples_split'])

Best n estimator: 200
Best max_depth: 20
Best min_samples_split: 2


In [107]:
y_train_pred = clf_GS_Model_2.best_estimator_.predict(X_train)
# y_train_pred = clfRandom.predict(X_train)

f1_score(y_train, y_train_pred)

0.9515538447094168

In [97]:
y_train.unique()

array([1, 0], dtype=int64)

In [108]:
confusion_matrix(y_train, y_train_pred)

array([[14712,   408],
       [  624, 10135]], dtype=int64)

In [109]:

y_test_pred= clf_GS_Model_2.best_estimator_.predict(X_test)
# y_cross_validation_pred= clfRandom.predict(X_cross_validation)
f1_score(y_test, y_test_pred)



0.7179046801202235

In [101]:
confusion_matrix(y_test, y_test_pred)

array([[5078, 1447],
       [1237, 3330]], dtype=int64)

In [9]:
# Using train_test_split() to randomly split the given training dataset into training and cross validation 
# data by 70:30 percent ratio respectively. 
X_train, X_cross_validation, y_train, y_cross_validation = train_test_split(X,y, test_size=0.3, random_state=42)
print(X_train.shape,X_cross_validation.shape)

(636526, 312) (272797, 312)


In [10]:
del(X)
del(y)

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Used MinMaxScaler() function to further normalize features on the scale of (0 to 1)
scaler = MinMaxScaler()
scaler.fit(X_train)
# Normalized traning data set
normalized_train = scaler.transform(X_train)

In [13]:
del(X_train)

In [14]:
normalized_train.shape

(636526, 312)

In [15]:
normalized_cv = scaler.transform(X_cross_validation)
normalized_cv.shape

(272797, 312)

In [16]:
del(X_cross_validation)

In [17]:
from sklearn.svm import SVC

In [None]:
%%time
clf = SVC(gamma='auto')
clf.fit(normalized_train, y_train)

  y = column_or_1d(y, warn=True)


In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier

In [20]:
n_estimators = 10

In [None]:
%%time
clf = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear',probability=True,class_weight='balanced'), max_samples=1.0 / n_estimators, n_estimators=n_estimators))
clf.fit(normalized_train, y_train)

##### Model-  state - Oklahoma

In [97]:
parameters = {
    'rf__n_estimators': [50,100,150],
    'rf__max_depth': [20,30,40,50],
    'rf__min_samples_split': [2,4],
}

In [98]:

clf_GS_Model_3 = GridSearchCV(pipe, parameters, scoring=make_scorer(f1_score))

In [99]:

%time clf_GS_Model_3.fit(X_train, y_train)

Wall time: 16min 42s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [100]:
# For state- Oklahoma
# Viewing The Best Parameters
print('Best n estimator:', clf_GS_Model_3.best_estimator_.get_params()['rf__n_estimators'])
print('Best max_depth:', clf_GS_Model_3.best_estimator_.get_params()['rf__max_depth'])
print('Best min_samples_split:', clf_GS_Model_3.best_estimator_.get_params()['rf__min_samples_split'])

Best n estimator: 50
Best max_depth: 40
Best min_samples_split: 2


In [101]:
y_train_pred = clf_GS_Model_3.best_estimator_.predict(X_train)
# y_train_pred = clfRandom.predict(X_train)

f1_score(y_train, y_train_pred)

0.8976324305910407

In [102]:
confusion_matrix(y_train, y_train_pred)

array([[29967,   103],
       [  498,  2635]], dtype=int64)

In [103]:
accuracy_score(y_train, y_train_pred)

0.9818992259735566

In [104]:
y_test_pred= clf_GS_Model_3.best_estimator_.predict(X_test)
# y_cross_validation_pred= clfRandom.predict(X_cross_validation)
f1_score(y_test, y_test_pred)


0.23393574297188754

In [105]:
confusion_matrix(y_test, y_test_pred)

array([[12471,   370],
       [ 1156,   233]], dtype=int64)

In [106]:
accuracy_score(y_test, y_test_pred)

0.8927617709065355

##### Model- state - Kansas

In [92]:
parameters = {
    'rf__n_estimators': [50,100,150],
    'rf__max_depth': [10,20,30,40],
    'rf__min_samples_split': [2,4],
}

In [93]:

clf_GS_Model_4 = GridSearchCV(pipe, parameters, scoring=make_scorer(f1_score))

In [94]:

%time clf_GS_Model_4.fit(X_train, y_train)

Wall time: 1min 3s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [95]:
# For state- Kansas
# Viewing The Best Parameters
print('Best n estimator:', clf_GS_Model_4.best_estimator_.get_params()['rf__n_estimators'])
print('Best max_depth:', clf_GS_Model_4.best_estimator_.get_params()['rf__max_depth'])
print('Best min_samples_split:', clf_GS_Model_4.best_estimator_.get_params()['rf__min_samples_split'])

Best n estimator: 150
Best max_depth: 10
Best min_samples_split: 4


In [96]:
y_train_pred = clf_GS_Model_4.best_estimator_.predict(X_train)

f1_score(y_train, y_train_pred)

0.8021534320323015

In [97]:
confusion_matrix(y_train, y_train_pred)

array([[1068,  398],
       [ 190, 1192]], dtype=int64)

In [98]:
accuracy_score(y_train, y_train_pred)

0.7935393258426966

In [99]:
y_test_pred= clf_GS_Model_4.best_estimator_.predict(X_test)
# y_cross_validation_pred= clfRandom.predict(X_cross_validation)
f1_score(y_test, y_test_pred)


0.7247915087187263

In [100]:
confusion_matrix(y_test, y_test_pred)

array([[380, 222],
       [141, 478]], dtype=int64)

In [101]:
accuracy_score(y_test, y_test_pred)

0.7027027027027027

##### Model- 5 state - Missouri

In [182]:
parameters = {
    'rf__n_estimators': [50,100,150],
    'rf__max_depth': [20,30,40,50],
    'rf__min_samples_split': [2,4],
}

In [183]:

clf_GS_Model_5 = GridSearchCV(pipe, parameters, scoring=make_scorer(f1_score))

In [184]:

%time clf_GS_Model_5.fit(X_train, y_train)

Wall time: 5min 7s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [185]:
# For state- Kansas
# Viewing The Best Parameters
print('Best n estimator:', clf_GS_Model_5.best_estimator_.get_params()['rf__n_estimators'])
print('Best max_depth:', clf_GS_Model_5.best_estimator_.get_params()['rf__max_depth'])
print('Best min_samples_split:', clf_GS_Model_5.best_estimator_.get_params()['rf__min_samples_split'])

Best n estimator: 150
Best max_depth: 20
Best min_samples_split: 4


In [186]:
y_train_pred = clf_GS_Model_5.best_estimator_.predict(X_train)

f1_score(y_train, y_train_pred)

0.9376755800645095

In [187]:
confusion_matrix(y_train, y_train_pred)

array([[5133,  310],
       [ 289, 4506]], dtype=int64)

In [188]:
accuracy_score(y_train, y_train_pred)

0.9414924789998046

In [189]:
y_test_pred= clf_GS_Model_5.best_estimator_.predict(X_test)
# y_cross_validation_pred= clfRandom.predict(X_cross_validation)
f1_score(y_test, y_test_pred)

0.7131967603620771

In [190]:
confusion_matrix(y_test, y_test_pred)

array([[1687,  619],
       [ 585, 1497]], dtype=int64)

In [191]:
accuracy_score(y_test, y_test_pred)

0.7256153144940748

### Using logistic regression

In [101]:
# Used MinMaxScaler() function to further normalize features on the scale of (0 to 1)
scaler = MinMaxScaler()
scaler.fit(X_train)
# Normalized traning data set
normalized_train = scaler.transform(X_train)

In [102]:
normalized_train.shape

(440718, 309)

In [103]:
normalized_test = scaler.transform(X_test)
normalized_test.shape

(188880, 309)

In [104]:
model_Logistic_CA = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [105]:
y_train_pred = model_Logistic_CA.predict(normalized_train)

In [106]:
confusion_matrix(y_train, y_train_pred)

array([[268616,  26557],
       [108321,  37224]], dtype=int64)

In [107]:
f1_score(y_train, y_train_pred)

0.3556557713805261

In [108]:
accuracy_score(y_train, y_train_pred)

0.6939584950013388

In [110]:
y_test_pred = model_Logistic_CA.predict(normalized_test)

In [111]:
confusion_matrix(y_test, y_test_pred)

array([[115297,  11319],
       [ 46363,  15901]], dtype=int64)

In [112]:
f1_score(y_test, y_test_pred)

0.3553931429082293

In [113]:
accuracy_score(y_test, y_test_pred)

0.6946103346039814

### Model state - Virginia

In [97]:

model_Logistic_VA = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [98]:
y_train_pred = model_Logistic_VA.predict(normalized_train)

In [99]:
confusion_matrix(y_train, y_train_pred)

array([[11577,  3543],
       [ 3601,  7158]], dtype=int64)

In [100]:
f1_score(y_train, y_train_pred)

0.6671015843429637

In [101]:
accuracy_score(y_train, y_train_pred)

0.7239460566482476

In [103]:
y_test_pred = model_Logistic_VA.predict(normalized_test)

In [104]:
confusion_matrix(y_test, y_test_pred)

array([[4952, 1573],
       [1535, 3032]], dtype=int64)

In [105]:
f1_score(y_test, y_test_pred)

0.6611426079372001

In [106]:
accuracy_score(y_test, y_test_pred)

0.719798052650559

### Model- Logistic Regression state - Kansas

In [98]:

model_Logistic = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [99]:
y_train_pred = model_Logistic.predict(normalized_train)

In [100]:
confusion_matrix(y_train, y_train_pred)

array([[852, 614],
       [401, 981]], dtype=int64)

In [101]:
f1_score(y_train, y_train_pred)

0.6590527376553578

In [102]:
accuracy_score(y_train, y_train_pred)

0.6436095505617978

In [103]:
y_test_pred = model_Logistic.predict(normalized_test)

In [104]:
confusion_matrix(y_test, y_test_pred)

array([[327, 275],
       [177, 442]], dtype=int64)

In [105]:
f1_score(y_test, y_test_pred)

0.6616766467065868

In [106]:
accuracy_score(y_test, y_test_pred)

0.6298116298116299

#### Model - Logistic Regression-  state - Missouri

In [298]:

model_Logistic_MO = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [299]:
y_train_pred = model_Logistic_MO.predict(normalized_train)

In [303]:
# confusion_matrix(y_train, y_train_pred)

In [302]:
# f1_score(y_train, y_train_pred)

In [200]:
accuracy_score(y_train, y_train_pred)

0.6799179527251417

In [202]:
y_test_pred = model_Logistic_MO.predict(normalized_test)

In [203]:
confusion_matrix(y_test, y_test_pred)

array([[1520,  786],
       [ 693, 1389]], dtype=int64)

In [205]:
f1_score(y_test, y_test_pred)

0.6525722339675829

In [204]:
accuracy_score(y_test, y_test_pred)

0.6629443938012762

##### Model Logistic Regression -  state - Okalhoma

In [304]:

model_Logistic_OK = LogisticRegression(solver='liblinear', random_state=0).fit(normalized_train, y_train)

In [305]:
y_train_pred = model_Logistic_MO.predict(normalized_train)

In [306]:
confusion_matrix(y_train, y_train_pred)

array([[30014,    56],
       [ 2992,   141]], dtype=int64)

In [307]:
f1_score(y_train, y_train_pred)

0.0846846846846847

In [308]:
accuracy_score(y_train, y_train_pred)

0.9082010661687197

In [309]:
y_test_pred = model_Logistic_MO.predict(normalized_test)

In [310]:
confusion_matrix(y_test, y_test_pred)

array([[12820,    21],
       [ 1328,    61]], dtype=int64)

In [311]:
f1_score(y_test, y_test_pred)

0.08293677770224336

In [312]:
accuracy_score(y_test, y_test_pred)

0.9052002810962755

### Creating model using decision tree

In [91]:
model_DT = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=40,
            max_features=100, max_leaf_nodes=16,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

#### Model - Decision Tree-  state - california

In [92]:

clf_DT = model_DT.fit(X_train, y_train)



In [93]:

y_train_pred = clf_DT.predict(X_train)

In [94]:
confusion_matrix(y_train, y_train_pred)

array([[247627,  47546],
       [ 74430,  71115]], dtype=int64)

In [95]:
f1_score(y_train, y_train_pred)

0.5383299395168922

In [96]:
accuracy_score(y_train, y_train_pred)

0.7232334508688095

In [97]:
y_test_pred = clf_DT.predict(X_test)

In [98]:
f1_score(y_test, y_test_pred)

0.5402940315325306

In [99]:
accuracy_score(y_test, y_test_pred)

0.7256829733163913

In [100]:
confusion_matrix(y_test, y_test_pred)

array([[106619,  19997],
       [ 31816,  30448]], dtype=int64)

#### Model - Decision Tree -  state - Virginia

In [84]:

clf_DT_VA = model_DT.fit(X_train, y_train)



In [85]:

y_train_pred = clf_DT_VA.predict(X_train)

In [86]:
confusion_matrix(y_train, y_train_pred)

array([[10784,  4336],
       [ 2474,  8285]], dtype=int64)

In [87]:
f1_score(y_train, y_train_pred)

0.7087254063301968

In [88]:
accuracy_score(y_train, y_train_pred)

0.7368522740445922

In [90]:
y_test_pred = clf_DT_VA.predict(X_test)

In [91]:
f1_score(y_test, y_test_pred)

0.7040529695024077

In [92]:
accuracy_score(y_test, y_test_pred)

0.7340425531914894

In [93]:
confusion_matrix(y_test, y_test_pred)

array([[4633, 1892],
       [1058, 3509]], dtype=int64)

#### Model -  Decision Tree - state - Oklahoma

In [285]:
clf_DT = model_DT.fit(X_train, y_train)



In [286]:

y_train_pred = clf_DT.predict(X_train)

In [287]:
confusion_matrix(y_train, y_train_pred)

array([[29834,   236],
       [ 2732,   401]], dtype=int64)

In [288]:
f1_score(y_train, y_train_pred)

0.21273209549071617

In [289]:
accuracy_score(y_train, y_train_pred)

0.9106104870041863

In [290]:
y_test_pred = clf_DT.predict(X_test)

In [291]:
f1_score(y_test, y_test_pred)

0.1984365604329525

In [292]:
accuracy_score(y_test, y_test_pred)

0.9063246661981729

In [293]:
confusion_matrix(y_test, y_test_pred)

array([[12732,   109],
       [ 1224,   165]], dtype=int64)

#### Model - Decision Tree state - Kansas

In [85]:

clf_DT = model_DT.fit(X_train, y_train)



In [87]:

y_train_pred = clf_DT.predict(X_train)

In [88]:
confusion_matrix(y_train, y_train_pred)

array([[ 796,  670],
       [ 325, 1057]], dtype=int64)

In [89]:
f1_score(y_train, y_train_pred)

0.6799614023801865

In [90]:
accuracy_score(y_train, y_train_pred)

0.6506320224719101

In [91]:
y_test_pred = clf_DT.predict(X_test)

In [92]:
f1_score(y_test, y_test_pred)

0.6821144098479363

In [93]:
accuracy_score(y_test, y_test_pred)

0.6404586404586404

In [94]:
confusion_matrix(y_test, y_test_pred)

array([[311, 291],
       [148, 471]], dtype=int64)

#### Model  Decision Tree- state - Missouri

In [184]:

clf_DT = model_DT.fit(X_train, y_train)



In [185]:

y_train_pred = clf_DT.predict(X_train)

In [186]:
confusion_matrix(y_train, y_train_pred)

array([[3439, 2004],
       [1001, 3794]], dtype=int64)

In [187]:
f1_score(y_train, y_train_pred)

0.7163220994996695

In [188]:
accuracy_score(y_train, y_train_pred)

0.7064856417268998

In [189]:
y_test_pred = clf_DT.predict(X_test)

In [190]:
f1_score(y_test, y_test_pred)

0.7091858946451893

In [191]:
accuracy_score(y_test, y_test_pred)

0.6955332725615314

In [192]:
confusion_matrix(y_test, y_test_pred)

array([[1423,  883],
       [ 453, 1629]], dtype=int64)