In [1]:
# env pre-setting
import os
import conda

conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib

In [2]:
#importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [3]:
import folium
from folium import plugins
from mpl_toolkits.basemap import Basemap

ModuleNotFoundError: No module named 'mpl_toolkits.basemap'

In [None]:
# select michigan data
# sava it as MI_data.csv
# data = pd.read_csv("/Users/weijiepan/Desktop/US_Accidents_Dec19.csv")
# MI_data = data[data['State'] == 'MI'] 
# MI_data.to_csv("/Users/weijiepan/Desktop/MI_data.csv")

In [None]:
data = pd.read_csv("/Users/weijiepan/Desktop/MI_data.csv")

In [None]:
# pre-process time 
data['Start_Time'] = pd.to_datetime(data['Start_Time'])
data['End_Time'] = pd.to_datetime(data['End_Time'])
data['Year'] = data['Start_Time'].dt.year
data['Hour'] = data['Start_Time'].dt.hour
data['Day'] = data['Start_Time'].dt.dayofweek
data['DayName'] = data['Start_Time'].dt.weekday_name
data['Month'] = data['Start_Time'].dt.month

# two in 2020 (del them)
data = data[(data['Year']>2015) & (data['Year']<2020)]

In [None]:
# location
heat_map_data = [[i,j,k] for i,j,k in zip(data.Start_Lat,data.Start_Lng,data.Severity)]

# michigan state location
latitude = 43.182205
longitude = -84.506836
# Add incidents to map
mi_map = folium.Map(location=[latitude , longitude], zoom_start=7)

folium.plugins.HeatMap(heat_map_data,min_opacity = 0.1).add_to(mi_map)
mi_map 
# save the heatmap
# mi_map.save(outfile= "map.html")

In [None]:
# time 
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 8))
fig.tight_layout(pad=3)

hours = data.groupby('Hour').size()
ax1.bar(hours.index, hours)
ax1.set_xticks(hours.index)
ax1.set_xlabel('Factor (Hour)')

day_names = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
days = data.groupby('Day').size()
ax2.bar(days.index, days)
ax2.set_xticks(days.index)
ax2.set_xticklabels([day_names[i] for i in days.index])
ax2.set_xlabel('Factor (Day)')

month_names = ["Jan","Feb","Mar","Apr","May",
              "Jun","Jul","Aug","Sep","Oct",
              "Nov","Dec"]
months = data.groupby('Month').size()
ax3.bar(months.index, months)
ax3.set_xticks(months.index)
ax3.set_xticklabels([month_names[i-1] for i in months.index])
ax3.set_xlabel('Factor (Month)')

years = data.groupby('Year').size()
ax4.bar(years.index, years)
ax4.set_xticks(years.index)
ax4.set_xlabel('Factor (Year)')

In [None]:
plt.figure(figsize=(14, 8))
sns.countplot(y='Weather_Condition', data=data, order=data['Weather_Condition'].value_counts().iloc[:10].index)

In [None]:
data.columns

In [None]:
# select vars
var = ['Start_Lat','Start_Lng',
        'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Sunrise_Sunset','Hour','Severity']

con_var = ['Start_Lat','Start_Lng',
        'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)']
subdata = data[var]

# standardize
subdata[con_var] = (subdata[con_var]-subdata[con_var].mean())/subdata[con_var].std()

# SUNRISE _ SUNSET
# day = 1 && night = 0
subdata['Sunrise_Sunset'][subdata['Sunrise_Sunset'] == "Day"] = 1
subdata['Sunrise_Sunset'][subdata['Sunrise_Sunset'] == "Night"] = 0

# drop na
subdata.dropna(axis=0,how='any',inplace=True)

# find weather top 10 
weather_index = list(data['Weather_Condition'].value_counts()[:10].index)
weather_index

# select top 10 weather
subdata = subdata[subdata['Weather_Condition'].isin(weather_index)]

# one - hot encode weather 
subdata=pd.concat([pd.get_dummies(subdata["Weather_Condition"]),subdata],axis=1)

# drop Weather and save
subdata = subdata.drop(columns=["Weather_Condition"])
#subdata.to_csv("/Users/weijiepan/Desktop/MI_data_processed.csv")

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    subdata.drop(columns = ['Severity']), subdata['Severity'], test_size=0.1, random_state=42)

In [None]:
# IMP OF FEATURE
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X_train,y_train)
print(model.feature_importances_) 
#use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
#build the model
from sklearn.metrics import accuracy_score 
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV

model = LogisticRegressionCV(cv=5,class_weight={2:.08,3:.13,4:.79},max_iter=1000)
model.fit(X_train,y_train)

In [None]:
sum(model.predict(X_test)==y_test)/len(y_test)

In [None]:
clf = RandomForestClassifier(max_depth=8, criterion='entropy',random_state=1,class_weight={2:.08,3:.13,4:.79})
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from keras.models import Sequential

from keras.layers import Dense, Conv1D, Flatten
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=31, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='softmax'))

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#data
y_train_nn=pd.get_dummies(y_train)
y_test_nn=pd.get_dummies(y_test)
model.fit(X_train, y_nn, validation_data=(X_test, y_test_nn), epochs=10,class_weight={0:.0,1:.08,2:.13,3:.79})