# Monroe County Revisited

This started as an exercise at Lighthouse Labs; I wanted to come back to it and expand on it a little.

In [None]:
import pandas as pd
import requests as re
import numpy as np
import os
from IPython.display import JSON
import time
from geopy import distance
#import json

## Data Cleaning

The data can be found [here](https://drive.google.com/file/d/1_KF9oIJV8cB8i3ngA4JPOLWIE_ETE6CJ/view?usp=sharing).


In [None]:
# Load the raw crash records.
accidents_df = pd.read_csv(
    "monroe-county-crash-data2003-to-2015.csv",
    encoding="unicode_escape",
)

# Keep only the rows that carry both coordinates.
accidents_df = accidents_df.dropna(subset=['Latitude', 'Longitude'])

# Build a single "lat,lon" text column; it is reused later for distance lookups.
lat_text = accidents_df['Latitude'].astype(str)
lon_text = accidents_df['Longitude'].astype(str)
accidents_df['ll'] = lat_text + ',' + lon_text

# Discard rows whose position is the 0,0 placeholder.
has_real_position = accidents_df['ll'] != '0.0,0.0'
accidents_df = accidents_df[has_real_position]

# Column names: spaces become underscores.
accidents_df.columns = [name.replace(" ", "_") for name in accidents_df.columns]
print(accidents_df.shape)
accidents_df.head()

In [None]:
# Assemble a calendar date from the Year / Month / Day columns.
# NOTE(review): this treats 'Day' as day-of-month. If the source data stores
# day-of-week there instead, every derived date is wrong - confirm against the CSV.
date_parts = accidents_df[['Year', 'Month', 'Day']]
accidents_df['Date'] = pd.to_datetime(date_parts)

In [None]:
# Derive the weekend flag from Date (the original 'Weekend?' column has gaps).
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
weekday_number = accidents_df['Date'].dt.dayofweek
accidents_df['Weekend'] = weekday_number >= 5

In [None]:
# Drop every row that is missing Hour, Collision_Type or Primary_Factor.
required_columns = ['Hour', 'Collision_Type', 'Primary_Factor']
accidents_df.dropna(subset=required_columns, inplace=True)

In [None]:
# Hour arrives as float; store it as an integer instead.
hour_as_int = accidents_df['Hour'].astype(int)
accidents_df['Hour'] = hour_as_int

In [None]:
# Combine Date with Hour into one Timestamp column.
# Hour is left-padded with zeros to four characters and parsed as %H%M.
date_text = accidents_df['Date'].astype(str)
hhmm_text = accidents_df['Hour'].astype(str).str.zfill(4)
accidents_df['Timestamp'] = pd.to_datetime(date_text + hhmm_text, format='%Y-%m-%d%H%M')

In [None]:
# add feature night to indicate if it was dark (18:00 or later, or up to 06:59)
# 'Hour' is HHMM-style: the Timestamp cell zero-pads it to four digits and parses
# it with '%H%M'. Comparing the raw value with 18 / 6 therefore flags almost every
# row as night, so use the hour of day from the parsed Timestamp instead.
hour_of_day = accidents_df['Timestamp'].dt.hour
accidents_df['Night'] = (hour_of_day >= 18) | (hour_of_day <= 6)

In [None]:
# Narrow the frame to the columns used from here on.
kept_columns = [
    'Timestamp',
    'Weekend',
    'Night',
    'Collision_Type',
    'Injury_Type',
    'Primary_Factor',
    'll',
]
accidents_df = accidents_df[kept_columns]

In [None]:
# show Primary_Factor types
accidents_df['Primary_Factor'].value_counts()

In [None]:
# Check NaN
accidents_df.isnull().sum()

In [None]:
accidents_df.head()

# Foursquare API

Foursquare API documentation is [here](https://developer.foursquare.com/)

1. Start a foursquare application and get your keys.
2. Create the function **get_venues**, which pulls the bars within a 5 km radius of a given crash location.

#### example
`get_venues('48.146394, 17.107969')`

3. Find a relationship (if there is any) between number of bars in the area and severity of the crash.

HINTs: 
- check out python package "foursquare" (no need to send HTTP requests directly with library `requests`)
- **categoryId** for bars and nightlife needs to be found in the [foursquare API documentation](https://developer.foursquare.com/docs/api-reference/venues/search/)

In [None]:
# Read the Foursquare credentials from the environment.
# A missing variable raises KeyError here rather than failing later in a request.
env = os.environ
foursquare_id = env["FS_CLIENT_ID"]
foursquare_secret = env["FS_CL_SECRET"]
foursquare_api = env["FS_API_KEY"]

In [None]:
# Shared settings for the Foursquare places-search requests.
headers = {"Accept": "application/json", "Authorization": foursquare_api}
url = "https://api.foursquare.com/v3/places/search"

# Query-string fragments and a reference point for the search area.
radius = "&radius=10000"
limit = "&limit=50"
citycenter = "39.1676747,-86.5314594"

In [None]:
def fs_get_rect(northeast, southwest, limit=50):
    """Search Foursquare for bars inside one rectangle.

    Args:
        northeast: "lat,lon" string for the rectangle's north-east corner.
        southwest: "lat,lon" string for the rectangle's south-west corner.
        limit: Maximum number of places requested from the API. Without an
            explicit value the service applies its own, smaller default.

    Returns:
        A DataFrame with columns ``name``, ``lat`` and ``lng`` (empty when the
        rectangle has no matches), or ``None`` when the response status is not
        200. In that case the status code is printed.

    Note:
        ``re`` is this notebook's alias for ``requests``, not the stdlib regex
        module; ``url`` and ``headers`` are the module-level request settings.
    """
    params = {
        "query": "bar",
        "ne": northeast,
        "sw": southwest,
        "limit": limit,
    }
    response = re.request("GET", url, params=params, headers=headers)
    if response.status_code != 200:
        print("Error:", response.status_code)
        return None

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for place in response.json()['results']:
        main_geocode = place['geocodes']['main']
        rows.append({
            'name': place['name'],
            'lat': main_geocode['latitude'],
            'lng': main_geocode['longitude'],
        })
    return pd.DataFrame(rows, columns=['name', 'lat', 'lng'])

In [None]:
# test fs_get_rect
resulttest = fs_get_rect("39.2525,-86.3656", "39.2400,-86.4656")

In [None]:
resulttest

In [None]:
# get all bar locations from FS API using rectangular boundary
def get_venues_loop(start_lat, start_lon, end_lat, end_lon,
                    lat_step=0.0292, lon_step=0.0155, pause_seconds=5):
    """Sweep a bounding box tile by tile and gather every bar found.

    The sweep starts at the top-right corner (``start_lat``, ``start_lon``) and
    moves towards the bottom-left corner (``end_lat``, ``end_lon``), querying
    one ``lat_step`` x ``lon_step`` rectangle at a time through ``fs_get_rect``.

    Args:
        start_lat, start_lon: Top-right corner of the area (decimal degrees).
        end_lat, end_lon: Bottom-left corner of the area (decimal degrees).
        lat_step, lon_step: Tile height and width in degrees; must be positive.
        pause_seconds: Delay after each request, to stay below the rate limit.

    Returns:
        A DataFrame with columns ``name``, ``lat`` and ``lng`` holding the
        results of all tiles. Bars found by more than one tile are repeated.

    Raises:
        ValueError: If a step is not positive (the sweep would never end).
    """
    if lat_step <= 0 or lon_step <= 0:
        raise ValueError("lat_step and lon_step must be positive")

    tiles = []
    lat_point = start_lat
    while lat_point > end_lat:
        lon_point = start_lon
        while lon_point > end_lon:
            northeast = f"{lat_point},{lon_point}"
            southwest = f"{lat_point - lat_step},{lon_point - lon_step}"
            tile = fs_get_rect(northeast, southwest)
            # Failed requests return None; empty tiles add nothing.
            if tile is not None and not tile.empty:
                tiles.append(tile)
            lon_point -= lon_step
            time.sleep(pause_seconds)
        lat_point -= lat_step

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    if not tiles:
        return pd.DataFrame(columns=['name', 'lat', 'lng'])
    return pd.concat(tiles, ignore_index=True)
		

In [None]:
# The start point is roughly the top-right corner of the area where the
# accidents are; the end point is the bottom-left corner.
start_lat, start_lon = 39.3525, -86.3656
end_lat, end_lon = 39.0425, -86.7067

In [None]:
bars = get_venues_loop(start_lat, start_lon, end_lat, end_lon)

In [None]:
bars

In [None]:
# Drop rows that are exact repeats of an earlier row.
bars = bars.drop_duplicates()

In [None]:
# Build the same "lat,lon" text key for bars that the accidents frame uses.
bar_lat_text = bars['lat'].astype(str)
bar_lng_text = bars['lng'].astype(str)
bars['ll'] = bar_lat_text + ',' + bar_lng_text

In [None]:
bars.to_csv('bars.csv', index=False)

In [None]:
bars = pd.read_csv('bars.csv')

In [None]:
bars


In [None]:
# Create the bar-proximity columns up front, filled with NaN.
for new_column in ('closest_bar', 'number_of_bars_1km', 'number_of_bars_3km'):
    accidents_df[new_column] = np.nan

In [None]:
# For every accident, record the distance (km) to the nearest bar and how many
# bars lie closer than 1 km and closer than 3 km.

# Upper bound used as the "nearest bar" value when no bar is closer than this.
NO_BAR_DISTANCE = 9000

bar_positions = bars['ll'].tolist()

for acc_index, accident in accidents_df.iterrows():
    accident_position = accident['ll']
    distances_km = [
        distance.distance(accident_position, bar_position).kilometers
        for bar_position in bar_positions
    ]

    accidents_df.loc[acc_index, 'closest_bar'] = min([NO_BAR_DISTANCE, *distances_km])
    accidents_df.loc[acc_index, 'number_of_bars_1km'] = sum(d < 1 for d in distances_km)
    accidents_df.loc[acc_index, 'number_of_bars_3km'] = sum(d < 3 for d in distances_km)

In [None]:
accidents_df

### Mesonet ASOS weather data

https://mesonet.agron.iastate.edu/ASOS

In [None]:
weather_df = pd.read_csv("BMG.csv")

In [None]:
# Give the weather columns descriptive names.
weather_column_names = {
    'valid': 'Time',
    'sknt': 'Wind_Speed',
    'p01i': 'Precipitation',
    'vsby': 'Visibility',
    'gust': 'Wind_Gust',
    'wxcodes': 'Weather Codes',
    'ice_accretion_1hr': 'Ice_Accretion',
}
weather_df.rename(columns=weather_column_names, inplace=True)

In [None]:
# make timestamp out of 'Time' in weather_df
weather_df['Time'] = pd.to_datetime(weather_df['Time'])

Weather phenomena:
RA Rain SN Snow SG Snow Grains
DZ Drizzle IC Ice Crystals PL Ice pellets (sleet)
GS Small hail GR Hail UP Unknown precipitation
Obscurations to visibility:
BR Mist (>=5/8 mi) FG Fog (< 5/8 mi)
FU Smoke VA Volcanic Ash
SA Sand HZ Haze
PY Spray DU Widespread Dust
Other:
SQ Squall (strong wind) SS Sandstorm
DS Duststorm PO Dust/sand whirls
FC Funnel Cloud FC+ Tornado/waterspout
Qualifiers (for RA, DZ, SN, PL):
`-` Light
(No sign) Moderate
`+` Heavy
`VC` Vicinity
Examples:
+RA Heavy Rain
-DZ Light Drizzle
SN Moderate Snow
VCTS Thunderstorm in the vicinity (5-10 mi from observation)
Other Descriptors:
MI Shallow BC Patches PR Partial
TS Thunderstorm BL Blowing SH Showers
DR Drifting FZ Freezing
Examples:
BCFG Patchy fog
+TSRA Thunderstorm with heavy rain
BLSN Blowing snow
SHRA Moderate rain showers
TSRAGR Thunderstorm with moderate rain and hail

In [None]:
weather_df

In [None]:
# Attach weather data to each accident, using the observation whose time is
# closest to the accident's timestamp.
weather_columns = [
    'Wind_Speed',
    'Precipitation',
    'Visibility',
    'Wind_Gust',
    'Weather Codes',
    'Ice_Accretion',
]

for acc_index, accident in accidents_df.iterrows():
    # Position of the smallest absolute time gap. argmin is a single pass,
    # whereas sorting every gap just to take the first one is not.
    time_gap = (weather_df['Time'] - accident['Timestamp']).abs()
    nearest_observation = weather_df.iloc[time_gap.argmin()]

    # Copy the weather readings onto the accident row.
    for column in weather_columns:
        accidents_df.loc[acc_index, column] = nearest_observation[column]
    
    

In [None]:
accidents_df

In [None]:
accidents_df.to_csv("accidents.csv", index=False)

In [None]:
accidents_df = pd.read_csv("accidents.csv")

# Data Preparation

In [None]:
# Turn Injury_Type into an ordinal code, 0 through 3.
# Labels that are not listed here become NaN.
injury_severity = {
    'No injury/unknown': 0,
    'Non-incapacitating': 1,
    'Incapacitating': 2,
    'Fatal': 3,
}
accidents_df['Injury_Type'] = accidents_df['Injury_Type'].map(injury_severity)


In [None]:
accidents_df['Weather Codes'].unique()

In [None]:
# some weather codes are combined. For training purposes we will split them using regex
# The pattern matches any run of four consecutive word characters
# (letters, digits or underscore).
four_letters_regex = r"\w{4}"

In [None]:
# For each 'Weather Codes' entry, split four-letter codes into two two-letter codes.
# regex=True is required: a callable replacement only works in regex mode, and
# pandas 2.0 changed the default of Series.str.replace to regex=False.
# NOTE(review): a six-letter code such as TSRAGR only has its first four letters
# split ("TS RAGR") - confirm whether that is acceptable.
accidents_df['Weather Codes'] = accidents_df['Weather Codes'].str.replace(
    four_letters_regex,
    lambda m: m.group(0)[:2] + ' ' + m.group(0)[2:],
    regex=True,
)

In [None]:
# Break each entry into a list of codes, splitting on single spaces.
code_text = accidents_df['Weather Codes']
accidents_df['Weather Codes'] = code_text.str.split(' ')

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Multi-hot encode the weather code lists: one 0/1 column per distinct code.
mlb = MultiLabelBinarizer()
encoded_matrix = mlb.fit_transform(accidents_df['Weather Codes'])
weather_codes_encoded = pd.DataFrame(
    encoded_matrix,
    columns=mlb.classes_,
    index=accidents_df.index,
)

In [None]:
weather_codes_encoded

In [None]:
# Put the encoded code columns next to the accident data, then remove the
# list-valued source column.
accidents_df = pd.concat([accidents_df, weather_codes_encoded], axis=1).drop(
    columns=['Weather Codes']
)

In [None]:
# Expand Collision_Type into one indicator column per category.
collision_columns = ['Collision_Type']
accidents_df = pd.get_dummies(data=accidents_df, columns=collision_columns)

In [None]:
# Expand Primary_Factor into one indicator column per category.
factor_columns = ['Primary_Factor']
accidents_df = pd.get_dummies(data=accidents_df, columns=factor_columns)

In [None]:
# one-hot encode Weekend
# The cleaning step keeps the imputed flag under the name 'Weekend'. The original
# 'Weekend?' column is not among the selected columns, so requesting it here
# raises KeyError.
accidents_df = pd.get_dummies(accidents_df, columns=['Weekend'])

In [None]:
accidents_df.columns

In [None]:
accidents_df['Ice_Accretion'].value_counts()

In [None]:
# In 'Precipitation', replace M with 0 and T with 0.0005, then convert to numbers.
# (M / T are presumably the "missing" and "trace" markers - confirm against the
# weather data documentation.)
# astype(str) keeps this working when the CSV round-trip has already parsed the
# column as numeric, in which case the .str accessor would not be available.
precipitation_text = accidents_df['Precipitation'].astype(str)
precipitation_text = precipitation_text.str.replace('M', '0', regex=False)
precipitation_text = precipitation_text.str.replace('T', '0.0005', regex=False)
accidents_df['Precipitation'] = pd.to_numeric(precipitation_text)


In [None]:
# In 'Wind_Speed', replace M with 0, then convert to numbers.
# astype(str) keeps this working when the CSV round-trip has already parsed the
# column as numeric, in which case the .str accessor would not be available.
wind_speed_text = accidents_df['Wind_Speed'].astype(str)
wind_speed_text = wind_speed_text.str.replace('M', '0', regex=False)
accidents_df['Wind_Speed'] = pd.to_numeric(wind_speed_text)

In [None]:
# In 'Visibility', replace M with 0, then convert to numbers.
# astype(str) keeps this working when the CSV round-trip has already parsed the
# column as numeric, in which case the .str accessor would not be available.
# NOTE(review): 0 means "no visibility"; confirm that is the intended stand-in
# for an M entry.
visibility_text = accidents_df['Visibility'].astype(str)
visibility_text = visibility_text.str.replace('M', '0', regex=False)
accidents_df['Visibility'] = pd.to_numeric(visibility_text)

In [None]:
# In 'Wind_Gust', replace M with 0, then convert to numbers.
# astype(str) keeps this working when the CSV round-trip has already parsed the
# column as numeric, in which case the .str accessor would not be available.
wind_gust_text = accidents_df['Wind_Gust'].astype(str)
wind_gust_text = wind_gust_text.str.replace('M', '0', regex=False)
accidents_df['Wind_Gust'] = pd.to_numeric(wind_gust_text)

In [None]:
# In 'Ice_Accretion', replace M with 0 and T with 0.0005, then convert to numbers.
# astype(str) keeps this working when the CSV round-trip has already parsed the
# column as numeric, in which case the .str accessor would not be available.
ice_text = accidents_df['Ice_Accretion'].astype(str)
ice_text = ice_text.str.replace('M', '0', regex=False)
ice_text = ice_text.str.replace('T', '0.0005', regex=False)
accidents_df['Ice_Accretion'] = pd.to_numeric(ice_text)

In [None]:
# Remove the 'Unnamed: 0' column that appears when a frame is saved to CSV
# together with its index. The save in this notebook uses index=False, so the
# column is normally absent; errors='ignore' makes the drop a no-op then instead
# of raising KeyError.
accidents_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
accidents_df.columns

In [None]:
# Timestamp and the "lat,lon" key are not model inputs; remove them.
not_for_training = ['Timestamp', 'll']
accidents_df = accidents_df.drop(columns=not_for_training)

In [None]:
# Rescale the bar-proximity and weather measurements to the 0-1 range.
from sklearn.preprocessing import MinMaxScaler

numeric_features = [
    'number_of_bars_1km',
    'number_of_bars_3km',
    'closest_bar',
    'Wind_Speed',
    'Precipitation',
    'Visibility',
    'Wind_Gust',
    'Ice_Accretion',
]
scaler = MinMaxScaler()
accidents_df[numeric_features] = scaler.fit_transform(accidents_df[numeric_features])

In [None]:
accidents_df

## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Target is the encoded injury severity; every other column is a feature.
target_column = 'Injury_Type'
y = accidents_df[target_column]
X = accidents_df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# set up param grid for logistic regression
# The lbfgs solver does not support the l1 penalty, so a single grid that crosses
# every solver with every penalty contains combinations that can only fail.
# GridSearchCV also accepts a list of grids, which lets each solver be paired
# with just the penalties it supports.
_c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {
        'C': _c_values,
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
    {
        'C': _c_values,
        'penalty': ['l2'],
        'solver': ['lbfgs'],
    },
]


In [None]:
# set up gridsearch for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Use the same iteration budget as the final model built after the search, so
# candidates are scored under the conditions the chosen parameters will run with.
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, verbose=1)


In [None]:
# fit gridsearch
grid.fit(x_train, y_train)

In [None]:
# show best parameters
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
# Build the final logistic regression model.
# NOTE(review): these values are hard-coded, presumably copied from the grid
# search output - confirm they still match grid.best_params_ after a re-run.
best_logreg_params = {
    'C': 100,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'max_iter': 1000,
}
clf = LogisticRegression(**best_logreg_params)


In [None]:
clf.fit(x_train, y_train)

In [None]:
# ROC AUC
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Per-class probabilities for the held-out rows.
probs = clf.predict_proba(x_test)


## Random Forest