# Logistic Regression - Predicting Risks

I am interested in finding the best model. First, I will fit a model with the default settings. Next, I will use a randomized search to tune the parameters coarsely, followed by a grid search to narrow down the parameters I should use. Finally, I will compare this best model to the one produced by the default settings. I am going to try **logistic regression** and **random forest**.

In [None]:
import pickle

import numpy
import osmnx
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split  # to measure within-model accuracy
from sklearn.preprocessing import StandardScaler  # need to scale data for logistic regression

In [None]:
# loading data
# Both model tables are read back from pickles on the local Google Drive folder.
# NOTE(review): the robbery file name ends in ".pkl.pkl" while the pedestrian one ends in
# ".pkl" — confirm the doubled extension matches the file actually on disk.
# NOTE(review): pandas.read_pickle unpickles arbitrary objects; only load files you produced yourself.
robbery_model_data_dummies_bal = pandas.read_pickle('C:/Users/jodyn/Google Drive/Insight/Processed Data/robbery_model_data_dummies_bal.pkl.pkl')
pedestrian_model_data_dummies_bal = pandas.read_pickle('C:/Users/jodyn/Google Drive/Insight/Processed Data/pedestrian_model_data_dummies_bal.pkl')

In [None]:
# Labels are the values we want to predict: the 'Presence_Absence' column of each table.
robbery_labels = robbery_model_data_dummies_bal['Presence_Absence']
# Predictors are chosen by column position (columns 1-3 and 9-14).
# numpy.r_ concatenates the two slices into a single positional index array.
robbery_features = robbery_model_data_dummies_bal.iloc[:, numpy.r_[1:4, 9:15]]
# Take the names from the selected frame itself so they can never drift out of
# step with the columns that were actually selected.
robbery_model_names = list(robbery_features.columns)

pedestrian_labels = pedestrian_model_data_dummies_bal['Presence_Absence']
# Column 0, columns 5-9, and everything from column 12 to the last column.
pedestrian_column_count = pedestrian_model_data_dummies_bal.shape[1]
pedestrian_features = pedestrian_model_data_dummies_bal.iloc[
    :, numpy.r_[0:1, 5:10, 12:pedestrian_column_count]]
pedestrian_features_names = list(pedestrian_features.columns)

# Split each dataset into training and testing sets.
# Both splits share the same settings: 30% of the rows are held out for testing
# and the random seed is fixed so the split is reproducible.
split_settings = {'test_size': 0.3, 'random_state': 42}

(robbery_train,
 robbery_test,
 robbery_presence_train,
 robbery_presence_test) = train_test_split(
    robbery_features, robbery_labels, **split_settings)

(pedestrian_train,
 pedestrian_test,
 pedestrian_presence_train,
 pedestrian_presence_test) = train_test_split(
    pedestrian_features, pedestrian_labels, **split_settings)

# Scale data for logistic regression.
# Each scaler learns its centring/scaling statistics from the TRAINING rows only
# and then applies that same transformation to the held-out rows. Calling
# fit_transform on the test set would standardise the two splits differently and
# let test-set statistics influence the evaluation.
robbery_scaler = StandardScaler()
robbery_train_lr = pandas.DataFrame(robbery_scaler.fit_transform(robbery_train),
                                    columns=robbery_model_names)
robbery_test_lr = pandas.DataFrame(robbery_scaler.transform(robbery_test),
                                   columns=robbery_model_names)

pedestrian_scaler = StandardScaler()
pedestrian_train_lr = pandas.DataFrame(pedestrian_scaler.fit_transform(pedestrian_train),
                                       columns=pedestrian_features_names)
pedestrian_test_lr = pandas.DataFrame(pedestrian_scaler.transform(pedestrian_test),
                                      columns=pedestrian_features_names)

# `scaler` used to end this cell bound to the pedestrian scaler; keep the name
# available in case a later cell still refers to it.
scaler = pedestrian_scaler

# Time to build the base model: logistic regression with default settings,
# fitted on the scaled training features.
# The targets are class labels and are not scaled, so the label series that came
# out of train_test_split are used directly.
robbery_model_LR = LogisticRegression()
robbery_model_LR.fit(robbery_train_lr, robbery_presence_train)
robbery_model_prediction_LR = robbery_model_LR.predict(robbery_test_lr)

pedestrian_model_LR = LogisticRegression()
pedestrian_model_LR.fit(pedestrian_train_lr, pedestrian_presence_train)
pedestrian_model_prediction_LR = pedestrian_model_LR.predict(pedestrian_test_lr)

# How did these models perform?
# Print one report per model (robbery first, then pedestrian), comparing the
# held-out labels with the predictions made for the held-out rows.
for observed, predicted in (
        (robbery_presence_test, robbery_model_prediction_LR),
        (pedestrian_presence_test, pedestrian_model_prediction_LR)):
    print(classification_report(observed, predicted))


# Preparing Path Data - OSMNX

In [None]:
# Build the walking network for `place_name` with OSMnx.
# NOTE(review): `place_name` is not defined in any of the visible cells — confirm it is set
# before this cell runs.
walk_path_GTA = osmnx.graph_from_place(place_name, network_type='walk')

# need to reproject to UTM for later analysis
walk_path_GTA_proj = osmnx.project_graph(walk_path_GTA)

# need edges and nodes to append data to
# NOTE(review): despite the `_proj` suffix, these GeoDataFrames are built from the unprojected
# graph `walk_path_GTA`, not from `walk_path_GTA_proj`. Only the nodes are reprojected below;
# the edges stay as returned here. The create_gdf call further down reads the "x"/"y" columns
# as EPSG:4326 coordinates, so this may be deliberate — confirm before changing it.
walk_nodes_proj, walk_edges_proj = osmnx.graph_to_gdfs(walk_path_GTA, nodes=True, edges=True)

# reproject the node geometries to EPSG:2958 (NAD 1983, per the original note);
# nothing is written to disk in this cell
walk_nodes_proj = walk_nodes_proj.to_crs(epsg = 2958)

# keep the node index available as an ordinary column too
walk_nodes_proj['Index'] = walk_nodes_proj.index

# converting the node data to a geopanda dataframe so that I can extract elevation data
walk_node_id = pandas.DataFrame(walk_nodes_proj)
# NOTE(review): create_gdf is not defined in the visible cells; presumably it builds point
# geometries from the named latitude/longitude columns in the given projection — verify
# against its definition.
walk_nodes_gdf = create_gdf(df=walk_node_id,
                            Latitude="y",
                            Longitude="x",
                            projection="EPSG:4326")
# bring the rebuilt node GeoDataFrame into EPSG:2958 as well
walk_nodes_gdf = walk_nodes_gdf.to_crs(epsg=2958)  

# Preparing Terrain Data - Government of Canada
I need to pull in the terrain data so that I can attach it to the road network nodes and then to the edges as weights.

In [None]:
# dealing with raster data
# The GDAL Python bindings live in the `osgeo` package; the bare top-level `gdal`
# module is a legacy alias that newer GDAL releases no longer ship, so it is only
# used as a fallback.
try:
    from osgeo import gdal
except ImportError:
    import gdal

filepath_TSC = r"C:/Users/jodyn/Google Drive/Insight/Terrain/TSC_2958.tif"
filepath_Hillshade = r"C:/Users/jodyn/Google Drive/Insight/Terrain/Hillshade_2958.tif"
raster_TSC = gdal.Open(filepath_TSC)
raster_Hillshade = gdal.Open(filepath_Hillshade)

# gdal.Open can hand back None instead of raising when a file cannot be opened;
# fail here with the offending path rather than with an obscure error later on.
for opened_raster, raster_path in ((raster_TSC, filepath_TSC),
                                   (raster_Hillshade, filepath_Hillshade)):
    if opened_raster is None:
        raise OSError("GDAL could not open raster: " + raster_path)

In [None]:
# I will use IDW to interpolate the probability values at each node

In [None]:
# The walking-network nodes need to be in WGS (EPSG:4326) for the matching below
walk_nodes_gdf = walk_nodes_gdf.to_crs(epsg=4326)

# Match the walking nodes against the robbery-model nodes and, separately, the
# pedestrian-model nodes. The results are kept under their own names so they can
# be added to the main dataframe afterwards.
# NOTE(review): nearest_neighbor_2 and both *_model_nodes_4326 inputs are defined
# outside the visible cells — confirm what columns the helper returns.
walk_nodes_gdf_NB = nearest_neighbor_2(
    walk_nodes_gdf, robbery_model_nodes_4326, return_dist=True)
walk_nodes_gdf_NB_pd = nearest_neighbor_2(
    walk_nodes_gdf, pedestrian_model_nodes_4326, return_dist=True)

# Give both results the walking nodes' index for the future join to the full node dataset
for matched_nodes in (walk_nodes_gdf_NB, walk_nodes_gdf_NB_pd):
    matched_nodes.set_index(walk_nodes_gdf.index, inplace=True)

# Also carry the terrain values and the pedestrian-model outputs over to each node
# for the edge weights later. Values are copied as plain lists, i.e. by row
# position; the column names are used exactly as they are spelled in the data.
for terrain_column in ('TSC', 'Hillshade'):
    walk_nodes_gdf_NB[terrain_column] = list(walk_nodes_gdf[terrain_column])
for outcome_column in ('Collison', 'No_Collison'):
    walk_nodes_gdf_NB[outcome_column] = list(walk_nodes_gdf_NB_pd[outcome_column])

# Exporting Node & Edge Data for Shortest Path in Application

In [None]:
import os
import pickle

# Neither DataFrame.to_pickle nor open() creates missing folders, so make sure the
# output directory exists before writing (relative to the current working directory).
os.makedirs("data/cleaned", exist_ok=True)

# Node table with the model outputs attached, plus the edge and node tables of the network
walk_nodes_gdf_NB.to_pickle("data/cleaned/walknodes.pkl")
walk_edges_proj.to_pickle("data/cleaned/edges.pkl")
walk_nodes_proj.to_pickle("data/cleaned/nodes.pkl")

# The projected graph itself is written with the pickle module
with open("data/cleaned/path.p", 'wb') as f:
    pickle.dump(walk_path_GTA_proj, f)
