## Linear & Logistic Regression

In [None]:
#import packages

import math
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
from mpl_toolkits import mplot3d
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# load data

# Read "cleanedData.csv" (path relative to the working directory) into a
# pandas DataFrame bound to `df`.
df = pd.read_csv("cleanedData.csv")
# Bare expression: as the last statement of a notebook cell it renders `df`.
df

In [None]:
# Work on a copy so the raw frame `df` is left untouched.
encoded_df = df.copy()

# Build a numeric date feature from the 'Date' and 'Time' string columns.
# The two strings are joined so pandas can parse them as a single timestamp,
# then every timestamp is reduced to its proleptic Gregorian ordinal (an
# integer day count) and inserted directly in front of the 'Date' column.
# NOTE(review): toordinal() keeps only the calendar day, so the time-of-day
# part of 'Time' never reaches 'Date_Time' -- confirm this is intended.
date = encoded_df['Date']
time = encoded_df['Time']
date_time = date + ' ' + time
dates_formatted = pd.to_datetime(date_time)
loc = encoded_df.columns.get_loc("Date")
encoded_df.insert(loc, "Date_Time", dates_formatted.map(dt.datetime.toordinal))

# Categorical columns to label-encode. A tuple (rather than a set) keeps the
# iteration order, and therefore the key order of `encoded_map`, the same on
# every run.
categories = (
    'IUCR',
    'Location Description',
    'Arrest',
    'FBI Code',
    'PRIMARY DESCRIPTION',
    'SECONDARY DESCRIPTION',
)

# encoded_map[column][code] -> original value for that integer code.
encoded_map = {}
for i in categories:
    # A fresh encoder per column: each column gets its own 0..k-1 code space.
    label_encoder = preprocessing.LabelEncoder()
    temp = label_encoder.fit_transform(encoded_df[i])

    # Place '<column>_encoded' immediately before its source column.
    loc = encoded_df.columns.get_loc(i)
    encoded_df.insert(loc, i + '_encoded', temp)

    # classes_[code] is the original label for `code`, so the reverse map can
    # be read straight off the encoder instead of zipping every row.
    encoded_map[i] = dict(enumerate(label_encoder.classes_))

encoded_df

In [None]:
# Show the number of distinct values in each column of the encoded frame.
encoded_df.nunique()


### Linear Regression

In [None]:
# Linear regression: predict the encoded IUCR code from the encoded location
# description and the ordinal date.
x_data = encoded_df[['Location Description_encoded', 'Date_Time']]
y_data = encoded_df['IUCR_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual IUCR code")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded FBI code from the encoded location
# description and the ordinal date.
x_data = encoded_df[['Location Description_encoded', 'Date_Time']]
y_data = encoded_df['FBI Code_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual FBI code")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded primary description from the encoded
# location description and the ordinal date.
x_data = encoded_df[['Location Description_encoded', 'Date_Time']]
y_data = encoded_df['PRIMARY DESCRIPTION_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Primary Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded location description from the
# encoded primary description and the ordinal date.
x_data = encoded_df[['PRIMARY DESCRIPTION_encoded', 'Date_Time']]
y_data = encoded_df['Location Description_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded location description from the
# encoded FBI code and the ordinal date.
x_data = encoded_df[['FBI Code_encoded', 'Date_Time']]
y_data = encoded_df['Location Description_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded location description from the
# encoded IUCR code and the ordinal date.
x_data = encoded_df[['IUCR_encoded', 'Date_Time']]
y_data = encoded_df['Location Description_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

Linear regression is not well suited to predicting which crimes will occur at a given location and time, because both the location and the crime type are categorical data. Label-encoding them implies an ordering (the higher the code a category is given, the larger or more important it appears to the model), but in reality all categories are equal in importance.

### Logistic Regression

In [None]:
# Logistic regression: predict the encoded 'Arrest' flag from the encoded
# crime codes/descriptions, the encoded location description and the ordinal
# date.
y_data = encoded_df['Arrest_encoded']
x_data = encoded_df[[
    'IUCR_encoded',
    'Location Description_encoded',
    'Date_Time',
    'PRIMARY DESCRIPTION_encoded',
    'SECONDARY DESCRIPTION_encoded',
]]

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
# NOTE(review): the features are used unscaled and 'Date_Time' is a large
# ordinal; the solver may not converge at default settings -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
logisticRegr = LogisticRegression().fit(x_train, y_train)

# coef_[0] is the first (here presumably the only) row of coefficients.
for col_name, coef in zip(x_train.columns, logisticRegr.coef_[0]):
    print(f"The coefficient for {col_name} is {coef}")

# Accuracy and confusion matrix on the held-out split.
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Annotated heatmap of the confusion matrix, titled with the accuracy.
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt=".2f", linewidths=.5, square=True, cmap='Blues_r')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Accuracy Score: {score}'
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(all_sample_title, size=15);


In [None]:
# Logistic regression: predict the encoded FBI code from the encoded location
# description and the ordinal date.
y_data = encoded_df['FBI Code_encoded']
x_data = encoded_df[['Location Description_encoded', 'Date_Time']]

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
# NOTE(review): the features are used unscaled and 'Date_Time' is a large
# ordinal; the solver may not converge at default settings -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
logisticRegr = LogisticRegression().fit(x_train, y_train)

# NOTE(review): only row 0 of coef_ is printed; if the target has more than
# two classes this is presumably just the first class's coefficients.
for col_name, coef in zip(x_train.columns, logisticRegr.coef_[0]):
    print(f"The coefficient for {col_name} is {coef}")

# Accuracy and confusion matrix on the held-out split.
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Heatmap of the confusion matrix (no cell annotations), titled with the
# accuracy.
plt.figure(figsize=(20, 20))
sns.heatmap(cm, fmt=".2f", linewidths=.5, square=True, cmap='Blues_r')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Accuracy Score: {score}'
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(all_sample_title, size=15);

Logistic regression is better suited to modelling categorical data, but since the crime types have too many categories, it is still a very inaccurate model.

In [None]:
# Bucket each 'Location Description' into one coarse numeric code, stored in
# the new column 'Loca Num':
#   1 = residential area
#   2 = public open space
#   3 = non-residential building
#   4 = transportation
#   0 = other (no keyword matched)
# A description belongs to a bucket when it contains any of that bucket's
# keywords as a substring. Buckets are tested in the order listed below and
# the first match wins, so e.g. a description containing both 'STATION' and
# 'BUS' is coded 2, not 4.
loca = encoded_df['Location Description']

# Keywords are joined into a regular-expression alternation, so they must not
# contain regex metacharacters (plain words and spaces only).
location_keywords = {
    1: ('RESIDEN', 'APARTMENT'),
    2: ('STREET', 'SIDEWALK', 'STATION', 'LOT', 'ALLEY', 'PLATFORM',
        'HIGHWAY', 'BRIDGE', 'LAKEFRONT', 'FOREST', 'LAKE', 'RIVER', 'PARK'),
    3: ('BUILDING', 'STORE', 'RESTAURANT', 'SCHOOL', 'OFFICE', 'HOTEL',
        'BANK', 'CONSTRUCTION SITE', 'CLUB', 'BARBERSHOP', 'COLLEGE',
        'LIBRARY', 'ATM', 'CENTER', 'THEATER', 'CHURCH', 'STADIUM',
        'FACILITY', 'SHOP', 'HOSPITAL', 'CREDIT UNION', 'MOTEL', 'AIRPORT',
        'FACTORY', 'ROOM', 'BAR'),
    4: ('TRANSPORTATION', 'TRUCK', 'BUS', 'TAXICAB', 'VEHICLE', 'TRAIN'),
}

# One scan of the column per bucket. na=False makes a missing description
# count as "no match" so it falls through to code 0.
conditions = [
    loca.str.contains('|'.join(keywords), na=False).to_numpy(dtype=bool)
    for keywords in location_keywords.values()
]

# np.select takes the choice of the first condition that is True, which gives
# the same priority order as a chain of nested np.where calls.
encoded_df.loc[:, 'Loca Num'] = np.select(
    conditions, list(location_keywords), default=0
)

### Linear and Logistic Regression w/ Loca Num

In [None]:
# Linear regression: predict the encoded IUCR code from the coarse location
# bucket ('Loca Num') and the ordinal date.
x_data = encoded_df[['Loca Num', 'Date_Time']]
y_data = encoded_df['IUCR_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual IUCR code")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded FBI code from the coarse location
# bucket ('Loca Num') and the ordinal date.
x_data = encoded_df[['Loca Num', 'Date_Time']]
y_data = encoded_df['FBI Code_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual FBI code")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the encoded primary description from the coarse
# location bucket ('Loca Num') and the ordinal date.
x_data = encoded_df[['Loca Num', 'Date_Time']]
y_data = encoded_df['PRIMARY DESCRIPTION_encoded']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Primary Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the coarse location bucket ('Loca Num') from the
# encoded primary description and the ordinal date.
x_data = encoded_df[['PRIMARY DESCRIPTION_encoded', 'Date_Time']]
y_data = encoded_df['Loca Num']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the coarse location bucket ('Loca Num') from the
# encoded FBI code and the ordinal date.
x_data = encoded_df[['FBI Code_encoded', 'Date_Time']]
y_data = encoded_df['Loca Num']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Linear regression: predict the coarse location bucket ('Loca Num') from the
# encoded IUCR code and the ordinal date.
x_data = encoded_df[['IUCR_encoded', 'Date_Time']]
y_data = encoded_df['Loca Num']

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
regression_model = LinearRegression().fit(x_train, y_train)

# Report one fitted coefficient per feature column, in column order.
for col_name, coef in zip(x_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

intercept = regression_model.intercept_
print(f"The intercept for our model is {intercept}")

# score() on the held-out split, scaled to a percentage for display.
score = regression_model.score(x_test, y_test) * 100
print(f"{score}% of the variability in Y can be explained using X")

# Square root of the mean squared error on the held-out split. The variable
# keeps its original name although the stored value is the RMSE.
y_predict = regression_model.predict(x_test)
regression_model_mse = math.sqrt(mean_squared_error(y_test, y_predict))
print(f"On average we are {regression_model_mse} away from the actual encoded Location Description")

# Single-prediction example, left disabled:
# print("Example: ")
# regression_model.predict([[113, 732698]])

In [None]:
# Logistic regression: predict the encoded 'Arrest' flag from the encoded
# crime codes/descriptions, the coarse location bucket ('Loca Num') and the
# ordinal date.
y_data = encoded_df['Arrest_encoded']
x_data = encoded_df[[
    'IUCR_encoded',
    'Loca Num',
    'Date_Time',
    'PRIMARY DESCRIPTION_encoded',
    'SECONDARY DESCRIPTION_encoded',
]]

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
# NOTE(review): the features are used unscaled and 'Date_Time' is a large
# ordinal; the solver may not converge at default settings -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
logisticRegr = LogisticRegression().fit(x_train, y_train)

# coef_[0] is the first (here presumably the only) row of coefficients.
for col_name, coef in zip(x_train.columns, logisticRegr.coef_[0]):
    print(f"The coefficient for {col_name} is {coef}")

# Accuracy and confusion matrix on the held-out split.
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Annotated heatmap of the confusion matrix, titled with the accuracy.
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt=".2f", linewidths=.5, square=True, cmap='Blues_r')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Accuracy Score: {score}'
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(all_sample_title, size=15);

In [None]:
# Logistic regression: predict the encoded FBI code from the coarse location
# bucket ('Loca Num') and the ordinal date.
y_data = encoded_df['FBI Code_encoded']
x_data = encoded_df[['Loca Num', 'Date_Time']]

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
# NOTE(review): the features are used unscaled and 'Date_Time' is a large
# ordinal; the solver may not converge at default settings -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
logisticRegr = LogisticRegression().fit(x_train, y_train)

# NOTE(review): only row 0 of coef_ is printed; if the target has more than
# two classes this is presumably just the first class's coefficients.
for col_name, coef in zip(x_train.columns, logisticRegr.coef_[0]):
    print(f"The coefficient for {col_name} is {coef}")

# Accuracy and confusion matrix on the held-out split.
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Heatmap of the confusion matrix (no cell annotations), titled with the
# accuracy.
plt.figure(figsize=(20, 20))
sns.heatmap(cm, fmt=".2f", linewidths=.5, square=True, cmap='Blues_r')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Accuracy Score: {score}'
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(all_sample_title, size=15);

In [None]:
# Logistic regression: predict the coarse location bucket ('Loca Num') from
# the encoded crime codes/descriptions, the encoded arrest flag and the
# ordinal date.
y_data = encoded_df['Loca Num']
x_data = encoded_df[[
    'IUCR_encoded',
    'Arrest_encoded',
    'Date_Time',
    'PRIMARY DESCRIPTION_encoded',
    'SECONDARY DESCRIPTION_encoded',
]]

# Hold out 25% of the rows for evaluation; fit on the remaining 75%.
# NOTE(review): the features are used unscaled and 'Date_Time' is a large
# ordinal; the solver may not converge at default settings -- confirm.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
logisticRegr = LogisticRegression().fit(x_train, y_train)

# NOTE(review): only row 0 of coef_ is printed; if the target has more than
# two classes this is presumably just the first class's coefficients.
for col_name, coef in zip(x_train.columns, logisticRegr.coef_[0]):
    print(f"The coefficient for {col_name} is {coef}")

# Accuracy and confusion matrix on the held-out split.
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Heatmap of the confusion matrix (no cell annotations), titled with the
# accuracy.
plt.figure(figsize=(20, 20))
sns.heatmap(cm, fmt=".2f", linewidths=.5, square=True, cmap='Blues_r')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
all_sample_title = f'Accuracy Score: {score}'
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(all_sample_title, size=15);