
# Machine Learning for VA: 911 Call Priority Classifier 

___

 Muhammad H. Sareini - mhs3vh@virginia.edu
 
Javier Rosas Ruiz - jr2dj@virginia.edu
 
Sammy R. Hecht - srh2kq@virginia.edu
 
 ---


In [None]:
# Some packages to import

from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix # optional
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

## Load the Dataset


In [None]:
# Read the calls-for-service CSV into the `calls` DataFrame that every later cell works on.
calls = pd.read_csv("~/ml/Police_Calls_for_Service.csv")

In [None]:
# Preview the first rows of the freshly loaded table (notebook display only).
calls.head() # look at the data

## Data Pre-Processing 


### Use Only Relevant Data

In [None]:
## Trim the table down to the columns and rows the classifier can use

# Identifiers, the subdivision, and every timestamp other than the call time
columns_to_drop = [
    'Incident Number', 'Report Number', 'Subdivision',
    'Entry Date/Time', 'Dispatch Date/Time', 'En Route Date/Time',
    'On Scene Date/Time', 'Close Date/Time',
]

# One pass over the frame:
#   1. remove the unused columns
#   2. keep only locations with exactly two line breaks (the ones that carry coordinates)
#   3. discard rows with no Case Disposition, which would break the pipeline
#   4. cast Zone to str, because the column mixes ints and strings
calls = (
    calls.drop(columns=columns_to_drop)
    .loc[lambda frame: frame["Location"].str.count("\n").eq(2)]
    .dropna(subset=["Case Disposition"])
    .astype({"Zone": str})
)

In [None]:
# Preview the table after the unused columns and unusable rows have been removed.
calls.head()

### Break Up Attributes

In [None]:
### The existing attributes have to be split apart by hand here, because
### an attribute-adder pipeline step only adds attributes.

# Raw call timestamp strings; month and year are cut out by fixed character position.
# The hour is derived separately and converted to military time.
dt = calls["Call Date/Time"]
month = dt.str.slice(stop=2)
year = dt.str.slice(start=6, stop=10)

def convert_millitary_time(dt):
  """
  Return the hour of a 12-hour timestamp string as a 24-hour ("military") hour.

  The hour is read from characters 11-12 of ``dt`` and the AM/PM marker from
  its last two characters, e.g. ``"01/15/2018 09:05:00 PM"``.

  Returns a string:
    * PM hours are shifted into 12-23 ("12" stays "12", "01" becomes "13").
    * "12" AM (midnight) becomes "00".
    * Any other input keeps its hour text exactly as written.

  Raises ValueError if the hour characters are not an integer.
  """

  hr = dt[11:13]
  am_pm = dt[-2:]
  num_hr = int(hr)

  if am_pm == "PM":
    # 12 PM is noon and must stay 12; only 1-11 PM move forward by twelve hours.
    return str(num_hr % 12 + 12)
  if am_pm == "AM" and num_hr == 12:
    # 12 AM is midnight, the first hour of the day.
    return "00"
  return hr
  
# Run the converter over every call timestamp to build the hour values.
hr = dt.apply(convert_millitary_time) 

# Get Lat and Long 
def get_lat(loc):
  """
  Return the latitude from a location string as a float.

  The last line of ``loc`` is expected to hold a "(lat, long)" pair; the
  parentheses are removed and the value before the comma is returned.
  Raises ValueError if that line does not split into exactly two
  comma-separated parts, or if the first part is not a number.
  """
  coord_text = loc.rsplit("\n", 1)[-1]
  coord_text = coord_text.translate(str.maketrans("", "", "()"))
  lat_text, _long_text = coord_text.split(",")
  return float(lat_text)

def get_long(loc):
  """
  Return the longitude from a location string as a float.

  The last line of ``loc`` is expected to hold a "(lat, long)" pair; the
  parentheses are removed and the value after the comma is returned.
  Raises ValueError if that line does not split into exactly two
  comma-separated parts, or if the second part is not a number.
  """
  coord_text = loc.rsplit("\n", 1)[-1]
  coord_text = coord_text.translate(str.maketrans("", "", "()"))
  _lat_text, long_text = coord_text.split(",")
  return float(long_text)

# Parse each location string twice: once for the latitude, once for the longitude
_locations = calls['Location']
lat = _locations.apply(get_lat)
long = _locations.apply(get_long)




In [None]:
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook


# Overlay the first 100 call locations on a background map image.
# url for image is: https://cdn.theatlantic.com/media/old_wire/img/upload/2012/04/06/mayfairmews.04062012.png
datafile = "c:\\users\\sammy hecht\\ml\\va-beach.png"

# scipy.misc.imread is no longer available in SciPy, so the image is read with
# matplotlib. Reading straight from the path also avoids the open file handle
# that cbook.get_sample_data() returned and that was never closed.
img = plt.imread(datafile)

sample_lat = lat[:100]
sample_long = long[:100]

# Longitude runs along the x axis and latitude along the y axis, so the points
# are oriented the same way as a north-up map.
# NOTE(review): the image is stretched to the bounding box of the sampled
# points; confirm that box matches the area the image actually covers.
plt.scatter(sample_long, sample_lat, color="red")
plt.imshow(img, zorder=0, extent=[sample_long.min(), sample_long.max(), sample_lat.min(), sample_lat.max()])
plt.show()

In [None]:
## Attach the derived attributes to the table
# `year` is not added as a column.
for _column, _values in (("Month", month), ("Hour", hr), ("Lat", lat), ("Long", long)):
  calls[_column] = _values

# The raw columns are no longer needed once their parts have been extracted
calls = calls.drop(columns=["Call Date/Time", "Location"])

## See how the data looks now 
calls.head()

In [None]:
# Correlate only the numeric (and boolean) columns. Recent pandas versions
# raise when DataFrame.corr() meets string columns such as "Call Type",
# while older versions silently skipped them; selecting the columns
# explicitly behaves the same way on both.
# NOTE(review): assumes "Priority" is stored as a number — confirm.
corr_matrix = calls.select_dtypes(include=["number", "bool"]).corr()
corr_matrix["Priority"].sort_values(ascending=False)

In [None]:
### Hold out 20% of the calls as a test set (fixed seed so the split is repeatable)
train_set, test_set = train_test_split(calls, test_size=0.2, random_state=42)

# In each partition, "Priority" is the target and every other column is a feature
X_train, y_train = train_set.drop(columns='Priority'), train_set['Priority']
X_test, y_test = test_set.drop(columns='Priority'), test_set['Priority']

In [None]:
### Pipeline 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 

# Numeric attributes are standardized
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

num_attribs = ["Lat", "Long"]
cat_attribs = ["Call Type", "Zone", "Case Disposition", "Month", "Hour"]

# cats is the categories we will encode, one array per categorical attribute.
# They are taken from the whole table so the train and test matrices share the
# same one-hot columns.
cats = [calls[attrib].unique() for attrib in cat_attribs]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(categories=cats), cat_attribs),
])

# Fit on the training set only, then reuse the fitted transformer for the test
# set. Fitting on the test set as well would scale it with its own statistics
# and leak information from it.
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.transform(X_test)

## Model Training


### Logistic Regression

In [None]:
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score
# Logistic Regression
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including solver convergence warnings from the models below.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42, C=1)
log_reg.fit(X_train, y_train)

# Predict once and reuse the result for every metric
y_pred_log_reg = log_reg.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# macro precision and macro recall are reported under each other's name.
acc_sc = accuracy_score(y_test, y_pred_log_reg)
prec_sc = precision_score(y_test, y_pred_log_reg, average="macro")
f1_sc = f1_score(y_test, y_pred_log_reg, average="macro")
rec_sc = recall_score(y_test, y_pred_log_reg, average="macro")

# Printed in the order: accuracy, precision, F1, recall
print(acc_sc, prec_sc, f1_sc, rec_sc)

In [None]:
# Show the predicted priority for the first 100 rows of the test set.
log_reg.predict(X_test[:100])

0.9699916666666667 

0.6693597069097817 

0.7015597844828797 

0.9693108007538811


### SVM

In [None]:
from sklearn.svm import LinearSVC

# Training your svm here
# NOTE(review): max_iter=10 is far below LinearSVC's default of 1000, so the
# solver may stop before converging — confirm this limit is intentional.
svm_clf = LinearSVC(C=1, loss="hinge", random_state=42, max_iter=10)
# y_train is already one-dimensional, so it is passed as-is
# (Series.ravel() is deprecated in pandas).
svm_clf.fit(X_train, y_train)

# Testing your svm here
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


# Out-of-fold predictions on the training set from 3-fold cross-validation.
# Unlike the other models, this cell is scored on the training folds rather
# than on the held-out test set.
y_train_predict = cross_val_predict(svm_clf, X_train, y_train, cv=3)


# 1) Accuracy: mean of the three per-fold accuracies
print("Accuracy Score: ", np.average(cross_val_score(svm_clf, X_train, y_train, cv=3, scoring="accuracy")))


# 2) Precision (macro-averaged over the classes):
print("Precision Score: ", precision_score(y_train, y_train_predict, average="macro"))


# 3) Recall (macro-averaged over the classes):
print("Recall Score: ", recall_score(y_train, y_train_predict, average="macro"))


# 4) F-1 Score (macro-averaged over the classes):
print("F-1 Score: ", f1_score(y_train, y_train_predict, average="macro"))




Accuracy Score:  0.9689936222111335

Precision Score:  0.920261552357195

Recall Score:  0.6628074524914764

F-1 Score:  0.6941962931810439

### Tuning SVM with the regularization hyperparameter C in order to avoid overfitting and get better accuracy.

In [None]:
# Try several values of the regularization hyperparameter C with otherwise
# identical linear SVMs, and report the mean 3-fold cross-validated accuracy
# of each one.
c_values = [1, 10, 13, 20, 50]
svm_candidates = []
cv_accuracies = []

for c_value in c_values:
  candidate = LinearSVC(C=c_value, loss="hinge", random_state=42, max_iter=10)
  # y_train is already one-dimensional, so no ravel() is needed
  candidate.fit(X_train, y_train)
  cv_accuracy = np.average(cross_val_score(candidate, X_train, y_train, cv=3, scoring="accuracy"))
  print("The accuracy when C = {}: ".format(c_value), cv_accuracy)
  svm_candidates.append(candidate)
  cv_accuracies.append(cv_accuracy)

# Keep the per-model names, in the same order as c_values
svm_clf1, svm_clf2, svm_clf3, svm_clf4, svm_clf5 = svm_candidates
score1, score2, score3, score4, score5 = cv_accuracies


In [None]:
# Report the best C from the measured scores rather than a hard-coded value,
# so the message stays correct if the data or settings change.
# score1..score5 are the accuracies for C = 1, 10, 13, 20 and 50.
scores_by_c = {1: score1, 10: score2, 13: score3, 20: score4, 50: score5}
best_c = max(scores_by_c, key=scores_by_c.get)
print("Best value of C is C = {}, with an accuracy of {}".format(best_c, scores_by_c[best_c]))

### **DO NOT RUN THE CELL BELOW.** DUE TO THE SIZE OF OUR DATASET, IT TAKES ABOUT 1 HOUR TO RUN. The cell is Kernelizing the SVM with the Gaussian RBF. RESULTS OF THE GAUSSIAN KERNEL ARE WRITTEN BELOW the code cell. 


In [None]:
from sklearn.svm import SVC


# SVM with a Gaussian RBF kernel
rbf_kernel_svm_clf1 = SVC(kernel="rbf", gamma=0.1, C=1)
rbf_kernel_svm_clf1.fit(X_train, y_train)

# Out-of-fold predictions on the training set from 3-fold cross-validation
y_train_predict1 = cross_val_predict(rbf_kernel_svm_clf1, X_train, y_train, cv=3)


# 1) Accuracy: scored on the RBF model's own out-of-fold predictions.
#    (Scoring `svm_clf` here would report the linear SVM instead, and a second
#    cross-validation run of this slow model is not needed.)
print("Accuracy Score: ", accuracy_score(y_train, y_train_predict1))


# 2) Precision: macro-averaged like the other models; without `average` the
#    metric defaults to binary mode and rejects multiclass targets.
print("Precision Score: ", precision_score(y_train, y_train_predict1, average="macro"))


# 3) Recall: 
print("Recall Score: ", recall_score(y_train, y_train_predict1, average="macro"))


# 4) F-1 Score: 
print("F-1 Score: ", f1_score(y_train, y_train_predict1, average="macro"))


# Creating ROC curve: 
y_scores1 = cross_val_predict(rbf_kernel_svm_clf1, X_train, y_train, cv=3, method="decision_function")

plt.figure(figsize=(8, 6))
if y_scores1.ndim == 1:
  # Two classes: a single column of scores gives a single curve
  fpr1, tpr1, thresholds1 = roc_curve(y_train, y_scores1)
  plt.plot(fpr1, tpr1, linewidth=2, label=None)
else:
  # More than two classes: roc_curve only handles binary targets, so draw one
  # one-vs-rest curve per class. Score columns follow the sorted class labels.
  for class_index, class_label in enumerate(np.unique(y_train)):
    is_class = (y_train == class_label).astype(int)
    fpr1, tpr1, thresholds1 = roc_curve(is_class, y_scores1[:, class_index])
    plt.plot(fpr1, tpr1, linewidth=2, label="Priority {}".format(class_label))
  plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([0, 1, 0, 1])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.show()

Accuracy Score: 0.92

Precision Score: 0.89

Recall Score: 0.71

F-1 Score: 0.60

### Decision Tree

The decision tree is no longer restricted to a shallow maximum depth. With the maximum depth set to 2, the scores were as follows; increasing the maximum depth (see the cell below) improved all of them:

>**Accuracy: 0.7848583333333333**

>**Precision: 0.5412268928390945**

>**Recall: 0.30713560243964166**

>**F1 Score: 0.3405258278733306**


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier

# A single tree allowed to grow to depth 70, trained with a fixed seed
d_tree = DecisionTreeClassifier(max_depth=70, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test set
y_pred_d_tree = d_tree.predict(X_test)

# Each metric is computed and printed in turn; all but accuracy are macro-averaged
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred_d_tree)
print("Accuracy:", acc_score)

prec_score = precision_score(y_true=y_test, y_pred=y_pred_d_tree, average='macro')
print("Precision:", prec_score)

rec_score = recall_score(y_true=y_test, y_pred=y_pred_d_tree, average='macro')
print("Recall:", rec_score)

f1 = f1_score(y_true=y_test, y_pred=y_pred_d_tree, average='macro')
print("F1 Score:", f1)



*Accuracy*: 0.9610583333333333

Precision: 0.7716911449224426

Recall: 0.7358102746749348

F1 Score: 0.7516427398957578

### Random Forest

Using random forest, precision went up over 10% with a small sacrifice to recall and F1

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ten trees, each allowed to grow to depth 70, trained with a fixed seed
r_forest = RandomForestClassifier(random_state=42, max_depth=70, n_estimators=10).fit(X_train, y_train)
y_pred_r_forest = r_forest.predict(X_test)

# Score on the held-out test set, one metric at a time;
# everything except accuracy is macro-averaged over the classes
for _label, _metric, _options in (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {"average": "macro"}),
    ("Recall:", recall_score, {"average": "macro"}),
    ("F1 Score:", f1_score, {"average": "macro"}),
):
  print(_label, _metric(y_test, y_pred_r_forest, **_options))

Accuracy: 0.9685416666666666

Precision: 0.8939483268758561

Recall: 0.6959151692412575

F1 Score: 0.7421529437200082

### Ensemble Learning

We will use an ensemble voting algorithm on each classifier we have already trained, to see if we can get a better performance. After playing with the parameters, it was found that after dropping random forest or decision tree classifier, the algorithm performs better in precision with only a small hit to F1 and recall resulting in very high accuracy and precision. In the end we decided to drop the random forest in favor of the decision tree classifier. 

In [None]:
from sklearn.ensemble import VotingClassifier

# The ensemble votes across the logistic regression, the linear SVM and the
# decision tree; the random forest is not part of it
named_classifiers = list(zip(
    ('log_reg', 'svm_clf', 'd_tree'),
    (log_reg, svm_clf, d_tree),
))

# Train the voting ensemble on the training set
voting_clf = VotingClassifier(estimators=named_classifiers).fit(X_train, y_train)

# Predict the held-out test set
y_pred_ensemble = voting_clf.predict(X_test)

# Score one metric at a time; everything except accuracy is macro-averaged
for _label, _metric, _options in (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {"average": "macro"}),
    ("Recall:", recall_score, {"average": "macro"}),
    ("F1 Score:", f1_score, {"average": "macro"}),
):
  print(_label, _metric(y_test, y_pred_ensemble, **_options))

Accuracy: 0.969925

Precision: 0.9703112433006822

Recall: 0.6685719583226668

F1 Score: 0.7009708502651231

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# train the boosting model
# `loss` is left at its default: the "deviance" name was removed from
# scikit-learn (it is now called "log_loss"), and it was already the default
# loss, so omitting it trains the same model on old and new versions alike.
grad_boost = GradientBoostingClassifier(learning_rate=.2, random_state=42, n_estimators=50)

grad_boost.fit(X_train, y_train)

# predict the held-out test set
y_pred_boost = grad_boost.predict(X_test)


## accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_boost))

## precision (macro-averaged over the classes)
print("Precision:",precision_score(y_test,y_pred_boost, average='macro'))

## recall (macro-averaged over the classes)
print("Recall:", recall_score(y_test, y_pred_boost, average='macro')) 

## F1-Score (macro-averaged over the classes)
print("F1 Score:", f1_score(y_test, y_pred_boost, average='macro'))

The GradientBoostingClassifier performed well in terms of accuracy, recall, and F1 score, but suffered slightly with precision.  With a learning rate of .1, and 100 boosting stages, the metrics were as follows:

>Accuracy: 0.9648581844635313

>Precision: 0.8866753181067324

>Recall: 0.7310775582409732

>F1 Score: 0.7759839794883954