### FINAL MODELS

- code only
- report uses results from:
    - GradientBoosting and Neural Network.ipynb
    - Random Forest Regression.ipynb

In [None]:
# import packages 

import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn import ensemble, metrics
from sklearn.linear_model import LinearRegression

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [None]:
# Read the cleaned dataset from disk and preview its first ten rows
df = pd.read_csv(filepath_or_buffer="cleanedData.csv")
df.head(n=10)

In [None]:
# Extract the hour of day from the 'Time' column; it is used later as a model feature.
# (A bare `df.dtypes` in the middle of a cell displays nothing, so it was dropped.)
hours = pd.to_datetime(df['Time']).dt.hour
hours.head(10)


In [None]:
# Keep the raw location descriptions and list every distinct value
loca = df.loc[:, 'Location Description']
pd.unique(loca).tolist()

In [None]:
newDf = df.copy()

# It is hard to build a model that predicts the type of a crime location
# in 157 categories, so the location descriptions are collapsed into fewer groups:
#   1 = Residential Area, 2 = Public Open Space, 3 = Non-Residential Buildings,
#   4 = Transportation, 0 = OTHER (no keyword matched)
#
# The groups are listed in priority order: a description that matches keywords
# of several groups is assigned to the first group that matches.
location_groups = [
    (1, ['RESIDEN', 'APARTMENT']),
    (2, ['STREET', 'SIDEWALK', 'STATION', 'LOT', 'ALLEY', 'PLATFORM',
         'HIGHWAY', 'BRIDGE', 'LAKEFRONT', 'FOREST', 'LAKE', 'RIVER', 'PARK']),
    (3, ['BUILDING', 'STORE', 'RESTAURANT', 'SCHOOL', 'OFFICE', 'HOTEL',
         'BANK', 'CONSTRUCTION SITE', 'CLUB', 'BARBERSHOP', 'COLLEGE',
         'LIBRARY', 'ATM', 'CENTER', 'THEATER', 'CHURCH', 'STADIUM',
         'FACILITY', 'SHOP', 'HOSPITAL', 'CREDIT UNION', 'MOTEL', 'AIRPORT',
         'FACTORY', 'ROOM', 'BAR']),
    (4, ['TRANSPORTATION', 'TRUCK', 'BUS', 'TAXICAB', 'VEHICLE', 'TRAIN']),
]

# One regex alternation per group (the keywords contain no regex metacharacters).
# na=False sends missing descriptions to the default group instead of
# propagating NaN into the boolean masks.
group_masks = [
    loca.str.contains('|'.join(keywords), na=False).values
    for _, keywords in location_groups
]
group_codes = [code for code, _ in location_groups]

# np.select picks the code of the first mask that is True, falling back to 0 (OTHER)
newDf.loc[:, 'Loca Num'] = np.select(group_masks, group_codes, default=0)
                                                               

In [None]:
# Label Encoding categorical data
# Drop the identifier and the raw columns that have already been converted into
# features. The drop is not in-place and tolerates missing columns so that this
# cell can be re-run without raising a KeyError.
newDf = newDf.drop(['ID', 'Time', 'Location Description'], axis=1, errors='ignore')
newDf['Hours'] = hours

# all categorical data
categories = ['IUCR', 'Arrest',
              'FBI Code', 'Domestic', 'Month', 'DayofWeek',
              'PRIMARY DESCRIPTION', 'SECONDARY DESCRIPTION', 'INDEX CODE', 'Hours']

# Fitted encoders are kept per column so the integer codes can be mapped back
# to the original values later (encoders[col].classes_ / inverse_transform).
encoders = {}

for col in categories:
    encoder = preprocessing.LabelEncoder()

    # Fit on the column and store the integer codes as '<column>_encoded'
    newDf[col + '_encoded'] = encoder.fit_transform(newDf[col])
    encoders[col] = encoder

newDf.describe()

In [None]:
def plot_correlation_map(df):
    """
    Plot an annotated heatmap of the pairwise correlations of a dataframe.

    Only numeric and boolean columns take part in the correlation. Non-numeric
    columns (e.g. raw string categories) are excluded explicitly, because
    pandas >= 2.0 no longer drops them silently in ``DataFrame.corr`` and
    raises instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column correlations are plotted.
    """
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    _, ax = plt.subplots(figsize=(24, 20))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12}
    )


# newDf still contains the raw string columns next to their encoded versions;
# correlate only the numeric/boolean columns so that DataFrame.corr() does not
# raise on pandas >= 2.0.
numeric_df = newDf.select_dtypes(include=['number', 'bool'])
plot_correlation_map(numeric_df)
numeric_df.corr()

In [None]:
# MODEL FEATURES: time (month, day of week, hour), primary crime type and district
# TARGET: the location group 'Loca Num' (0-4) derived from 'Location Description'.
# The raw 'Location Description' column is dropped before encoding, so a
# 'Location Description_encoded' column never exists; 'Loca Num' is the
# 5-class label the models in this notebook are built for.
location = newDf['Loca Num']
train1 = newDf[['Month_encoded', 'DayofWeek_encoded',
                'PRIMARY DESCRIPTION_encoded', 'Hours', 'District']]


# split our data into train (70%) and test (30%); the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(train1, location, test_size=0.30, random_state=2)

### Random Forest

In [None]:
# build random forest model

# Time the fit + predict + crosstab steps.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution timer (it measures elapsed wall-clock time).
time_start = time.perf_counter()

trees = 35
depth = 15

# A fixed random_state makes the forest (and the reported scores) reproducible
rf = ensemble.RandomForestClassifier(n_estimators=trees, max_depth=depth, random_state=2)
rf.fit(x_train, y_train)

pred = rf.predict(x_test)
comparison = pd.crosstab(y_test, pred)

time1 = time.perf_counter() - time_start
print("time to build a Random Forest Model is:", time1)
print("mean accuracy on the given test data and labels is:", rf.score(x_test, y_test))

In [None]:
# Show per-class precision, recall, F1 and support for the predictions in `pred`.
#
# Precision: the ability of a classifier not to label an instance positive that
#            is actually negative, i.e. of all instances classified positive,
#            what percent was correct.
# Recall:    the ability of a classifier to find all positive instances, i.e.
#            of all instances that were actually positive, what percent was
#            classified correctly.
# F1 score:  a weighted harmonic mean of precision and recall; the best score
#            is 1.0 and the worst is 0.0.
# Support:   the number of actual occurrences of the class in the specified dataset.

report = metrics.classification_report(y_true=y_test, y_pred=pred)
print(report)

In [None]:
# confusion matrix
# https://towardsdatascience.com/demystifying-confusion-matrix-confusion-9e82201592fd

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as a heatmap on the current figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, one per row/column of ``cm`` and in the same order.
    normalize : bool, default False
        If True, every row is divided by its row total (rounded to 3 decimals)
        so each cell shows the fraction of that true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heatmap.
    """
    # Normalise BEFORE drawing so that the colours, the colour bar and the
    # text threshold all refer to the same values that are printed in the cells.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any true samples stay at 0 instead of dividing by zero
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        cm = np.around(cm, decimals=3)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for readability
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    
# Compute confusion matrix
# Use the sorted set of labels that occur in the truth or the predictions, both
# for the matrix and for the axis ticks, so that ticks and rows/columns line up.
class_labels = np.union1d(y_test, pred)
cnf_matrix = metrics.confusion_matrix(y_test, pred, labels=class_labels)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_labels.tolist(),
                      title='Confusion matrix, without normalization')
plt.show()

In [None]:
# Plot normalized confusion matrix
# Tick labels are the sorted labels present in the truth or the predictions,
# which is the row/column order of the confusion matrix.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=np.union1d(y_test, pred).tolist(), normalize=True,
                      title='Normalized confusion matrix')
plt.show()

### Gradient Boosting 

In [None]:
# The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting
# so a large number usually results in better performance.

# NOTE(review): the target is a nominal category code, but a *regressor* is
# fitted and its output rounded to the nearest code. Consider
# ensemble.GradientBoostingClassifier if the reported results may be regenerated.

# Time the fit + predict + crosstab steps.
# time.clock() was removed in Python 3.8; use time.perf_counter() instead.
time_start = time.perf_counter()

# The squared-error loss is the default; the explicit alias 'ls' was removed in
# scikit-learn 1.2, so it is omitted to stay compatible with old and new versions.
gbr3 = ensemble.GradientBoostingRegressor(n_estimators=50, max_depth=5,
                                          random_state=2)

gbr3.fit(x_train, y_train)

gbr_pred3 = gbr3.predict(x_test)
# Cross-tabulate against the rounded predictions; the raw continuous outputs
# would produce one column per distinct predicted value.
gbr_comparison3 = pd.crosstab(y_test, gbr_pred3.round())

grd_time3 = time.perf_counter() - time_start
print("time to build a Gradient Boosting Model is:", grd_time3)
print("Test Accuracy: ", metrics.accuracy_score(y_test, gbr_pred3.round()))

In [None]:
# Per-class precision / recall / F1 / support for the rounded gradient boosting predictions
report = metrics.classification_report(y_true=y_test, y_pred=gbr_pred3.round())
print(report)

In [None]:
# Compute confusion matrix
# Round the regressor output to integer class codes, then use the sorted set of
# labels present in the truth or the predictions for both the matrix and the
# axis ticks (the previous hard-coded [1,2,3,4] omitted label 0).
gbr_labels3 = gbr_pred3.round().astype(int)
class_labels = np.union1d(y_test, gbr_labels3)
cnf_matrix = metrics.confusion_matrix(y_test, gbr_labels3, labels=class_labels)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_labels.tolist(),
                      title='Confusion matrix, without normalization')
plt.savefig('gbr_cm_wo_normal.png')
plt.show()

In [None]:
# Plot normalized confusion matrix
# Tick labels are the sorted labels present in the truth or the rounded
# predictions, which is the row/column order of the confusion matrix.
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=np.union1d(y_test, gbr_pred3.round().astype(int)).tolist(),
                      normalize=True,
                      title='Normalized confusion matrix')
plt.savefig('gbr_cm_w_normal.png')
plt.show()

### Deep Neural Network

In [None]:
# One-hot encode the train and test targets into 5-column matrices
y_train2, y_test2 = (
    to_categorical(labels, num_classes=5) for labels in (y_train, y_test)
)

In [None]:
# https://towardsdatascience.com/deep-learning-for-beginners-practical-guide-with-python-and-keras-d295bfca4487

# Time the model construction, training and prediction.
# time.clock() was removed in Python 3.8; use time.perf_counter() instead.
time_start = time.perf_counter()

# Four hidden layers of 128 ReLU units and a 5-way softmax output.
# The input width is taken from the training data instead of being hard-coded.
nn2 = Sequential()
nn2.add(Dense(128, input_dim=x_train.shape[1], activation='relu'))
nn2.add(Dense(128, activation='relu'))
nn2.add(Dense(128, activation='relu'))
nn2.add(Dense(128, activation='relu'))
nn2.add(Dense(5, activation='softmax'))
nn2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

nn2.fit(x_train, y_train2, batch_size=128, epochs=5)
# Sequential.predict_classes() was removed from Keras (TensorFlow 2.6); taking the
# argmax over the softmax probabilities yields the same class indices.
nn_pred2 = np.argmax(nn2.predict(x_test), axis=-1)

nn_time2 = time.perf_counter() - time_start
print("time to build a Neural Network Model is:", nn_time2)
print("Test Accuracy: ", metrics.accuracy_score(y_test, nn_pred2))

In [None]:
# Per-class precision / recall / F1 / support for the neural network predictions
report = metrics.classification_report(y_true=y_test, y_pred=nn_pred2)
print(report)

In [None]:
# Compute confusion matrix
# Use the sorted set of labels present in the truth or the predictions for both
# the matrix and the axis ticks (the previous hard-coded [1,2,3,4] omitted label 0).
class_labels = np.union1d(y_test, nn_pred2)
cnf_matrix = metrics.confusion_matrix(y_test, nn_pred2, labels=class_labels)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_labels.tolist(),
                      title='Confusion matrix, without normalization')
plt.savefig('nn_cm_wo_normal.png')
plt.show()

In [None]:
# Plot normalized confusion matrix
# Tick labels are the sorted labels present in the truth or the predictions,
# which is the row/column order of the confusion matrix.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=np.union1d(y_test, nn_pred2).tolist(), normalize=True,
                      title='Normalized confusion matrix')
plt.savefig('nn_cm_w_normal.png')
plt.show()