In [1]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support as score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
import pickle

In [2]:
incidents1 = pd.read_csv('incidentData1.csv')
incidents2 = pd.read_csv('incidentData2.csv')

In [3]:
incidents1.shape, incidents2.shape

((136472, 36), (136173, 36))

In [4]:
# add column names to incidents2 file
incidents2.columns = incidents1.columns

In [5]:
# Stack the DataFrames on top of each other
incidents = pd.concat([incidents1, incidents2], axis=0)

In [6]:
incidents.head()

Unnamed: 0,(empty,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,...,point,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods,HSOC Zones as of 2018-06-05,OWED Public Spaces,Central Market/Tenderloin Boundary Polygon - Updated,Parks Alliance CPSI (27+TL sites),ESNCAG - Boundary File)
0,(00000,2019/08/15 11:41:00 AM,2019/08/15,11:41,2019,Thursday,2019/10/01 02:06:00 PM,85424006374,854240,196208089,...,,,,,,,,,,)
1,(00000,2019/09/17 10:00:00 PM,2019/09/17,22:00,2019,Tuesday,2019/10/02 10:01:00 PM,85426606374,854266,196208205,...,,,,,,,,,,)
2,(94121,2019/10/04 02:25:00 PM,2019/10/04,14:25,2019,Friday,2019/10/04 04:13:00 PM,85442603474,854426,190746203,...,POINT (-122.51129492624534 37.77507596005672),8.0,8.0,4.0,29.0,,,,,)
3,(94114,2019/10/03 07:30:00 PM,2019/10/03,19:30,2019,Thursday,2019/10/03 11:25:00 PM,85419706244,854197,190744514,...,POINT (-122.42746205880601 37.76877049785351),28.0,3.0,5.0,5.0,5.0,,,,)
4,(94121,2019/10/04 04:53:00 PM,2019/10/04,16:53,2019,Friday,2019/10/04 04:53:00 PM,85446351040,854463,190746532,...,POINT (-122.5030864538133 37.781176766186576),6.0,8.0,4.0,29.0,,,,,)


In [7]:
incidents.shape

(272645, 36)

In [8]:
extracted_df = incidents[['(empty', 'Incident Date', 'Incident Time', 'Incident Year', 'Incident Day of Week','Incident Category']]
extracted_df.head()

Unnamed: 0,(empty,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Category
0,(00000,2019/08/15,11:41,2019,Thursday,Larceny Theft
1,(00000,2019/09/17,22:00,2019,Tuesday,Larceny Theft
2,(94121,2019/10/04,14:25,2019,Friday,Robbery
3,(94114,2019/10/03,19:30,2019,Thursday,Larceny Theft
4,(94121,2019/10/04,16:53,2019,Friday,Non-Criminal


In [9]:
extracted_df.shape

(272645, 6)

In [10]:
extract_month = lambda x : x[5:7]
extract_date = lambda x : x[8:10]
extract_hour = lambda x : x[0:2]
extract_minute = lambda x : x[3:5]

In [11]:
extracted_df['month'] = extracted_df['Incident Date'].apply(extract_month)
extracted_df['date'] = extracted_df['Incident Date'].apply(extract_date)
extracted_df['hour'] = extracted_df['Incident Time'].apply(extract_hour)
extracted_df['minute'] = extracted_df['Incident Time'].apply(extract_minute)

In [12]:
# remove brace from zipcode
remove_bracket = lambda y : y[1:]
extracted_df['zipcode'] = extracted_df['(empty'].apply(remove_bracket)

In [13]:
def weighIncident(incidentType):
    if incidentType=="Non-Criminal" or incidentType=="Other" or incidentType=="Other Miscellaneous" or incidentType=="Case Closure":
        return 1
    elif incidentType=="Miscellaneous Investigation" or incidentType=="Fraud" or incidentType=="Forgery And Counterfeiting" or incidentType=="Warrant" or incidentType=="Traffic Violation Arrest" or incidentType=="Gambling" or incidentType=="Civil Sidewalks" or incidentType=="Courtesy Report":
        return 2
    elif incidentType=="Juvenile Offenses" or incidentType=="Lost Property" or incidentType=="Suspicious Occ" or incidentType=="Suspicious" or incidentType=="Vandalism" or incidentType=="Recovered Vehicle":
        return 3
    elif incidentType=="Vehicle Misplaced" or incidentType=="Vehicle Impounded":
        return 5
    elif incidentType=="Disorderly Conduct" or incidentType=="Traffic Collision" or incidentType=="Fire Report" or incidentType=="Weapons Carrying Etc":
        return 8
    elif incidentType=="Liquor Laws" or incidentType=="Drug Offense" or incidentType=="Drug Violation" or incidentType=="Embezzlement":
        return 10
    elif incidentType=="Motor Vehicle Theft" or incidentType=="Stolen Property" or incidentType=="Robbery" or incidentType=="Motor Vehicle Theft?" or incidentType=="Larceny Theft" or incidentType=="Burglary" or incidentType=="Malicious Mischief":
        return 14
    elif incidentType=="Prostitution" or incidentType=="Other Offenses":
        return 15
    elif incidentType=="Arson" or incidentType=="Offences Against The Family And Children" or incidentType=="Family Offense" or incidentType=="Missing Person" or incidentType=="Weapons Offense" or incidentType=="Weapons Offence":
        return 18
    elif incidentType=="Suicide" or incidentType=="Rape" or incidentType=="Assault" or incidentType=="Sex Offense" or incidentType=="Homicide" or incidentType=="Human Trafficking (A), Commercial Sex Acts" or incidentType=="Human Trafficking, Commercial Sex Acts":
        return 20
    else:
        return 0


In [14]:
extracted_df['weights'] = extracted_df['Incident Category'].apply(weighIncident)

In [15]:
extracted_df = extracted_df.drop(columns=['(empty','Incident Date', 'Incident Time','Incident Category'])

In [16]:
extracted_df.dropna(inplace=True)

In [17]:
extracted_df.drop(extracted_df[extracted_df.zipcode == '00000'].index, inplace=True)
extracted_df.drop(extracted_df[extracted_df.weights == 0].index, inplace=True)

In [18]:
extracted_df.head()

Unnamed: 0,Incident Year,Incident Day of Week,month,date,hour,minute,zipcode,weights
2,2019,Friday,10,4,14,25,94121,14
3,2019,Thursday,10,3,19,30,94114,14
4,2019,Friday,10,4,16,53,94121,1
5,2019,Wednesday,10,2,14,10,94133,14
6,2019,Thursday,10,3,23,30,94133,18


In [19]:
oneHotEncodedDF = pd.get_dummies(extracted_df, prefix="", prefix_sep="", columns=['Incident Day of Week'])
oneHotEncodedDF.head()

Unnamed: 0,Incident Year,month,date,hour,minute,zipcode,weights,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
2,2019,10,4,14,25,94121,14,1,0,0,0,0,0,0
3,2019,10,3,19,30,94114,14,0,0,0,0,1,0,0
4,2019,10,4,16,53,94121,1,1,0,0,0,0,0,0
5,2019,10,2,14,10,94133,14,0,0,0,0,0,0,1
6,2019,10,3,23,30,94133,18,0,0,0,0,1,0,0


In [20]:
oneHotEncodedDF.shape

(241816, 14)

In [21]:
def build_model():
    rf_estimator = RandomForestClassifier()
    # specify parameters for grid search
    parameters = { 
    'n_estimators': [5, 10, 20, 50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [5,10],
    'min_samples_split' : [2,5,10],
    'max_leaf_nodes' : [50, 60, 70]
    }

    # create grid search object
    cv = GridSearchCV(rf_estimator, param_grid=parameters)
    return cv

In [22]:
def features_and_label():
    y = oneHotEncodedDF['weights']
    X = oneHotEncodedDF.drop(columns=['weights'])
    return X, y

def display_results(cv, y_test, y_pred):
    l1 = list(np.unique(y_pred))
    l2 = list(np.unique(y_test))
    labels = list(set(l1) | set(l2)) 
    confusion_mat = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, fscore, support = score(y_test, y_pred)

    print("Labels:", labels, "\n")
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_mat)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(fscore))
    print('support: {}'.format(support))
    print()
    print("\nBest Parameters:", cv.best_params_)

X, y = features_and_label()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

model = build_model()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display_results(model, y_test, y_pred)

Labels: [1, 2, 3, 5, 8, 10, 14, 15, 18, 20] 

Accuracy: 0.4763693936386408
Confusion Matrix:
 [[   68     7     0     0     0     0 26280     0     2     0]
 [   39     8     0     0     0     0 17555     0     3     0]
 [   12     3     0     0     0     0 12806     0     0     0]
 [    0     0     0     0     0     0   203     0     0     0]
 [    4     1     0     0     0     0  4654     0     0     0]
 [   36     0     0     0     0     0  4858     0     4     0]
 [   64     6     0     0     0     0 80560     0     0     0]
 [    6     0     0     0     0     0  1691     0     0     0]
 [   11     2     0     0     0     0  9121     0     0     0]
 [   26     0     0     0     0     0 11242     0     0     0]]
precision: [0.2556391  0.2962963  0.         0.         0.         0.
 0.47677102 0.         0.         0.        ]
recall: [2.57995978e-03 4.54416359e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 9.99131837e-01 0.00000000e+00
 0.00000000e+00 0.00000000e+0

In [23]:
filename = 'final_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [25]:
def preprocessData(testDF):
    
    daysOfWeek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    extract_month = lambda x : x[5:7]
    extract_date = lambda x : x[8:10]
    extract_hour = lambda x : x[0:2]
    extract_minute = lambda x : x[3:5]
    
    testDF['month'] = testDF['Incident Date'].apply(extract_month)
    testDF['date'] = testDF['Incident Date'].apply(extract_date)
    testDF['hour'] = testDF['Incident Time'].apply(extract_hour)
    testDF['minute'] = testDF['Incident Time'].apply(extract_minute)
    
    testDF = testDF.drop(columns=['Incident Date', 'Incident Time'])
    testDF.dropna(inplace=True)
    testDF.drop(testDF[testDF.zipcode == '00000'].index, inplace=True)
    
    testOneHotEncodedDF = pd.get_dummies(testDF, prefix="", prefix_sep="", columns=['Incident Day of Week'])
    
    for day in daysOfWeek:
        if not day in list(testOneHotEncodedDF.columns):
            testOneHotEncodedDF[day] = 0
            
    return testOneHotEncodedDF

In [26]:
toBePredictedDF = pd.DataFrame([['2019', '2019/12/15', '12:00', '94121', 'Monday']])
toBePredictedDF.columns = ['Incident Year', 'Incident Date', 'Incident Time', 'zipcode', 'Incident Day of Week']

In [27]:
preprocessedDF = preprocessData(toBePredictedDF)

In [28]:
preprocessedDF.head()

Unnamed: 0,Incident Year,zipcode,month,date,hour,minute,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2019,94121,12,15,12,0,1,0,0,0,0,0,0


In [29]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
result = loaded_model.predict(preprocessedDF)
print(result)

[14]
