# Time of Day Prediction

In [7]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

#sklearn imports:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Evaluation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import scikitplot as skplt
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Import the dataset
df = pd.read_csv('boston-crime.csv')

In [9]:
# Reference: https://www.kaggle.com/getting-started/27270
# Numerically encoding features
from sklearn.preprocessing import LabelEncoder

df['Offense Type'] = LabelEncoder().fit_transform(df['Offense Type'])

df['Offense Description'] = LabelEncoder().fit_transform(df['Offense Description'])

df['District'] = LabelEncoder().fit_transform(df['District'].astype(str))
df['District'] = LabelEncoder().fit_transform(df['District'])

df['Reporting Area'] = LabelEncoder().fit_transform(df['Reporting Area'].astype(str))

df['Day of Week'] = LabelEncoder().fit_transform(df['Day of Week'])

df['UCR Offense Level'] = LabelEncoder().fit_transform(df['UCR Offense Level'].astype(str))
df['UCR Offense Level'] = LabelEncoder().fit_transform(df['UCR Offense Level'])

df['Street'] = LabelEncoder().fit_transform(df['Street'].astype(str))
df['Street'] = LabelEncoder().fit_transform(df['Street'])
df.head()

Unnamed: 0.1,Unnamed: 0,Incident Number,Offense Type,Offense Description,District,Reporting Area,Shooting,Date,Year,Month,Day of Week,Hour,UCR Offense Level,Street,Latitude,Longitude,Coordinates
0,0,I192060182,43,155,8,120,No,2019-08-02 21:59:00,2019,8,0,21,2,4060,42.352175,-71.049134,"(42.35217524, -71.04913425)"
1,1,I192060181,61,20,9,819,No,2019-08-02 21:15:00,2019,8,0,21,3,1434,42.338696,-71.071399,"(42.33869635, -71.07139879)"
2,2,I192060181,46,121,9,819,No,2019-08-02 21:15:00,2019,8,0,21,2,1434,42.338696,-71.071399,"(42.33869635, -71.07139879)"
3,3,I192060180,61,20,6,382,No,2019-08-02 20:44:00,2019,8,0,20,3,4139,42.295977,-71.07934,"(42.29597658, -71.07933990)"
4,4,I192060179,63,230,0,765,No,2019-08-02 20:51:00,2019,8,0,20,3,398,42.336267,-71.149503,"(42.33626664, -71.14950271)"


In [10]:
def evaluate_features(X, y, key=0):
    """
    General helper function for evaluating effectiveness of passed features in ML model

    Prints out Log loss, accuracy, and confusion matrix with 3-fold stratified cross-validation

    Parameters
    ----------
    X : Features array

    y : Labels array

    key: 0 = DecisionTreeClassifier (Default)
         1 = ExtraTreeClassifier
         2 = RandomForestClassifier
         3 = KNeighborsClassifier
         4 = GaussianNB
    """

    clf = [DecisionTreeClassifier(),
           ExtraTreeClassifier(),
           RandomForestClassifier(),
           KNeighborsClassifier(),
           GaussianNB()]
    
    clf_names = ['DecisionTreeClassifier',
                 'ExtraTreeClassifier',
                 'RandomForestClassifier', 
                 'KNeighborsClassifier',
                 'GaussianNB']

    probabilities = cross_val_predict(clf[key], X, y, cv=StratifiedKFold(n_splits=2, random_state=8),
                                      n_jobs=-1, method='predict_proba', verbose=2)
    predicted_indices = np.argmax(probabilities, axis=1)
    classes = np.unique(y)
    predicted = classes[predicted_indices]
    print('Log loss: {}'.format(log_loss(y, probabilities)))
    print('Accuracy: {}'.format(accuracy_score(y, predicted)))
    title = "Confusion Matrix for " + labels.name + " with " + clf_names[key]
    skplt.metrics.plot_confusion_matrix(y, predicted, normalize=True, figsize=(20,10), title=title)
    
    print(classification_report(y, predicted))
    plt.show()

In [11]:
df_for_model = df[['Offense Description', 'Day of Week',
                   'District', 'Reporting Area', 'Month',
                   'Year', 'Shooting', 'Offense Type',
                   'Street', 'UCR Offense Level']]
labels = df['Is Dark']

KeyError: 'Is Dark'

## Decision Tree Classifier

In [None]:
evaluate_features(df_for_model, labels)

## Extra Tree Classifier

In [None]:
evaluate_features(df_for_model, labels, 1)

## Random Forest Classifier

In [None]:
evaluate_features(df_for_model, labels, 2)

## K-Nearest Neighbor Classifier

In [None]:
evaluate_features(df_for_model, labels, 3)

## Gaussian NB Classifier

In [None]:
evaluate_features(df_for_model, labels, 4)