In [12]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

#sklearn imports:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Evaluation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import scikitplot as skplt
from sklearn.metrics import classification_report

In [13]:
# Read in fresh cleaned dataset
df = pd.read_csv('chicago_data_cleaned.csv')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4273756 entries, 0 to 4273755
Data columns (total 15 columns):
Date                    object
ID                      int64
Block                   int64
Primary Type            int64
Description             int64
Location Description    int64
Arrest                  bool
Domestic                bool
District                float64
Year                    int64
Latitude                float64
Longitude               float64
Month                   int64
Day                     int64
Hour                    int64
dtypes: bool(2), float64(3), int64(9), object(1)
memory usage: 432.0+ MB


In [15]:
df.head(3)

Unnamed: 0,Date,ID,Block,Primary Type,Description,Location Description,Arrest,Domestic,District,Year,Latitude,Longitude,Month,Day,Hour
0,2006-04-02 13:00:00,4673626,23279,12,173,66,False,False,16.0,2006,41.981913,-87.771996,4,2,13
1,2006-02-26 13:40:48,4673627,26672,10,217,75,True,False,3.0,2006,41.775733,-87.61192,2,26,13
2,2006-01-08 23:16:00,4673628,6596,0,40,58,False,False,3.0,2006,41.769897,-87.593671,1,8,23


In [16]:
df_for_model = df.drop(['ID','Location Description', 'Latitude', 'Longitude', 'Date'], axis=1)

labels = df['Location Description']

In [17]:
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4273756 entries, 0 to 4273755
Data columns (total 10 columns):
Block           int64
Primary Type    int64
Description     int64
Arrest          bool
Domestic        bool
District        float64
Year            int64
Month           int64
Day             int64
Hour            int64
dtypes: bool(2), float64(1), int64(7)
memory usage: 269.0 MB


In [18]:
def evaluate_features(X, y, key=0):
    """
    General helper function for evaluating effectiveness of passed features in ML model

    Prints out Log loss, accuracy, and confusion matrix with 3-fold stratified cross-validation

    Parameters
    ----------
    X : Features array

    y : Labels array

    key: 0 = DecisionTreeClassifier (Default)
         1 = ExtraTreeClassifier
         2 = RandomForestClassifier
         3 = KNeighborsClassifier
         4 = GaussianNB
    """

    clf = [DecisionTreeClassifier(),
           ExtraTreeClassifier(),
           RandomForestClassifier(),
           KNeighborsClassifier(),
           GaussianNB()]
    

    probabilities = cross_val_predict(clf[key], X, y, cv=StratifiedKFold(n_splits=2, random_state=8),
                                      n_jobs=-1, method='predict_proba', verbose=2)
    predicted_indices = np.argmax(probabilities, axis=1)
    classes = np.unique(y)
    predicted = classes[predicted_indices]
    print('Log loss: {}'.format(log_loss(y, probabilities)))
    print('Accuracy: {}'.format(accuracy_score(y, predicted)))
    #skplt.metrics.plot_confusion_matrix(y, predicted, normalize=True, figsize=(20,10))
    
    print(classification_report(y, predicted))
    #plt.show()

## DecisionTreeClassifier:

In [19]:
evaluate_features(df_for_model, labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.0min finished


Log loss: 33.25500208494404
Accuracy: 0.03714999171688791
              precision    recall  f1-score   support

           0       0.01      0.04      0.01      8626
           1       0.00      0.05      0.00       541
           2       0.00      0.05      0.00       515
           3       0.00      0.16      0.00       498
           4       0.04      0.27      0.06      1225
           5       0.01      0.05      0.01       472
           6       0.07      0.35      0.12      3336
           7       0.00      0.40      0.01       631
           8       0.04      0.67      0.08      8948
           9       0.03      0.00      0.01     96996
          10       0.00      0.01      0.00       569
          11       0.05      0.01      0.01    481218
          12       0.00      0.00      0.00      1075
          13       0.04      0.09      0.05      5907
          14       0.01      0.02      0.01      6239
          15       0.09      0.18      0.12       623
          16       0.11

## ExtraTreeClassifier:

In [20]:
evaluate_features(df_for_model, labels, 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   37.1s finished


Log loss: 31.45233191228527
Accuracy: 0.08931464501015032
              precision    recall  f1-score   support

           0       0.01      0.04      0.01      8626
           1       0.00      0.04      0.00       541
           2       0.00      0.03      0.01       515
           3       0.00      0.08      0.00       498
           4       0.03      0.15      0.04      1225
           5       0.01      0.04      0.01       472
           6       0.08      0.44      0.13      3336
           7       0.01      0.41      0.02       631
           8       0.03      0.60      0.06      8948
           9       0.05      0.01      0.02     96996
          10       0.00      0.00      0.00       569
          11       0.23      0.12      0.16    481218
          12       0.01      0.01      0.01      1075
          13       0.03      0.08      0.05      5907
          14       0.05      0.11      0.07      6239
          15       0.03      0.14      0.05       623
          16       0.10

## RandomForestClassifier:

In [21]:
evaluate_features(df_for_model,labels,2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.9min finished


Log loss: 26.69653852821965
Accuracy: 0.03928651986683376
              precision    recall  f1-score   support

           0       0.01      0.03      0.01      8626
           1       0.00      0.06      0.00       541
           2       0.00      0.01      0.00       515
           3       0.00      0.15      0.01       498
           4       0.13      0.40      0.19      1225
           5       0.01      0.03      0.02       472
           6       0.11      0.48      0.18      3336
           7       0.02      0.52      0.04       631
           8       0.03      0.74      0.06      8948
           9       0.02      0.00      0.00     96996
          10       0.00      0.00      0.00       569
          11       0.06      0.01      0.02    481218
          12       0.01      0.00      0.00      1075
          13       0.03      0.05      0.04      5907
          14       0.03      0.05      0.04      6239
          15       0.03      0.07      0.04       623
          16       0.12

## KNeighborsClassifier:

In [22]:
evaluate_features(df_for_model,labels,3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


Log loss: 13.617796118775468
Accuracy: 0.34149937432085503
              precision    recall  f1-score   support

           0       0.01      0.02      0.01      8626
           1       0.05      0.11      0.07       541
           2       0.06      0.09      0.07       515
           3       0.10      0.15      0.12       498
           4       0.37      0.52      0.43      1225
           5       0.04      0.03      0.03       472
           6       0.60      0.75      0.67      3336
           7       0.35      0.58      0.44       631
           8       0.72      0.90      0.80      8948
           9       0.05      0.06      0.06     96996
          10       0.01      0.01      0.01       569
          11       0.28      0.41      0.34    481218
          12       0.02      0.01      0.01      1075
          13       0.23      0.35      0.28      5907
          14       0.16      0.20      0.18      6239
          15       0.05      0.02      0.03       623
          16       0.1

## GaussianNB:

In [23]:
evaluate_features(df_for_model,labels,4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.1min finished


Log loss: 4.847206239878137
Accuracy: 0.15960808244551164


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      8626
           1       0.00      0.00      0.00       541
           2       0.00      0.00      0.00       515
           3       0.00      0.00      0.00       498
           4       0.01      0.08      0.02      1225
           5       0.00      0.00      0.00       472
           6       0.02      0.38      0.04      3336
           7       0.01      0.13      0.01       631
           8       0.06      0.38      0.11      8948
           9       0.00      0.00      0.00     96996
          10       0.00      0.00      0.00       569
          11       0.20      0.16      0.18    481218
          12       0.00      0.00      0.00      1075
          13       0.00      0.00      0.00      5907
          14       0.09      0.02      0.03      6239
          15       0.11      1.00      0.20       623
          16       0.00      0.00      0.00     17069
          17       0.06    