# SF Crime Classification


Project Setup

In [24]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *




In [13]:
Location = r'C:\Users\bhushanr\Downloads\data\train.csv'
Location1 = r'C:\Users\bhushanr\Downloads\data\test.csv'

train = pd.read_csv(Location)
test = pd.read_csv(Location1)
train.shape


(878049, 9)

In [15]:
#Feature extraction
def time_features(train):
    train['DateTime'] = pd.to_datetime(train['Dates'])
    train['Year'] = pd.DatetimeIndex(train['DateTime']).year
    train['Month'] = pd.DatetimeIndex(train['DateTime']).month
    train['Day'] = pd.DatetimeIndex(train['DateTime']).day
    train['Hour'] = pd.DatetimeIndex(train['DateTime']).hour
    
    months = pd.get_dummies(train.Month)
    daysofweek = pd.get_dummies(train.DayOfWeek)
    hours = pd.get_dummies(train.Hour)
    districts = pd.get_dummies(train.PdDistrict)
    train_mod = pd.concat([train, months, daysofweek, hours, districts], axis=1)
    return train_mod

train = time_features(train)
test = time_features(test)

print(train.columns.values)

['Dates' 'Category' 'Descript' 'DayOfWeek' 'PdDistrict' 'Resolution'
 'Address' 'X' 'Y' 'DateTime' 'Year' 'Month' 'Day' 'Hour' 1 2 3 4 5 6 7 8 9
 10 11 12 'Friday' 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday'
 'Wednesday' 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 'BAYVIEW' 'CENTRAL' 'INGLESIDE' 'MISSION' 'NORTHERN' 'PARK' 'RICHMOND'
 'SOUTHERN' 'TARAVAL' 'TENDERLOIN']


In [18]:
#Insert labels
labels = train.Category
#Drop Category, Resolution and Description columns as they aren't required for prediction
train_data = train.drop(['Category', 'Descript', 'Resolution'], axis=1)
train_names = train_data.columns.values.tolist()
test_names = test.columns.values.tolist()

In [23]:
#SHuffle the input 
shuffle = np.random.permutation(np.arange(train_data.shape[0]))
train_data = train_data.reindex(shuffle)
labels = labels.reindex(shuffle)
num_data = train_data.shape[0]

# Split the feature into train and dev data sets
mini_train_data = train_data[:5000]
mini_train_labels = labels[:5000]

train_data_new = train_data[5001:int(num_data/2)]
train_labels_new = labels[5001:int(num_data/2)]

dev_data = train_data[int(num_data/2) + 1:]
dev_labels = labels[int(num_data/2) + 1:]

test_data = test.copy()

print("Columns in use:", train_names)

Columns in use: ['Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'DateTime', 'Year', 'Month', 'Day', 'Hour', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']


In [31]:
features = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 
            'Wednesday', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
            'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 
            'TENDERLOIN']

BNB = BernoulliNB()
BNB.fit(train_data_new[features], train_labels_new)

predictions = BNB.predict_proba(test_data[features])
print("BernoulliNB accuracy:", BNB.score(dev_data[features], dev_labels))
print("Log Loss:", log_loss(dev_labels, BNB.predict_proba(dev_data[features])))

BernoulliNB accuracy: 0.217646415686
Log Loss: 2.61415312445
