First attempt at pulling data from Kaggle and recoding categorical variables of interest into dummy variables.

In [1]:
%matplotlib inline

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Structure
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GMM
from sklearn.preprocessing import LabelEncoder

# Others
from datetime import datetime
import zipfile
import os.path



# Import Data

In [2]:
# Load the training data, preferring an already-extracted CSV and falling back
# to the zipped download that sits next to the notebook.
if os.path.isfile('./train.csv'):
    train_data = pd.read_csv('./train.csv')
else:
    # Context managers close both the archive and the member stream as soon as
    # the frame has been read, so no file handle stays open in the kernel.
    with zipfile.ZipFile('./train.csv.zip') as archive:
        with archive.open('train.csv') as member:
            train_data = pd.read_csv(member)


In [3]:
# Load the test data the same way: plain CSV if present, otherwise read the
# member straight out of the zip archive.
if os.path.isfile('./test.csv'):
    test_data = pd.read_csv('./test.csv')
else:
    # Context managers close both the archive and the member stream as soon as
    # the frame has been read, so no file handle stays open in the kernel.
    with zipfile.ZipFile('./test.csv.zip') as archive:
        with archive.open('test.csv') as member:
            test_data = pd.read_csv(member)

# Data Cleansing

In [4]:
# Report the sample counts before any row is removed.
print('Before data cleansing, the train data contain %d samples, the test data contain %d samples.'
      % (train_data.shape[0], test_data.shape[0]))

# Discard every row that has at least one missing field.
train_data = train_data.dropna()
test_data = test_data.dropna()

# The boundaries of valid longitude and latitude, laid out as
# (lower X bound, upper X bound, lower Y bound, upper Y bound).
lon_lat_box = (-122.5247, -122.3366, 37.699, 37.8299)
x_low, x_high, y_low, y_high = lon_lat_box

# Drop samples containing invalid longitude and latitude: a row survives only
# when X and Y both lie strictly inside the box.
train_in_box = (
    (train_data.X > x_low) & (train_data.X < x_high)
    & (train_data.Y > y_low) & (train_data.Y < y_high)
)
train_data = train_data[train_in_box]

test_in_box = (
    (test_data.X > x_low) & (test_data.X < x_high)
    & (test_data.Y > y_low) & (test_data.Y < y_high)
)
test_data = test_data[test_in_box]

# Report the sample counts again so the effect of the cleansing is visible.
print('After data cleansing, the train data contain %d samples, the test data contain %d samples.'
      % (train_data.shape[0], test_data.shape[0]))

Before data cleansing, the train data contain 878049 samples, the test data contain 884262 samples.
After data cleansing, the train data contain 877982 samples, the test data contain 884186 samples.


In [5]:
# Show all available crime labels with their frequencies. value_counts() sorts
# in descending order, so the first entry is the most frequent category.
category_counts = train_data['Category'].value_counts()
print(category_counts)

# Show the share of the mode in all data. If the prediction of the model is worse than always predicting the mode,
# then we should always predict the mode in the baseline establishment.
# The index of category_counts holds category names, so the first entry has to
# be read positionally with .iloc; a bare [0] relies on the deprecated
# integer-as-position fallback of Series indexing.
# NOTE: the printed value is a fraction in [0, 1], not a number out of 100.
print('The percentage of the LARCENY/THEFT is: ', category_counts.iloc[0] * 1.0 / train_data.shape[0])

LARCENY/THEFT                  174885
OTHER OFFENSES                 126165
NON-CRIMINAL                    92300
ASSAULT                         76872
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53772
VANDALISM                       44724
WARRANTS                        42206
BURGLARY                        36754
SUSPICIOUS OCC                  31412
MISSING PERSON                  25989
ROBBERY                         22999
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7325
STOLEN PROPERTY                  4539
SEX OFFENSES FORCIBLE            4387
DISORDERLY CONDUCT               4318
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [6]:
# Parse the "Dates" strings of both frames into datetime values, using one
# shared format string for train and test.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
for frame in (train_data, test_data):
    frame["Dates"] = pd.to_datetime(frame["Dates"], format=DATE_FORMAT)

In [57]:
print(train_data.shape, test_data.shape)

# Sizes of the small train / dev subsets carved out of the shuffled data, and
# the seed that makes the shuffle (and therefore the split) repeatable.
MINI_TRAIN_SIZE = 40000
MINI_DEV_SIZE = 10000
SPLIT_SEED = 0

# Work on a copy so the cleaned frame stays available unchanged.
train1_data = train_data.copy()

# Shuffle the rows before splitting into train / dev. A dedicated seeded
# generator is used so that re-running the notebook yields the same split
# without touching numpy's global random state.
shuffle = np.random.RandomState(SPLIT_SEED).permutation(train1_data.shape[0])
train1_data = train1_data.iloc[shuffle]

# Encode the crime category as integer class labels (in shuffled row order).
crime_lable_encoder = LabelEncoder()
train1_labels = crime_lable_encoder.fit_transform(train1_data['Category'])

# Hour of day, read from the shuffled frame's own timestamps so the result
# does not depend on index alignment with the unshuffled frame.
# Requires "Dates" to have been converted to datetime already.
train1_data["Hour"] = train1_data["Dates"].dt.hour

# Drop the label column and the columns that are not used as features.
train1_data = train1_data.drop(['Category','Address','Dates','Descript','Resolution'], axis=1)

# Normalize X and Y.
train1_data[['X','Y']] = scale(train1_data[['X','Y']])

# Create integer values for each category (akin to as.Factor() in R).
train1_data['DayOfWeek'] = LabelEncoder().fit_transform(train1_data['DayOfWeek'])
train1_data['PdDistrict'] = LabelEncoder().fit_transform(train1_data['PdDistrict'])

# First MINI_TRAIN_SIZE shuffled rows train the models, the next MINI_DEV_SIZE
# rows are held out for evaluation.
dev_end = MINI_TRAIN_SIZE + MINI_DEV_SIZE
mini_train_data = train1_data.iloc[:MINI_TRAIN_SIZE]
mini_train_labels = train1_labels[:MINI_TRAIN_SIZE]
mini_dev_data = train1_data.iloc[MINI_TRAIN_SIZE:dev_end]
mini_dev_labels = train1_labels[MINI_TRAIN_SIZE:dev_end]

print(mini_train_data.shape, mini_dev_data.shape)

((877982, 9), (884186, 7))
((40000, 5), (10000, 5))


In [58]:
# Fit a PCA on the whole prepared frame, then project the mini train and dev
# subsets with it. No n_components is passed, and the printed shapes show five
# columns, so despite the `_2d` names nothing is reduced to two dimensions.
pca_mod = PCA().fit(train1_data)
train_2d, test_2d = (
    pca_mod.transform(subset) for subset in (mini_train_data, mini_dev_data)
)

print(train_2d.shape, test_2d.shape)

((40000, 5), (10000, 5))


In [None]:
# For every crime seen in the mini training set, fit one Gaussian mixture on
# the samples of that crime ("positive") and one on all other samples
# ("negative"), then report the one-versus-rest accuracy on the dev set.
# The positive mixtures are kept in gmm_models, keyed by the integer label.
N_COMPONENTS = 4   # mixture components per model
GMM_SEED = 0       # fixed seed so the fitted mixtures are repeatable

crimes = np.unique(mini_train_labels)
crime_labels = crime_lable_encoder.inverse_transform(crimes)
gmm_models = {}

for crime, crime_label in zip(crimes, crime_labels):
    is_positive = mini_train_labels == crime

    # Skip crimes with fewer positive samples than mixture components;
    # a mixture cannot be fitted on that few points.
    if np.count_nonzero(is_positive) < N_COMPONENTS:
        continue

    # Positive samples: this crime. Negative samples: every other crime.
    train_2d_positive = train_2d[is_positive]
    train_2d_negative = train_2d[~is_positive]

    gmm_positive = GMM(n_components=N_COMPONENTS, covariance_type='full',
                       random_state=GMM_SEED)
    gmm_positive.fit(train_2d_positive)
    gmm_models[crime] = gmm_positive

    gmm_negative = GMM(n_components=N_COMPONENTS, covariance_type='full',
                       random_state=GMM_SEED)
    gmm_negative.fit(train_2d_negative)

    # Per-sample log-likelihood of the dev set under each mixture.
    log_probas_positive = gmm_positive.score_samples(test_2d)
    log_probas_negative = gmm_negative.score_samples(test_2d)

    # A dev sample is predicted positive when the positive mixture explains it
    # better than the negative one. Class priors are not taken into account.
    predicted_int_labels = np.greater(log_probas_positive, log_probas_negative).astype(int)
    mini_dev_int_labels = np.array(mini_dev_labels == crime, dtype=int)
    accuracy = np.mean(mini_dev_int_labels == predicted_int_labels)
    print("The one-versus-rest accuracy for {0:s}: {1:3f}".format(crime_label, accuracy))

The one-versus-rest accuracy for ARSON: 0.802300
The one-versus-rest accuracy for ASSAULT: 0.557400
The one-versus-rest accuracy for BAD CHECKS: 0.951700
The one-versus-rest accuracy for BRIBERY: 0.999500
The one-versus-rest accuracy for BURGLARY: 0.535200
The one-versus-rest accuracy for DISORDERLY CONDUCT: 0.712300
The one-versus-rest accuracy for DRIVING UNDER THE INFLUENCE: 0.729500
The one-versus-rest accuracy for DRUG/NARCOTIC: 0.599800
The one-versus-rest accuracy for DRUNKENNESS: 0.718200
The one-versus-rest accuracy for EMBEZZLEMENT: 0.840100
The one-versus-rest accuracy for EXTORTION: 0.998600
The one-versus-rest accuracy for FAMILY OFFENSES: 0.998300
The one-versus-rest accuracy for FORGERY/COUNTERFEITING: 0.636500
The one-versus-rest accuracy for FRAUD: 0.664800
The one-versus-rest accuracy for GAMBLING: 0.999900
The one-versus-rest accuracy for KIDNAPPING: 0.772100
The one-versus-rest accuracy for LARCENY/THEFT: 0.584600
The one-versus-rest accuracy for LIQUOR LAWS: 0.7536

In [82]:
# Predict, for every dev sample, the crime whose positive mixture gives that
# sample the highest log-likelihood.
#
# Each model scores the whole dev set in a single score_samples call instead
# of one call per (sample, model) pair, and no Python-2-only xrange is needed.
model_crimes = list(gmm_models.keys())

if model_crimes:
    # score_matrix[i, j] = log-likelihood of dev sample i under model j.
    score_matrix = np.column_stack(
        [gmm_models[crime].score_samples(test_2d) for crime in model_crimes]
    )
    # argmax returns the first maximum, i.e. ties go to the model that comes
    # first in the dict's iteration order, as with a strict ">" scan.
    best_columns = np.argmax(score_matrix, axis=1)
    predicted_labels = [model_crimes[column] for column in best_columns]
else:
    # No model was fitted: keep the -1 placeholder for every sample.
    predicted_labels = [-1] * len(test_2d)

In [84]:
# Overall accuracy: the fraction of dev samples whose predicted crime equals
# the true encoded label.
matches = np.asarray(predicted_labels) == mini_dev_labels
accuracy = matches.mean()
print("The final accuracy is: {:3f}".format(accuracy))

The final accuracy is: 0.071200
