First attempt at pulling data from Kaggle and recoding categorical variables of interest into dummy variables.

In [3]:
%matplotlib inline

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Structure
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GMM

# Others
from datetime import datetime
import zipfile
import os.path





# Import Data

In [4]:
# Load the data and examine it
if os.path.isfile('./train.csv'):
    train_data = pd.read_csv('./train.csv')
else:
    z = zipfile.ZipFile('./train.csv.zip')
    train_data = pd.read_csv(z.open('train.csv'))


In [5]:
if os.path.isfile('./test.csv'):
    test_data = pd.read_csv('./test.csv')
else:
    z = zipfile.ZipFile('./test.csv.zip')
    test_data = pd.read_csv(z.open('test.csv'))

# Data Cleansing

In [7]:
print('Before data cleansing, the train data contain %d samples, the test data contain %d samples.' % \
      (train_data.shape[0], test_data.shape[0]))

# Drop samples containing null fields
train_data = train_data.dropna()
test_data = test_data.dropna()

# The boundaries of valid longitude and latitude
lon_lat_box = (-122.5247, -122.3366, 37.699, 37.8299)

# Drop samples containing invalide longitude and latitude
train_data = train_data[train_data.X > lon_lat_box[0]]
train_data = train_data[train_data.X < lon_lat_box[1]]
train_data = train_data[train_data.Y > lon_lat_box[2]]
train_data = train_data[train_data.Y < lon_lat_box[3]]

test_data = test_data[test_data.X > lon_lat_box[0]]
test_data = test_data[test_data.X < lon_lat_box[1]]
test_data = test_data[test_data.Y > lon_lat_box[2]]
test_data = test_data[test_data.Y < lon_lat_box[3]]

print('After data cleansing, the train data contain %d samples, the test data contain %d samples.' % \
      (train_data.shape[0], test_data.shape[0]))

Before data cleansing, the train data contain 877982 samples, the test data contain 884186 samples.
After data cleansing, the train data contain 877982 samples, the test data contain 884186 samples.


In [10]:
# Show all available crime lables.
print(train_data['Category'].value_counts())

# Show the percentage of the mode in all data. If the prediction of the model is worse than always predicting the mode,
# then we should always predict the mode in the baseline establishment.
print('The percentage of the LARCENY/THEFT is: ', train_data['Category'].value_counts()[0] * 1.0 / train_data.shape[0])

LARCENY/THEFT                  174885
OTHER OFFENSES                 126165
NON-CRIMINAL                    92300
ASSAULT                         76872
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53772
VANDALISM                       44724
WARRANTS                        42206
BURGLARY                        36754
SUSPICIOUS OCC                  31412
MISSING PERSON                  25989
ROBBERY                         22999
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7325
STOLEN PROPERTY                  4539
SEX OFFENSES FORCIBLE            4387
DISORDERLY CONDUCT               4318
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [11]:
# Convert the date into a python datetime object.
train_data["Dates"] = pd.to_datetime(train_data["Dates"], format="%Y-%m-%d %H:%M:%S")
test_data["Dates"] = pd.to_datetime(test_data["Dates"], format="%Y-%m-%d %H:%M:%S")

In [42]:
from sklearn.preprocessing import LabelEncoder

print(train_data.shape, test_data.shape)

#let's back stuff up
# Before we train the models, we need to divide the data into train data and dev data.
train1_data = train_data.copy()

# split to train / dev 
shuffle = np.random.permutation(np.arange(train1_data.shape[0]))
train1_data = train1_data.iloc[shuffle]
train1_labels = train1_data['Category']

train1_data["Hour"] = train_data["Dates"].apply(lambda x: x.hour)

# drop unnecessary stuff
train1_data = train1_data.drop(['Category','Address','Dates','Descript','Resolution'], axis=1)
    

# normalize X and Y
train1_data[['X','Y']] = scale(train1_data[['X','Y']])

# let's create integer values for each categories (akin to as.Factor() in R)
#train1_data['Descript'] = LabelEncoder().fit_transform(train1_data['Descript'])
#train1_data['Resolution'] = LabelEncoder().fit_transform(train1_data['Resolution'])
train1_data['DayOfWeek'] = LabelEncoder().fit_transform(train1_data['DayOfWeek'])
train1_data['PdDistrict'] = LabelEncoder().fit_transform(train1_data['PdDistrict'])

mini_train1_data = train1_data.iloc[:40000]
mini_train1_labels = train1_labels.iloc[:40000]


(877982, 9) (884186, 7)


In [43]:
pca_mod = PCA()
pca_mod.fit(train1_data)
train1_2d = pca_mod.transform(mini_train1_data)
print(train_data['Category'].unique)

<bound method Series.unique of 0                       WARRANTS
1                 OTHER OFFENSES
2                 OTHER OFFENSES
3                  LARCENY/THEFT
4                  LARCENY/THEFT
5                  LARCENY/THEFT
6                  VEHICLE THEFT
7                  VEHICLE THEFT
8                  LARCENY/THEFT
9                  LARCENY/THEFT
10                 LARCENY/THEFT
11                OTHER OFFENSES
12                     VANDALISM
13                 LARCENY/THEFT
14                  NON-CRIMINAL
15                  NON-CRIMINAL
16                       ROBBERY
17                       ASSAULT
18                OTHER OFFENSES
19                  NON-CRIMINAL
20                 LARCENY/THEFT
21                       ROBBERY
22                      WARRANTS
23                  NON-CRIMINAL
24                 LARCENY/THEFT
25                  NON-CRIMINAL
26                 LARCENY/THEFT
27                 LARCENY/THEFT
28                 LARCENY/THEFT
29          

In [44]:
for crime in train_data["Category"].unique():
    positive_indices = np.array(np.where(mini_train1_labels == crime))[0,:]
    train1_2d_positive = train1_2d[positive_indices,]
  

    # negative (no LARCENY nor THEFT) clusters
    negative_indices = np.array(np.where(mini_train1_labels != crime))[0,:]
    train1_2d_negative = train1_2d[negative_indices,]

    mini_dev_data = train1_data.iloc[40000:50000]
    mini_dev_labels = train1_labels.iloc[40000:50000]

    
    
    test_2d_data = pca_mod.transform(mini_dev_data)
    # positive clusters 
    positive_indices = np.array(np.where(mini_dev_labels == crime))[0,:]
    test_2d_data_positive = test_2d_data[positive_indices,]

    # negative  clusters
    negative_indices = np.array(np.where(mini_dev_labels != crime))[0,:]
    test_2d_data_negative = test_2d_data[negative_indices,]

    
    print("for crime {0:s}".format(crime))
    print("positive training sample size is {0:d}".format(train1_2d_positive.shape[0]))
      
    print("negative training sample size is {0:d}".format(train1_2d_negative.shape[0]))
      
    print("positive test sample size is {0:d}".format(test_2d_data_positive.shape[0]))
      
    print("negative test sample size is {0:d}".format(test_2d_data_negative.shape[0]))

    gmm_positive = GMM(n_components = 4, covariance_type = 'full' )
    gmm_positive.fit(train1_2d_positive)
    gmm_negative = GMM(n_components = 4, covariance_type = 'full' )
    gmm_negative.fit(train1_2d_negative)
    log_probas_positive = gmm_positive.score_samples(test_2d_data)
    log_probas_negative = gmm_negative.score_samples(test_2d_data)


    predicted_int_labels = np.greater(log_probas_positive, log_probas_negative).astype(int)
    mini_dev_int_labels = np.array(mini_dev_labels == crime, dtype = int)
    accuracy = np.sum(np.equal(mini_dev_int_labels, predicted_int_labels))/mini_dev_int_labels.shape[0]
    print("The accuracy is for {0:s}: {1:3f}".format(crime, accuracy))

for crime WARRANTS
positive training sample size is 1979
negative training sample size is 38021
positive test sample size is 462
negative test sample size is 9538
The accuracy is for WARRANTS: 0.642400
for crime OTHER OFFENSES
positive training sample size is 5744
negative training sample size is 34256
positive test sample size is 1436
negative test sample size is 8564
The accuracy is for OTHER OFFENSES: 0.462300
for crime LARCENY/THEFT
positive training sample size is 8001
negative training sample size is 31999
positive test sample size is 2005
negative test sample size is 7995
The accuracy is for LARCENY/THEFT: 0.582200
for crime VEHICLE THEFT
positive training sample size is 2436
negative training sample size is 37564
positive test sample size is 587
negative test sample size is 9413
The accuracy is for VEHICLE THEFT: 0.604500
for crime VANDALISM
positive training sample size is 2043
negative training sample size is 37957
positive test sample size is 509
negative test sample size is

ValueError: Expected n_samples >= n_components but got n_components = 4, n_samples = 2