In [1]:
%matplotlib inline

import accelerate
#import numbapro
import mkl
#import iopro

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.mixture import GMM
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
data_path = "data"
submissions_path = "submissions"
if not data_path or not submissions_path:
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    if date:
        return dateutil.parser.parse(date)
    return None

df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return reader

def get_header(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return header

def get_dataframe(file_name="train.csv"):
    return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)

    
def write_submission(file_name, predictions):
    writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
    writer.writerows(predictions)    

In [3]:
TD = get_dataframe("test.csv")

In [4]:
print TD.head()

   Id               Dates DayOfWeek PdDistrict                   Address  \
0   0 2015-05-10 23:59:00    Sunday    BAYVIEW   2000 Block of THOMAS AV   
1   1 2015-05-10 23:51:00    Sunday    BAYVIEW        3RD ST / REVERE AV   
2   2 2015-05-10 23:50:00    Sunday   NORTHERN    2000 Block of GOUGH ST   
3   3 2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   
4   4 2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   

            X          Y  
0 -122.399588  37.735051  
1 -122.391523  37.732432  
2 -122.426002  37.792212  
3 -122.437394  37.721412  
4 -122.437394  37.721412  


In [5]:
test_data = TD.copy(deep=True)

In [6]:
dataframe = get_dataframe()

In [7]:
number_of_categories = dataframe['Category'].nunique()
category_names = sorted(dataframe['Category'].unique())
print number_of_categories
print category_names

39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [8]:
df = dataframe.copy(deep=True)
min_x, max_x = np.min(df['X']), np.max(df['X'])
min_y, max_y = np.min(df['Y']), np.max(df['Y'])
print (min_x, max_x,min_y, max_y)
# Remove outliers:
print max(df['Dates'])

df.loc[df['X'] > -122.3, 'X'] = -122.3
df.loc[df['Y'] > 37.8, 'Y'] = 37.8

grid_size = 10
grid_width = np.max(df['X']) - np.min(df['X'])
grid_height = np.max(df['Y']) - np.min(df['Y'])


x_interval = grid_width/grid_size
y_interval = grid_height/grid_size

min_x, max_x = np.min(df['X']), np.max(df['X'])
min_y, max_y = np.min(df['Y']), np.max(df['Y'])

df['region'] = (10*(np.ceil((df['X'] - min_x)/x_interval)) + (np.ceil((df['Y'] - min_y)/y_interval))).astype(int)

print (min_x, max_x,min_y, max_y)
#df.to_csv(os.path.join(data_path, 'modifiedTrain.csv'))
    
#df.head()

(-122.51364206429, -120.5, 37.707879022413501, 90.0)
2015-05-13 23:53:00
(-122.51364206429, -122.3, 37.707879022413501, 37.799999999999997)


In [9]:
# for writing code, take a small sample so that it doesn't take forever to run
#print dataframe.shape
#df = dataframe.sample(150000)
number_of_categories = df['Category'].nunique()
print number_of_categories
print df.shape

39
(878049, 10)


In [10]:

df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,region
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,58
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,58
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.8,60
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.8,60
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,47


In [11]:
df.loc[:,'Hour'] = (df.loc[:,'Dates']).dt.hour
df.loc[:,'Month'] = (df.loc[:,'Dates']).dt.month
df.loc[:,'Year'] = (df.loc[:,'Dates']).dt.year
df.loc[:,'Day'] = (df.loc[:,'Dates']).dt.day

# remove 2015 because it is incomplete
# df = df[df['Year'] != 2015]

In [12]:
# Split train data into train and dev 80:20, and separate into X,Y
# train_data = df
# shuffle the data:
df = df.reindex(np.random.permutation(df.index))

# take 80% from the top:
upper = np.floor(len(df)*.8).astype(int)
train_data = df.head(n=upper)
train_labels = train_data.loc[:,('Category')]


# take 20% from the bottom
lower = np.ceil(len(df)*.2).astype(int)
dev_data = df.tail(n=lower)
dev_labels = dev_data.loc[:,('Category')]


In [13]:
del train_data['Dates']
del train_data['Category']
del train_data['Descript']
del train_data['X']
del train_data['Y']
del train_data['Address']
del train_data['Resolution']


del dev_data['Dates']
del dev_data['Category']
del dev_data['Descript']
del dev_data['X']
del dev_data['Y']
del dev_data['Address']
del dev_data['Resolution']

In [14]:
dev_data.head()
train_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,region,Hour,Month,Year,Day
488257,Friday,SOUTHERN,58,19,6,2008,27
209292,Sunday,BAYVIEW,64,13,7,2012,29
54280,Saturday,INGLESIDE,54,0,8,2014,23
684665,Saturday,TENDERLOIN,59,20,8,2005,27
130970,Monday,CENTRAL,59,21,8,2013,19


In [15]:
test_data = TD.copy(deep=True)
# fix outliers:
min_x, max_x = np.min(test_data.loc[:,'X']), np.max(test_data.loc[:,'X'])
min_y, max_y = np.min(test_data.loc[:,'Y']), np.max(test_data.loc[:,'Y'])

print (min_x, max_x,min_y, max_y)
# df[df[['A']]<0] = 0
print max(test_data['Dates'])

test_data.loc[test_data['X'] > -122.3, 'X'] = -122.3
test_data.loc[test_data['Y'] > 37.8, 'Y'] = 37.8


grid_size = 10
grid_width = np.max(test_data.loc[:,'X']) - np.min(test_data.loc[:,'X'])
grid_height = np.max(test_data.loc[:,'Y']) - np.min(test_data.loc[:,'Y'])


x_interval = grid_width/grid_size
y_interval = grid_height/grid_size

min_x, max_x = np.min(test_data.loc[:,'X']), np.max(test_data.loc[:,'X'])
min_y, max_y = np.min(test_data.loc[:,'Y']), np.max(test_data.loc[:,'Y'])

print (min_x, max_x,min_y, max_y)

(-122.51364206429, -120.5, 37.707879022413501, 90.0)
2015-05-10 23:59:00
(-122.51364206429, -122.3, 37.707879022413501, 37.799999999999997)


In [16]:
test_data['region'] = (10*(np.ceil((test_data['X'] - min_x)/x_interval)) + (np.ceil((test_data['Y'] - min_y)/y_interval))).astype(int)
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,region
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,63
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,63
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,60
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,42
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,42


In [17]:
test_data['Hour'] = (test_data['Dates']).dt.hour
test_data['Month'] = (test_data['Dates']).dt.month
test_data['Year'] = (test_data['Dates']).dt.year
test_data['Day'] = (test_data['Dates']).dt.day
#test_data = test_data[test_data['Year'] != 2015]

del test_data['Dates']
del test_data['X']
del test_data['Y']
del test_data['Address']
del test_data['Id']

In [18]:
train_data.head()
dev_data.head()


Unnamed: 0,DayOfWeek,PdDistrict,region,Hour,Month,Year,Day
478829,Monday,INGLESIDE,55,22,8,2008,18
543061,Sunday,SOUTHERN,59,22,9,2007,9
576285,Wednesday,MISSION,57,13,3,2007,21
247007,Saturday,BAYVIEW,63,14,1,2012,14
383410,Saturday,SOUTHERN,58,9,12,2009,26


In [19]:
le = preprocessing.LabelEncoder()
days = train_data.loc[:,'DayOfWeek'].unique()
le.fit(days)
train_data.loc[:,'DayOfWeek'] = le.transform(train_data.loc[:,'DayOfWeek']) 

days = dev_data.loc[:,'DayOfWeek'].unique()
le.fit(days)
dev_data.loc[:,'DayOfWeek'] = le.transform(dev_data.loc[:,'DayOfWeek'])

days = test_data.loc[:,'DayOfWeek'].unique()
le.fit(days)
test_data.loc[:,'DayOfWeek'] = le.transform(test_data.loc[:,'DayOfWeek'])


district = train_data.loc[:,'PdDistrict'].unique()
le.fit(district)
train_data.loc[:,'PdDistrict'] = le.transform(train_data.loc[:,'PdDistrict']) 

district = dev_data.loc[:,'PdDistrict'].unique()
le.fit(district)
dev_data.loc[:,'PdDistrict'] = le.transform(dev_data.loc[:,'PdDistrict']) 

district = test_data.loc[:,'PdDistrict'].unique()
le.fit(district)
test_data.loc[:,'PdDistrict'] = le.transform(test_data.loc[:,'PdDistrict']) 


district = train_data.loc[:,'region'].unique()
le.fit(district)
train_data.loc[:,'region'] = le.transform(train_data.loc[:,'region']) 

district = dev_data.loc[:,'region'].unique()
le.fit(district)
dev_data.loc[:,'region'] = le.transform(dev_data.loc[:,'region']) 

district = test_data.loc[:,'region'].unique()
le.fit(district)
test_data.loc[:,'region'] = le.transform(test_data.loc[:,'region']) 





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [20]:
enc = OneHotEncoder()
train_data_onehot = enc.fit_transform(train_data) 
dev_data_onehot = enc.transform(dev_data)
test_data_onehot = enc.fit_transform(test_data)

# print train_data_onehot.toarray()
print train_data_onehot.shape
# print dev_data_onehot.toarray()
print dev_data_onehot.shape
#print test_data_onehot.toarray()
print test_data_onehot.shape
#print enc.n_values_

#print enc.feature_indices_

#print enc.transform([[0, 1, 1]]).toarray()

(702439, 165)
(175610, 165)
(884262, 165)


In [177]:
#cut out last feature from test data so it matches train data
#test_data_onehot = test_data_onehot[:,0:133]

In [115]:
print test_data_onehot.shape

(884262, 134)


### Add the X Y columns back in, but.. no need to do that since we made regions, so you ca skip this step

In [None]:
complete_train = np.hstack(( np.vstack(( tdX, tdY)).T, train_data_onehot.toarray() ))
complete_dev = np.hstack(( np.vstack(( ddX, ddY)).T, dev_data_onehot.toarray() ))
complete_test = np.hstack(( np.vstack(( ttdX, ttdY)).T, test_data_onehot.toarray() ))

print complete_test

In [None]:
classifier = RandomForestClassifier()
model = classifier.fit(train_data_onehot, train_labels)
print model.score(dev_data_onehot, dev_labels)

In [None]:
classifier = DecisionTreeClassifier()
model = classifier.fit(train_data_onehot, train_labels)
print model.score(dev_data_onehot, dev_labels)

In [180]:
classifier = LogisticRegression()
model = classifier.fit(train_data_onehot, train_labels)
print model.score(dev_data_onehot, dev_labels)

0.224839132168


In [179]:
classifier = BernoulliNB()
model = classifier.fit(train_data_onehot, train_labels)
print model.score(dev_data_onehot, dev_labels)

0.217755253118


In [178]:
classifier = MultinomialNB()
model = classifier.fit(train_data_onehot, train_labels)
print model.score(dev_data_onehot, dev_labels)

0.2213996925


In [None]:
0.232102539981

In [21]:
print train_data_onehot.shape
print test_data_onehot.shape

(702439, 165)
(884262, 165)


In [181]:
predictions = model.predict_proba(test_data_onehot)
withId  = np.column_stack((map(str,xrange(test_data_onehot.shape[0])),predictions))
towrite = np.row_stack((["Id"] + sorted(train_labels.unique()),withId))

print(towrite)
write_submission('submission_MultiNB.csv', towrite)

[['Id' 'ARSON' 'ASSAULT' ..., 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']
 ['0' '0.0045656606701' '0.135258880447' ..., '0.0796960747572'
  '0.0426359558471' '0.0259355499744']
 ['1' '0.0045656606701' '0.135258880447' ..., '0.0796960747572'
  '0.0426359558471' '0.0259355499744']
 ..., 
 ['884259' '0.00117203107684' '0.0961844755743' ..., '0.104080707252'
  '0.0331692220235' '0.00809171614975']
 ['884260' '0.00496019783078' '0.0943447349992' ..., '0.0537283843323'
  '0.0611544063119' '0.0119904625077']
 ['884261' '0.00129867572851' '0.0520495874234' ..., '0.0497441744272'
  '0.0286864430599' '0.00308922458776']]


In [212]:
print category_names
k = map(lambda x: abs(hash(x)),category_names)
len(np.unique(np.array(k)))

['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


39

In [70]:
y_train = map(lambda x: abs(hash(x)),train_labels) #np.array(train_labels)
#y_train = train_labels
X_train = np.array(train_data)
X_test = np.array(dev_data)
y_test = map(lambda x: abs(hash(x)),dev_labels)
#y_test = dev_labels

In [71]:
print np.array(y_train).shape
print X_train.shape
print np.array(y_test).shape
print X_test.shape

(702439,)
(702439, 7)
(175610,)
(175610, 7)


In [46]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD

In [76]:
model = Sequential()
model.add(Dense(1, init='uniform', input_dim=7))
model.add(Activation('softplus'))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy')

model.fit(X_train, y_train, nb_epoch=3, batch_size=16, verbose=1, show_accuracy=True)

score = model.evaluate(X_test, y_test, batch_size=16)

print score


Epoch 1/3

ValueError: I/O operation on closed file