In [52]:
%matplotlib inline

import accelerate
#import numbapro
import mkl
#import iopro

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.cross_validation import StratifiedShuffleSplit

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.mixture import GMM
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin

from copy import deepcopy
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
# Locations (relative to the notebook's working directory) of the competition
# CSV files and of the generated submission files.
data_path = "data"
submissions_path = "submissions"

# Refuse to continue when either location has been left empty.
if not (data_path and submissions_path):
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    """Parse ``date`` with ``dateutil.parser.parse``; falsy input yields ``None``.

    Any falsy value (``None``, the empty string, ...) is returned as ``None``
    without being parsed; every other value is handed to
    ``dateutil.parser.parse`` and the parsed result is returned.
    """
    if not date:
        return None
    return dateutil.parser.parse(date)

# Per-column converters handed to read_csv by get_dataframe(): every value of
# the "Dates" column is passed through dateutil.parser.parse while loading.
df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    """Return a ``csv.reader`` over ``file_name`` positioned just after the header row.

    ``file_name`` is resolved relative to ``data_path``.  The underlying file
    object is intentionally left open: the reader pulls rows from it lazily, so
    it must stay open for as long as the caller keeps iterating.
    """
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    # Skip the header row.  The builtin next() works on Python 2.6+ and on
    # Python 3, unlike the Python-2-only ``reader.next()`` method.
    next(reader)
    return reader

def get_header(file_name="train.csv"):
    """Return the header row (a list of column names) of ``file_name``.

    ``file_name`` is resolved relative to ``data_path``.
    """
    # Only the first row is needed, so the file is closed as soon as it has
    # been read instead of leaking the handle.
    with open(os.path.join(data_path, file_name)) as csv_file:
        # Builtin next() works on Python 2.6+ and 3, unlike ``reader.next()``.
        return next(csv.reader(csv_file))

def get_dataframe(file_name="train.csv"):
    """Load ``file_name`` from ``data_path`` into a pandas DataFrame.

    The columns listed in ``df_converters`` are converted while parsing.
    """
    csv_path = os.path.join(data_path, file_name)
    return pd.read_csv(csv_path, converters=df_converters)

    
def write_submission(file_name, predictions):
    """Write ``predictions`` (an iterable of rows) as CSV to ``submissions_path/file_name``.

    An existing file of the same name is overwritten.  Rows are terminated by a
    single newline character.
    """
    # The context manager closes the file, which guarantees that buffered rows
    # reach the disk; previously the handle was never closed.
    with open(os.path.join(submissions_path, file_name), "w") as out_file:
        csv.writer(out_file, lineterminator="\n").writerows(predictions)

In [3]:
# Load the competition test set ("Dates" is parsed while reading) and preview it.
TestD = get_dataframe(file_name="test.csv")
TestD.head(5)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [111]:
# Work on a (deep, the default) copy so the frame as loaded stays available in TestD.
test_data = TestD.copy()

In [7]:
# Load the competition training set ("Dates" is parsed while reading) and preview it.
TrainD = get_dataframe(file_name="train.csv")
TrainD.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [112]:
# Work on a (deep, the default) copy so the frame as loaded stays available in TrainD.
train_data = TrainD.copy()

In [113]:
# The prediction target: distinct crime categories, in sorted order, and how many there are.
category_names = sorted(train_data['Category'].unique())
number_of_categories = train_data['Category'].nunique()
print(number_of_categories)
print(category_names)

39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [44]:
# Keep the target column around: 'Category' is deleted from train_data in the
# "REMOVE UNUSED COLUMNS" step, while the train/dev split and the classifiers
# further down need it as `labels`.  Without this assignment a fresh
# "Restart & Run All" stops with a NameError at the split.
# The labels are kept as plain strings; the conversion to a categorical is
# done later, in the neural-network section.
labels = train_data["Category"].copy()


# Features

### MAKE REGIONS FROM lat/long 

In [114]:
# Train data:

# Train data: bucket every incident into a cell of a grid laid over the (X, Y)
# coordinates and store the cell code in a new 'region' column.

min_x, max_x = train_data['X'].min(), train_data['X'].max()
min_y, max_y = train_data['Y'].min(), train_data['Y'].max()
print("Original max and min values")
print("---------------------------")
print((min_x, max_x, min_y, max_y))

# fix invalid values: cap both coordinates at a fixed upper bound.
train_data['X'] = train_data['X'].clip(upper=-122.3)
train_data['Y'] = train_data['Y'].clip(upper=37.8)

# Bounds of the capped coordinates and the size of one grid cell.
min_x, max_x = train_data['X'].min(), train_data['X'].max()
min_y, max_y = train_data['Y'].min(), train_data['Y'].max()

grid_size = 10
grid_width = max_x - min_x
grid_height = max_y - min_y
x_interval = grid_width / grid_size
y_interval = grid_height / grid_size

# Cell code = 10 * column + row, where column and row are the offsets from the
# minimum, rounded up to a whole number of cells.
# NOTE(review): ceil yields 0..10 on each axis, so codes can collide (column 0 /
# row 10 and column 1 / row 0 both give 10) -- confirm whether that is intended.
region_col = np.ceil((train_data['X'] - min_x) / x_interval)
region_row = np.ceil((train_data['Y'] - min_y) / y_interval)
train_data['region'] = (10 * region_col + region_row).astype(int)

print("New max and min values")
print("---------------------------")
print((min_x, max_x, min_y, max_y))

Original max and min values
---------------------------
(-122.51364206429, -120.5, 37.707879022413501, 90.0)
New max and min values
---------------------------
(-122.51364206429, -122.3, 37.707879022413501, 37.799999999999997)


In [115]:
# Test data:

# Test data: same grid bucketing as for the training frame, with the bounds
# taken from the test coordinates themselves.
# NOTE(review): the cells only line up with the training cells when both frames
# have the same capped min/max (the printed values agree) -- confirm for new data.
min_x, max_x = test_data['X'].min(), test_data['X'].max()
min_y, max_y = test_data['Y'].min(), test_data['Y'].max()

print("Original max and min values")
print("---------------------------")
print((min_x, max_x, min_y, max_y))

# Cap both coordinates at the same fixed upper bounds as the training frame.
test_data['X'] = test_data['X'].clip(upper=-122.3)
test_data['Y'] = test_data['Y'].clip(upper=37.8)

# Bounds of the capped coordinates and the size of one grid cell.
min_x, max_x = test_data['X'].min(), test_data['X'].max()
min_y, max_y = test_data['Y'].min(), test_data['Y'].max()

grid_size = 10
grid_width = max_x - min_x
grid_height = max_y - min_y
x_interval = grid_width / grid_size
y_interval = grid_height / grid_size

# Cell code = 10 * column + row (see the NOTE above about the shared bounds).
region_col = np.ceil((test_data['X'] - min_x) / x_interval)
region_row = np.ceil((test_data['Y'] - min_y) / y_interval)
test_data['region'] = (10 * region_col + region_row).astype(int)

print("New max and min values")
print("---------------------------")
print((min_x, max_x, min_y, max_y))

Original max and min values
---------------------------
(-122.51364206429, -120.5, 37.707879022413501, 90.0)
New max and min values
---------------------------
(-122.51364206429, -122.3, 37.707879022413501, 37.799999999999997)


### PARSE THE DATE

In [116]:
# Break the parsed timestamp into calendar components, one new column each.
train_dates = train_data['Dates']
for column, part in (('Hour', 'hour'), ('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    train_data[column] = getattr(train_dates.dt, part)

In [117]:
# Same calendar components for the test frame.
test_dates = test_data['Dates']
for column, part in (('Hour', 'hour'), ('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    test_data[column] = getattr(test_dates.dt, part)

### ADDRESSES

In [118]:
# train data
# For every address seen in the training set, build a vector of per-category
# log-odds plus the log-odds of the address itself.
# NOTE(review): these statistics are computed from the target of every training
# row, before the train/dev split is made -- dev scores may be optimistic.
addresses=sorted(train_data["Address"].unique())
categories=sorted(train_data["Category"].unique())

# Row counts per category, per (address, category) pair and per address.
C_counts=train_data.groupby(["Category"]).size()
A_C_counts=train_data.groupby(["Address","Category"]).size()

A_counts=train_data.groupby(["Address"]).size()

# address -> Series of per-category log-odds
logodds={}
# address -> log-odds of a row belonging to that address
logoddsPA={}

# An (address, category) pair needs strictly more than this many rows before
# its own estimate replaces the global default.
MIN_CAT_COUNTS=2

# Log-odds of each category over the whole training set.
default_logodds=np.log(C_counts/len(train_data))-np.log(1.0-C_counts/float(len(train_data)))

for addr in addresses:
    PA=A_counts[addr]/float(len(train_data))
    logoddsPA[addr]=np.log(PA)-np.log(1.-PA)
    # Start from the global values, then overwrite the well-supported categories.
    logodds[addr]=deepcopy(default_logodds)
    for cat in A_C_counts[addr].keys():
        # The second condition skips pairs where PA would be 1, i.e. log(1.0-PA) = log(0).
        if (A_C_counts[addr][cat]>MIN_CAT_COUNTS) and A_C_counts[addr][cat]<A_counts[addr]:
            PA=A_C_counts[addr][cat]/float(A_counts[addr])
            # NOTE(review): the copied Series is indexed by category name, so this
            # integer key presumably relies on positional fallback and on C_counts
            # being in the same order as `categories` -- confirm.
            logodds[addr][categories.index(cat)]=np.log(PA)-np.log(1.0-PA)
    logodds[addr]=pd.Series(logodds[addr])
    # Re-index by category position (0 .. len(categories)-1).
    logodds[addr].index=range(len(categories))
    
print "Creating logodds address features"
# Look up the log-odds vector of each row's address; the columns are then
# renamed to logodds0, logodds1, ...
address_features=train_data["Address"].apply(lambda x: logodds[x])
address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

Creating logodds address features


In [119]:
# Flag addresses that contain "/" (1 / 0) and attach the log-odds of each
# row's address.
train_addresses = train_data["Address"]
train_data["IsInterection"] = train_addresses.apply(lambda addr: int("/" in addr))
train_data["logoddsPA"] = train_addresses.apply(lambda addr: logoddsPA[addr])

In [120]:
# Append the per-category log-odds columns to the training frame (rows are
# matched on the index; only index values present in both are kept).
train_data = pd.concat(
    [train_data, address_features],
    axis=1,
    join='inner'
)

In [121]:
# Preview the training frame with the new feature columns.
train_data.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,region,...,logodds29,logodds30,logodds31,logodds32,logodds33,logodds34,logodds35,logodds36,logodds37,logodds38
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,58,...,-8.688077,-5.259591,-7.454398,-3.294016,-11.893691,-4.777894,-2.92489,-2.327278,-2.639057,-4.621396
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,58,...,-8.688077,-5.259591,-7.454398,-3.294016,-11.893691,-4.777894,-2.92489,-2.327278,-2.639057,-4.621396
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.8,60,...,-8.688077,-5.259591,-7.454398,-3.294016,-11.893691,-4.777894,-2.92489,-2.729575,-2.985679,-4.621396
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.8,60,...,-8.688077,-5.259591,-7.454398,-3.630985,-11.893691,-3.925268,-3.212187,-3.401197,-3.925268,-4.621396
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,47,...,-8.688077,-5.259591,-7.454398,-2.793208,-11.893691,-4.777894,-2.24071,-2.24071,-2.985679,-4.621396


In [122]:
# Preview the test frame with the region and date columns.
test_data.head(5)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,region,Hour,Month,Year,Day
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,63,23,5,2015,10
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,63,23,5,2015,10
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,60,23,5,2015,10
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,42,23,5,2015,10
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,42,23,5,2015,10


In [123]:
#test data

# Compare the address vocabulary of the test set with that of the training set.
new_addresses = sorted(test_data["Address"].unique())
new_A_counts = test_data.groupby("Address").size()

test_address_set = set(new_addresses)
train_address_set = set(addresses)

only_new = test_address_set - train_address_set    # seen only in the test set
only_old = train_address_set - test_address_set    # seen only in the training set
in_both = test_address_set & train_address_set
in_either = test_address_set | train_address_set

In [124]:
# Addresses that occur only in the test set have no category statistics of
# their own: they get the global per-category log-odds, and an address log-odds
# computed from their test-set count over the combined number of rows.
total_rows = float(len(test_data) + len(train_data))
for addr in only_new:
    PA = new_A_counts[addr] / total_rows
    logoddsPA[addr] = np.log(PA) - np.log(1. - PA)
    fallback = deepcopy(default_logodds)
    fallback.index = range(len(categories))
    logodds[addr] = fallback

In [127]:
# Addresses present in both sets: recompute the address log-odds from the
# combined train + test counts.  The per-category log-odds are left untouched.
# NOTE(review): the training 'logoddsPA' column is filled in an earlier cell
# from training counts only -- confirm that the mismatch is acceptable.
combined_rows = float(len(test_data) + len(train_data))
for addr in in_both:
    PA = (A_counts[addr] + new_A_counts[addr]) / combined_rows
    logoddsPA[addr] = np.log(PA) - np.log(1. - PA)
 

In [128]:
# Build the same address-based features for the test frame.
print("Creating logodds address features")
test_addresses = test_data["Address"]
test_address_features = test_addresses.apply(lambda addr: logodds[addr])
test_address_features.columns = [
    "logodds" + str(position)
    for position in range(len(test_address_features.columns))
]

test_data["IsInterection"] = test_addresses.apply(lambda addr: int("/" in addr))
test_data["logoddsPA"] = test_addresses.apply(lambda addr: logoddsPA[addr])

test_data = pd.concat([test_data, test_address_features], axis=1, join='inner')

Creating logodds address features


### REMOVE UNUSED COLUMNS

In [129]:
# Drop the raw columns from both frames.  Note that 'Category' (the target) is
# removed from the training frame here as well.
for column in ['Dates', 'Descript', 'X', 'Y', 'Address', 'Resolution', 'Category']:
    del train_data[column]

for column in ['Dates', 'X', 'Y', 'Address', 'Id']:
    del test_data[column]

### ENCODE CATEGORICAL COLUMNS AS INTEGER LABELS

In [130]:
# Replace the day-of-week, district and region values by integer labels.
# For each column the encoder is fitted ONCE, on the values of both frames, so
# a given value always maps to the same integer in train and test.  Fitting it
# separately on each frame assigns different integers to the same value as soon
# as the two frames do not contain exactly the same set of values (which can
# happen for 'region' when a grid cell is occupied in only one of the frames).
le = preprocessing.LabelEncoder()
for column in ['DayOfWeek', 'PdDistrict', 'region']:
    le.fit(pd.concat([train_data[column], test_data[column]]).unique())
    train_data[column] = le.transform(train_data[column])
    test_data[column] = le.transform(test_data[column])

In [None]:
# Disabled one-hot-encoding experiment, kept as a bare string literal so that
# none of the statements inside it are executed.
'''
# NOT USING THIS - Scale instead.
enc = OneHotEncoder()
train_data_onehot = enc.fit_transform(train_data) 
test_data_onehot = enc.fit_transform(test_data)

# print train_data_onehot.toarray()
print train_data_onehot.shape

#print test_data_onehot.toarray()
print test_data_onehot.shape

#print enc.n_values_
#print enc.feature_indices_
'''

### Scale the data

In [131]:
# Standardise every feature column with the statistics learned on the training frame.
collist = train_data.columns.tolist()
scaler = preprocessing.StandardScaler().fit(train_data)
train_data[collist] = scaler.transform(train_data)

collisttest = test_data.columns.tolist()
# NOTE(review): scalertest is fitted on the test frame but is not used in this
# cell; the test features are transformed with the training scaler.
scalertest = preprocessing.StandardScaler().fit(test_data)
test_data[collisttest] = scaler.transform(test_data)

# Make train and dev sets

In [132]:
# Hold out a stratified dev set.
# * random_state pins the shuffle, so the split and every score computed from
#   it are reproducible across kernel restarts.
# * n_iter=1 because only one split is used; previously the default number of
#   random splits was generated and all but the last were discarded.
# NOTE(review): test_size is left at the class default, so the dev set is
# presumably not the complement of the 50% training part -- confirm.
sss = StratifiedShuffleSplit(labels, n_iter=1, train_size=0.5, random_state=0)
train_index, dev_index = next(iter(sss))

# The split yields row positions, so features and labels are both selected by
# position and then given a fresh 0..n-1 index.
X_train = train_data.iloc[train_index].reset_index(drop=True)
X_dev = train_data.iloc[dev_index].reset_index(drop=True)
y_train = labels.iloc[train_index].reset_index(drop=True)
y_dev = labels.iloc[dev_index].reset_index(drop=True)

train_data.index = range(len(train_data))
labels.index = range(len(labels))


# Classifiers

In [None]:
# Baseline: random forest with default hyper-parameters, scored on the dev set.
classifier = RandomForestClassifier()
model = classifier.fit(X=X_train, y=y_train)
print(model.score(X=X_dev, y=y_dev))

In [None]:
# Baseline: single decision tree with default hyper-parameters, scored on the dev set.
classifier = DecisionTreeClassifier()
model = classifier.fit(X=X_train, y=y_train)
print(model.score(X=X_dev, y=y_dev))

In [32]:
# Baseline: logistic regression with default hyper-parameters, scored on the dev set.
classifier = LogisticRegression()
model = classifier.fit(X=X_train, y=y_train)
print(model.score(X=X_dev, y=y_dev))

0.202750166276


In [None]:
# Baseline: Bernoulli naive Bayes with default hyper-parameters, scored on the dev set.
classifier = BernoulliNB()
model = classifier.fit(X=X_train, y=y_train)
print(model.score(X=X_dev, y=y_dev))

In [None]:
# MultinomialNB only accepts non-negative features, but the features were
# standardised above and therefore contain negative values, so fitting it
# directly fails with a ValueError.  Rescale the features to [0, 1] inside a
# pipeline; the pipeline exposes the same fit / score / predict_proba methods,
# so the cells that use `model` keep working.
classifier = Pipeline([
    ('nonneg', preprocessing.MinMaxScaler()),
    ('nb', MultinomialNB()),
])
model = classifier.fit(X_train, y_train)
print(model.score(X_dev, y_dev))

# Predict and write file

In [None]:
# Class probabilities for every test row, written out as one "Id" column
# followed by one column per class.
# NOTE(review): the header assumes sorted(y_train.unique()) matches the column
# order of predict_proba and that y_train still holds the original category
# names (it is re-coded in the neural-network section) -- confirm.
predictions = model.predict_proba(test_data)
row_ids = map(str, xrange(test_data.shape[0]))
withId = np.column_stack((row_ids, predictions))
header_row = ["Id"] + sorted(y_train.unique())
towrite = np.row_stack((header_row, withId))

print(towrite)
write_submission('submission_MultiNB.csv', towrite)

# NEURAL NETWORK

In [133]:
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils


In [134]:
# Encode the targets as integer class codes for the network.
# The codes are defined once, from the training labels, and reused for the dev
# labels.  Encoding each split on its own shifts the dev codes whenever a rare
# category is missing from the dev split, so that the same integer would mean
# different categories in y_train and y_dev.
y_train = y_train.astype('category')
label_categories = list(y_train.cat.categories)
label_codes = list(range(len(label_categories)))
y_train = y_train.cat.rename_categories(label_codes)

print(y_train.shape)
print(X_train.shape)

# A dev label that never occurs in y_train becomes missing (NaN) here rather
# than silently receiving some other category's code.
y_dev = y_dev.astype('category').cat.set_categories(label_categories)
y_dev = y_dev.cat.rename_categories(label_codes)


(439024,)
(439024, 48)


In [None]:
# Network size and training settings.
# 39 matches the number of crime categories printed earlier in the notebook.
output_dim = 39
# One input unit per feature column of X_train.
input_dim = X_train.shape[1]
# Number of extra Dense/PReLU/BatchNormalization/Dropout blocks added by the loop below.
Nlayers = 1
# Number of training epochs passed to model.fit.
Nepoch = 20
# Dropout rate passed to every Dropout layer.
dp = 0.5



# Input block: Dense -> PReLU -> Dropout.
# NOTE(review): every Dense layer, hidden ones included, has output_dim units,
# and input_dim / input_shape are also passed to layers that are not the first
# one -- presumably ignored there by Keras; confirm against the installed version.
model = Sequential()
model.add(Dense(input_dim=input_dim, output_dim=output_dim, init='glorot_uniform'))
model.add(PReLU(input_shape=(input_dim,)))
model.add(Dropout(dp))

# Hidden blocks: Dense -> PReLU -> BatchNormalization -> Dropout.
for i in range(Nlayers):
    model.add(Dense(input_dim=input_dim, output_dim=output_dim,init='glorot_uniform'))
    model.add(PReLU(input_shape=(input_dim,)))
    model.add(BatchNormalization())
    model.add(Dropout(dp))

# Output block: Dense followed by a softmax over the output_dim classes.
model.add(Dense(input_dim=input_dim, output_dim=output_dim,init='glorot_uniform'))
model.add(Activation('softmax'))
# The sparse loss takes integer class codes as targets rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    

#model.fit(train_data_onehot.toarray(), y_train, nb_epoch=Nepoch, batch_size=32)
# Train on the feature matrix of the training split.
model.fit(X_train.as_matrix() , y_train, nb_epoch=Nepoch, batch_size=32)

Epoch 1/20
Epoch 2/20