In [1]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline  

matplotlib.style.use('ggplot')

import numpy as np
import pandas as pd
import random
import skflow
import tensorflow as tf

from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
def make_predictors(data):
    """Return a copy of ``data`` with the engineered model features added.

    The input frame must provide the columns ``DayOfWeek``, ``PdDistrict``,
    ``Dates``, ``X`` and ``Y``. The caller's frame is left untouched; all
    changes are made on a copy, which is returned.

    Columns added / changed on the returned frame:

    * ``Day_<value>`` / ``PdD_<value>`` -- indicator columns for every distinct
      value of ``DayOfWeek`` / ``PdDistrict`` (prefix = first three letters of
      the source column name).
    * ``PandasDates`` -- ``Dates`` parsed with ``pd.to_datetime``.
    * ``X``, ``Y`` -- replaced by the output of ``preprocessing.normalize``.
    * ``Year``, ``Month``, ``Day`` (day of the year), ``Hour``, ``Minute`` --
      components of ``PandasDates``.
    """
    # Work on a copy so that calling this function never alters the frame the
    # caller passed in (and re-running the cell stays idempotent).
    data = data.copy()

    for col in ['DayOfWeek', 'PdDistrict']:
        dummies = pd.get_dummies(data[col], prefix=col[0:3])
        # Column-by-column assignment works for both new and existing columns.
        for name in dummies.columns:
            data[name] = dummies[name]

    data['PandasDates'] = pd.to_datetime(data['Dates'])

    # NOTE(review): normalize(..., norm='l2') rescales each *row* (X, Y) to unit
    # length; it does not standardise the X and Y columns. Kept as-is so model
    # inputs do not change -- confirm this is the intended scaling.
    data[['X', 'Y']] = preprocessing.normalize(data[['X', 'Y']], norm='l2')

    stamps = data['PandasDates'].dt
    data['Year'] = stamps.year
    data['Month'] = stamps.month
    data['Day'] = stamps.dayofyear  # day of the year, not day of the month
    data['Hour'] = stamps.hour
    data['Minute'] = stamps.minute

    return data

In [3]:
# Read the training and test CSVs and push both through the same feature pipeline.
data, test_data = (
    make_predictors(pd.read_csv(csv_path))
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Inspect the column set of the processed training frame.
data.columns

Index([u'Dates', u'Category', u'Descript', u'DayOfWeek', u'PdDistrict',
       u'Resolution', u'Address', u'X', u'Y', u'Day_Friday', u'Day_Monday',
       u'Day_Saturday', u'Day_Sunday', u'Day_Thursday', u'Day_Tuesday',
       u'Day_Wednesday', u'PdD_BAYVIEW', u'PdD_CENTRAL', u'PdD_INGLESIDE',
       u'PdD_MISSION', u'PdD_NORTHERN', u'PdD_PARK', u'PdD_RICHMOND',
       u'PdD_SOUTHERN', u'PdD_TARAVAL', u'PdD_TENDERLOIN', u'PandasDates',
       u'Year', u'Month', u'Day', u'Hour', u'Minute'],
      dtype='object')

In [5]:
from sklearn.cross_validation import train_test_split
# Columns kept out of the feature matrix: the target ('Category'), the source
# columns that make_predictors already expanded into dummies / date parts, and
# the remaining columns listed here (presumably free text -- verify).
excluded_cols = set(['DayOfWeek', 'PandasDates', 'PdDistrict', 'Category',
                     'Address', 'Dates', 'Descript', 'Resolution'])

y = data['Category']
train_cols = [name for name in data.columns if name not in excluded_cols]
X = data[train_cols]

# NOTE(review): X_train / X_test are reassigned by the later 80/20 split on the
# one-hot target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, random_state=42)

In [6]:
# One indicator column per distinct value of the target series `y`.
y_OH = pd.get_dummies(y)

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
import h5py

def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1,
                init_mode='glorot_uniform',
                batch_norm=True,
                opt='adam'):
    """Build and compile a fully connected softmax classifier.

    The network is an input block followed by ``layers`` further hidden
    blocks (so ``layers + 1`` hidden blocks in total) and a softmax output.
    Each hidden block is Dense(hn) -> ReLU -> [BatchNormalization] -> Dropout(dp).

    Parameters
    ----------
    input_dim : number of input features, passed to the first Dense layer.
    output_dim : number of units in the softmax output layer.
    hn : units in every hidden Dense layer.
    dp : value passed to every Dropout layer.
    layers : number of hidden blocks added after the input block.
    init_mode : value passed as ``init`` to every Dense layer.
    batch_norm : when true, add BatchNormalization after each ReLU.
    opt : optimizer passed to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model (loss: categorical cross-entropy).
    """
    model = Sequential()

    def add_hidden_block(**dense_kwargs):
        # One hidden block; `dense_kwargs` carries input_dim for the first one.
        model.add(Dense(hn, init=init_mode, **dense_kwargs))
        model.add(Activation('relu'))
        if batch_norm:
            model.add(BatchNormalization())
        model.add(Dropout(dp))

    add_hidden_block(input_dim=input_dim)
    # `range` exists on both Python 2 and 3; the index itself is not needed.
    for _ in range(layers):
        add_hidden_block()

    model.add(Dense(output_dim, init=init_mode))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=opt)
    return model

Using TensorFlow backend.


In [None]:
# 80/20 split of the features against the one-hot target. This rebinds
# X_train / X_test, replacing the values from the earlier 60/40 split.
X_train, X_test, y_train_OH, y_test_OH = train_test_split(X, y_OH, train_size=0.8, random_state=42)

In [None]:
from keras.optimizers import Adam

# Layer sizes are read off the feature frame and the one-hot target frame.
input_dim = X_train.shape[1]
output_dim = y_test_OH.shape[1]

adam = Adam(3e-4)
model = build_model(
    input_dim,
    output_dim,
    hn=64,
    dp=0.25,
    layers=2,
    init_mode='glorot_normal',
    opt=adam,
)

# Keyword settings for the training run, gathered in one place.
fit_options = dict(
    nb_epoch=10,
    batch_size=16,
    validation_split=0.1,
    show_accuracy=True,
    verbose=True,
)
model.fit(X_train.as_matrix(), y_train_OH.as_matrix(), **fit_options)

Train on 632195 samples, validate on 70244 samples
Epoch 1/10
 46144/632195 [=>............................] - ETA: 1065s - loss: 2.7699 - acc: 0.2019

In [51]:
# Recompile the existing model with a much smaller Adam learning rate and
# train again, this time with a 20% validation split.
slow_adam = Adam(2e-6)
model.compile(optimizer=slow_adam, loss='categorical_crossentropy')

refit_options = dict(
    nb_epoch=10,
    batch_size=16,
    validation_split=0.2,
    show_accuracy=True,
    verbose=True,
)
model.fit(X_train.as_matrix(), y_train_OH.as_matrix(), **refit_options)

Train on 35121 samples, validate on 8781 samples
Epoch 1/10

KeyboardInterrupt: 

In [91]:
ntest=2000
score, acc = model.evaluate(X_test.as_matrix()[:ntest],
                       y_test_OH.as_matrix()[:ntest],
                       batch_size=16,
                       show_accuracy=True,
                       verbose=0)
print "Test Score: ", score
print "Test Accuracy: ", acc

Test Score:  2.60238179111
Test Accuracy:  0.2385


In [61]:
# Select the training feature columns from the processed test frame as an array.
# NOTE(review): presumably fails if test.csv yields a different set of dummy
# columns than train.csv -- confirm both files cover the same categories.
X_final_test = test_data[train_cols].as_matrix()

In [62]:
# Model output for every row of the submission matrix; the submission cell
# writes one value per label column from each row of `pred`.
pred = model.predict_proba(X_final_test, verbose=1)



In [73]:
labels = list(y_OH.columns)
print labels

['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [82]:
import csv
from copy import copy

In [90]:
# Write the submission file: a header row ('Id' + class labels), then one row
# per prediction holding the row index and the per-class values, where every
# value below that row's own `ntop`-th largest value is set to 0.
ntop = 10
with open('sf-nn.csv', 'w') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(['Id'] + labels)
    for i, p in enumerate(pred):
        # The cutoff must come from the current row `p`. The previous code
        # took it from pred[0], so every row was thresholded against the
        # first row's values.
        # min(...) keeps the index valid if a row has fewer than ntop entries.
        cutoff = sorted(p, reverse=True)[min(ntop, len(p)) - 1]
        new_p = p.copy()
        new_p[p < cutoff] = 0
        fo.writerow([i] + list(new_p))

In [None]:
# Save the model weights to 'SF_weights.h5', then load them back from the same file.
model.save_weights('SF_weights.h5')
model.load_weights('SF_weights.h5')