In [4]:
%matplotlib inline

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing. 
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [5]:
data_path = "data"
submissions_path = "submissions"
if not data_path or not submissions_path:
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    if date:
        return dateutil.parser.parse(date)
    return None

df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return reader

def get_header(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return header

def get_dataframe(file_name="train.csv"):
    return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)
    
def write_submission(file_name, predictions):
    writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
    writer.writerows(predictions)    

In [6]:
get_header()

['Dates',
 'Category',
 'Descript',
 'DayOfWeek',
 'PdDistrict',
 'Resolution',
 'Address',
 'X',
 'Y']

In [87]:
dataframe = get_dataframe()

In [30]:
number_of_categories = dataframe['Category'].nunique()
print number_of_categories

39


In [136]:
# for writing code, take a small sample so that it doesn't take forever to run
df = dataframe.sample(n=1000)
# df = dataframe # this is not a copy. 

df['Time'] = (dataframe['Dates']).dt.hour

del df['Dates']
del df['Address']


KeyError: 'Dates'

In [130]:
df.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y,Time
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.424363,37.800414,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,-122.426995,37.800873,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,-122.438738,37.771541,23


In [131]:
# Split train data into train and dev 80:20, and separate into X,Y

# shuffle the data:
df = df.reindex(np.random.permutation(df.index))

# take 80% from the top:
upper = np.floor(len(df)*.8).astype(int)
train_data = df.head(n=upper)
train_labels = train_data['Category']
del train_data['Category']

# take 20% from the bottom
lower = np.ceil(len(df)*.2).astype(int)
dev_data = df.tail(n=lower)
dev_labels = dev_data['Category']
del dev_data['Category']



In [132]:

# numeric variables
numeric_cols = [ 'X', 'Y','Time']
x_num_train = train_data[ numeric_cols ].as_matrix()
x_num_dev = dev_data[ numeric_cols ].as_matrix()

# categorical variables
cat_train = train_data.drop( [ 'X', 'Y','Time'], axis = 1 )
cat_dev = dev_data.drop( [ 'X', 'Y','Time'], axis = 1 )


cat_train.fillna( 'NA', inplace = True )
cat_dev.fillna( 'NA', inplace = True )


x_cat_train = cat_train.T.to_dict().values()
x_cat_dev = cat_dev.T.to_dict().values()

# vectorize
vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform( x_cat_train )
vec_x_cat_dev = vectorizer.transform( x_cat_dev )


In [133]:
# re-combine the categorical and numeric variables
x_train = np.hstack(( x_num_train, vec_x_cat_train ))
x_dev = np.hstack(( x_num_dev, vec_x_cat_dev ))

In [134]:
# run a classifier
classifier = RandomForestClassifier()
model = classifier.fit(x_train,train_labels)
predictions = model.predict(x_dev)

score = model.score(x_dev,dev_labels)
print score

0.192796537783


In [135]:
print predictions

['OTHER OFFENSES' 'LARCENY/THEFT' 'LARCENY/THEFT' ..., 'LARCENY/THEFT'
 'OTHER OFFENSES' 'LARCENY/THEFT']


In [137]:
# TODO: write the submission file.
write_submission('submission_file.csv', predictions)