In [83]:
%matplotlib inline

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [84]:
data_path = "data"
submissions_path = "submissions"
if not data_path or not submissions_path:
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    if date:
        return dateutil.parser.parse(date)
    return None

df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return reader

def get_header(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return header

def get_dataframe(file_name="train.csv"):
    return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)

    
def write_submission(file_name, predictions):
    writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
    writer.writerows(predictions)    

In [85]:
get_header()

['Dates',
 'Category',
 'Descript',
 'DayOfWeek',
 'PdDistrict',
 'Resolution',
 'Address',
 'X',
 'Y']

In [86]:
dataframe = get_dataframe()

In [87]:
number_of_categories = dataframe['Category'].nunique()
print number_of_categories

39


In [88]:
# for writing code, take a small sample so that it doesn't take forever to run
df = dataframe

In [89]:
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [90]:
# Split train data into train and dev 80:20, and separate into X,Y

# shuffle the data:
df = df.reindex(np.random.permutation(df.index))

# take 80% from the top:
upper = np.floor(len(df)*.8).astype(int)
train_data = df.head(n=upper)
train_labels = train_data['Category']
del train_data['Category']
del train_data['Descript']
del train_data['Resolution']




# take 20% from the bottom
lower = np.ceil(len(df)*.2).astype(int)
dev_data = df.tail(n=lower)
dev_labels = dev_data['Category']
del dev_data['Category']

testdf = get_dataframe("test.csv")
test_data = testdf.reindex(np.random.permutation(testdf.index))
print test_data.shape

(884262, 7)


In [91]:
class DatesToTimeTransformer(TransformerMixin):

    def transform(self, X, **transform_params):
        
        # numeric variables
        if('Dates' in X):
            X['Time'] = (X['Dates']).dt.hour
            del X['Dates']
            del X['Address']
        return  X

    def fit(self, X, y=None, **fit_params):
        return self

class CategoricalFeatureTransformer(TransformerMixin):

    def transform(self, X, **transform_params):
        
        # numeric variables
        numeric_cols = [ 'X', 'Y','Time']
        x_num_train = X[ numeric_cols ].as_matrix()
        cat_train = X.drop( [ 'X', 'Y','Time'], axis = 1 )
        cat_train.fillna( 'NA', inplace = True )
        x_cat_train = cat_train.T.to_dict().values()
        return  (x_num_train, x_cat_train)

    def fit(self, X, y=None, **fit_params):
        return self
    
class FeatureVecotrizerandCombiner(TransformerMixin):
    def __init__(self):
        self.vectorizer = DV( sparse = False )

    def transform(self, X,y=None, **transform_params):
        x_num_train, x_cat_train = X
        vec_x_cat_train = self.vectorizer.transform( x_cat_train )
        return np.hstack(( x_num_train, vec_x_cat_train ))

    def fit(self, X,y=None, **fit_params):
        x_num_train, x_cat_train = X
        vec_x_cat_train = self.vectorizer.fit_transform( x_cat_train )
        return  self

In [92]:
# run a classifier
p = Pipeline(steps=[("DateToTime",DatesToTimeTransformer()), \
                    ("CategoricalFeatureTransformer",CategoricalFeatureTransformer()),\
                    ("FeatureVecotrizerandCombiner",FeatureVecotrizerandCombiner()),\
                    ("RandomForestClassifier",RandomForestClassifier())])
classifier = RandomForestClassifier()
model = p.fit(train_data,train_labels)
predictions = model.predict(dev_data)
score = model.score(dev_data,dev_labels)    
print score

0.177598086669


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [93]:
print predictions

['OTHER OFFENSES' 'OTHER OFFENSES' 'SUSPICIOUS OCC' ..., 'MISSING PERSON'
 'FRAUD' 'VEHICLE THEFT']


In [94]:
# TODO: write the submission file.
predictions = model.predict_proba(test_data)
withId  = np.column_stack((map(str,test_data.index.tolist()),predictions))
towrite = np.row_stack((["Id"] + model.named_steps['RandomForestClassifier'].classes_.tolist(),withId))

print(towrite)
write_submission('submission_file.csv', towrite)

[['Id' 'ARSON' 'ASSAULT' ..., 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']
 ['273867' '0.0' '0.0' ..., '0.0' '0.0' '0.0']
 ['282641' '0.0' '0.1' ..., '0.0' '0.0' '0.0']
 ..., 
 ['737655' '0.0' '0.0' ..., '0.0' '0.6' '0.0']
 ['229260' '0.0' '0.1' ..., '0.0' '0.0' '0.0']
 ['355328' '0.0' '0.1' ..., '0.0' '0.0' '0.0']]
