In [None]:
%matplotlib inline

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.mixture import GMM
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [None]:
# Directories, relative to the notebook, that hold the input CSV files and
# receive the generated submission files.
data_path = "data"
submissions_path = "submissions"
# Fail fast if either location has been blanked out.
if not (data_path and submissions_path):
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    """Parse ``date`` with ``dateutil``; any falsy value (``None``, ``""``) yields ``None``."""
    return dateutil.parser.parse(date) if date else None

# Per-column converters handed to the CSV loader: "Dates" cells are parsed by dateutil.
df_converters = dict(Dates=dateutil.parser.parse)

def get_reader(file_name="train.csv"):
    """Return a ``csv.reader`` over ``file_name`` positioned just past the header row.

    Args:
        file_name: Name of a CSV file inside ``data_path``.

    Returns:
        A ``csv.reader`` whose next item is the first data row. For an empty
        file the reader is simply exhausted.
    """
    # The handle is intentionally left open: the returned reader pulls rows
    # from it lazily, so closing it here would break iteration by the caller.
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    # Built-in next() works on Python 2.6+ and Python 3, unlike the
    # Python-2-only reader.next(); the default keeps an empty file from raising.
    next(reader, None)
    return reader

def get_header(file_name="train.csv"):
    """Return the header row of ``file_name`` as a list of column names.

    Args:
        file_name: Name of a CSV file inside ``data_path``.

    Returns:
        The first row of the file as a list of strings.

    Raises:
        StopIteration: If the file contains no rows.
    """
    # Only the first row is needed, so the file is closed as soon as it is read
    # instead of leaking the handle.
    with open(os.path.join(data_path, file_name)) as csv_file:
        # Built-in next() replaces the Python-2-only reader.next().
        return next(csv.reader(csv_file))

def get_dataframe(file_name="train.csv"):
    """Load ``file_name`` from ``data_path`` into a DataFrame, applying ``df_converters``."""
    csv_path = os.path.join(data_path, file_name)
    return pd.read_csv(csv_path, converters=df_converters)

    
def write_submission(file_name, predictions):
    """Write ``predictions`` as CSV rows to ``file_name`` inside ``submissions_path``.

    Args:
        file_name: Name of the output file; an existing file is overwritten.
        predictions: Iterable of rows, each row an iterable of values.
    """
    # The context manager closes (and therefore flushes) the file even if
    # writing fails part-way; previously the handle was never closed.
    with open(os.path.join(submissions_path, file_name), "w") as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        writer.writerows(predictions)

In [None]:
# Show the header row (column names) of the default file, train.csv.
get_header()

In [None]:
# Load train.csv into a DataFrame; the "Dates" column goes through the dateutil converter.
dataframe = get_dataframe()

In [None]:
# Summarise the target column: how many distinct values it has and what they are.
category_column = dataframe['Category']
number_of_categories = category_column.nunique()
category_names = sorted(category_column.unique())
print(number_of_categories)
print(category_names)

In [None]:
# While developing, work on a random subset so the notebook runs quickly.
# A fixed random_state makes the subset (and everything derived from it)
# reproducible across kernel restarts, and capping the size avoids the
# ValueError that sample() raises when asked for more rows than the frame has.
print(dataframe.shape)
sample_size = min(150000, len(dataframe))
df = dataframe.sample(n=sample_size, random_state=0)
number_of_categories = df['Category'].nunique()
print(number_of_categories)


In [None]:
# Preview the first rows of the sampled frame.
df.head()

In [None]:
# Split the sample 80:20 into train and dev sets, and derive the labels and the
# calendar features (Hour, Month, Year) for each.

def _with_date_parts(frame):
    """Return an independent copy of ``frame`` with Hour, Month and Year columns added.

    Copying first means the new columns are written to a standalone frame
    rather than to a slice of ``df``, which is what triggers pandas'
    SettingWithCopyWarning.
    """
    frame = frame.copy()
    dates = frame['Dates']
    frame['Hour'] = dates.dt.hour
    frame['Month'] = dates.dt.month
    frame['Year'] = dates.dt.year
    return frame

# Shuffle the rows; the seeded generator keeps the split reproducible.
df = df.reindex(np.random.RandomState(0).permutation(df.index))

# A single cut point guarantees that train and dev are disjoint and together
# cover every row, which separate floor/ceil computations do not.
upper = int(len(df) * .8)
lower = len(df) - upper

# First 80% of the shuffled rows.
train_data = _with_date_parts(df.iloc[:upper])
train_labels = train_data['Category']

# Remaining 20%.
dev_data = _with_date_parts(df.iloc[upper:])
dev_labels = dev_data['Category']

In [None]:
# Number of distinct Category values present in the training split.
train_data['Category'].nunique()

In [None]:
# Keep the X and Y columns of the train (td*) and dev (dd*) splits under their
# own names: the columns are deleted from the frames further down and stacked
# back next to the one-hot features when the final matrices are assembled.
tdX, tdY = train_data['X'], train_data['Y']
ddX, ddY = dev_data['X'], dev_data['Y']

In [None]:
# Delete the raw columns from both splits, in place, train first and then dev.
unused_columns = ['Dates', 'Category', 'Descript', 'X', 'Y', 'Address', 'Resolution']
for split in (train_data, dev_data):
    for column_name in unused_columns:
        del split[column_name]

In [None]:
# NOTE(review): a notebook cell renders only its last expression, so the
# dev_data preview on the next line is computed but never shown; only the
# train_data preview appears in the output.
dev_data.head()
train_data.head()

In [None]:
# Read test.csv through the same loader and print its first rows as plain text.
test_data = get_dataframe(file_name="test.csv")
print(test_data.head())

In [None]:
# Keep the test set's X and Y columns under their own names; the columns are
# deleted from test_data below and stacked back when complete_test is built.
ttdX, ttdY = test_data['X'], test_data['Y']


In [None]:
# Derive Hour, Month and Year from the timestamp column, then delete the raw
# columns from test_data in place.
test_dates = test_data['Dates']
test_data['Hour'] = test_dates.dt.hour
test_data['Month'] = test_dates.dt.month
test_data['Year'] = test_dates.dt.year
for column_name in ('Dates', 'X', 'Y', 'Address', 'Id'):
    del test_data[column_name]

In [None]:
# Preview test_data after the date parts were added and the raw columns removed.
test_data.head()

In [None]:
# Replace the string values of DayOfWeek and PdDistrict with integer codes.
#
# Each encoder is fitted once, on the training split, and then reused for dev
# and test so that a given string always maps to the same integer. Refitting
# the encoder separately on every split silently shifts the codes whenever a
# split happens to be missing one of the values. A value that was not seen in
# training now makes transform() raise a ValueError instead of being mis-encoded.
label_encoders = {}
for column_name in ('DayOfWeek', 'PdDistrict'):
    le = preprocessing.LabelEncoder()
    le.fit(train_data[column_name].unique())
    for split in (train_data, dev_data, test_data):
        split[column_name] = le.transform(split[column_name])
    # Keep the fitted encoder so the codes can be mapped back with inverse_transform().
    label_encoders[column_name] = le


#print list(le.inverse_transform([2, 2, 1]))
#DayOfWeek, PdDistrict, Hour



In [None]:
# One-hot encode the integer-coded feature columns.
#
# The encoder is fitted on the training split only; dev and test are merely
# transformed, so all three matrices share the column layout the model is
# trained on. Refitting on test_data can produce columns that do not line up
# with the training columns.
assert list(test_data.columns) == list(train_data.columns), \
    "test_data columns must match train_data columns before encoding"
enc = OneHotEncoder()
train_data_onehot = enc.fit_transform(train_data)
dev_data_onehot = enc.transform(dev_data)
test_data_onehot = enc.transform(test_data)

# The three shapes should agree in their second dimension.
print(train_data_onehot.shape)
print(dev_data_onehot.shape)
print(test_data_onehot.shape)
#print enc.n_values_

#print enc.feature_indices_

#print enc.transform([[0, 1, 1]]).toarray()

In [None]:
# Assemble the final dense feature matrices: the X and Y columns first,
# followed by the one-hot columns.
complete_train = np.column_stack((tdX, tdY, train_data_onehot.toarray()))
complete_dev = np.column_stack((ddX, ddY, dev_data_onehot.toarray()))
complete_test = np.column_stack((ttdX, ttdY, test_data_onehot.toarray()))

print(complete_test)

In [None]:
#complete_train[0:]

In [None]:
# params = {'C':[.001,.01,.1,.5,1,2,3,4,5,10]}

# gscv = GridSearchCV(LogisticRegression(), params)
# gscv.fit(complete_train, train_labels)

# print gscv.best_params_
# print gscv.best_score_

# Train a logistic regression (C=.01) on the training matrix and print its
# score on the dev split.
lr = LogisticRegression(C=.01)
# fit() trains the classifier and returns the fitted estimator, so lr_model is
# the model itself. fit_transform() is not the fitting API for a classifier:
# it returned a transformed feature array rather than a model and is no longer
# available on LogisticRegression in current scikit-learn.
lr_model = lr.fit(complete_train, train_labels)
print(lr.score(complete_dev, dev_labels))