In [1]:
%matplotlib inline

# General libraries.
from __future__ import division
from collections import Counter
import csv
import dateutil
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for preprocessing.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder # for integer values
from sklearn.feature_extraction import DictVectorizer as DV

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.mixture import GMM
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
data_path = "data"
submissions_path = "submissions"
if not data_path or not submissions_path:
    raise Exception("Set the data and submission paths in competition_utilities.py!")

def parse_date_maybe_null(date):
    if date:
        return dateutil.parser.parse(date)
    return None

df_converters = {"Dates": dateutil.parser.parse}

def get_reader(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return reader

def get_header(file_name="train.csv"):
    reader = csv.reader(open(os.path.join(data_path, file_name)))
    header = reader.next()
    return header

def get_dataframe(file_name="train.csv"):
    return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)

    
def write_submission(file_name, predictions):
    writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
    writer.writerows(predictions)    

In [3]:
get_header()

['Dates',
 'Category',
 'Descript',
 'DayOfWeek',
 'PdDistrict',
 'Resolution',
 'Address',
 'X',
 'Y']

In [4]:
dataframe = get_dataframe()

In [5]:
number_of_categories = dataframe['Category'].nunique()
category_names = sorted(dataframe['Category'].unique())
print number_of_categories
print category_names

39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [6]:
# for writing code, take a small sample so that it doesn't take forever to run
df = dataframe #.sample(10000)

In [7]:
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [8]:
# Split train data into train and dev 80:20, and separate into X,Y
# train_data = df
# shuffle the data:
df = df.reindex(np.random.permutation(df.index))

# take 80% from the top:
upper = np.floor(len(df)*.8).astype(int)
train_data = df.head(n=upper)
train_labels = train_data['Category']
train_data['Hour'] = (train_data['Dates']).dt.hour
train_data['Month'] = (train_data['Dates']).dt.month
train_data['Year'] = (train_data['Dates']).dt.year


# take 20% from the bottom
lower = np.ceil(len(df)*.2).astype(int)
dev_data = df.tail(n=lower)
dev_labels = dev_data['Category']
dev_data['Hour'] = (dev_data['Dates']).dt.hour
dev_data['Month'] = (dev_data['Dates']).dt.month
dev_data['Year'] = (dev_data['Dates']).dt.year


del train_data['Dates']
del train_data['Category']
del train_data['Descript']
del train_data['X']
del train_data['Y']
del train_data['Address']
del train_data['Resolution']


del dev_data['Dates']
del dev_data['Category']
del dev_data['Descript']
del dev_data['X']
del dev_data['Y']
del dev_data['Address']
del dev_data['Resolution']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
dev_data.head()
train_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,Hour,Month,Year
72868,Wednesday,BAYVIEW,0,5,2014
68384,Monday,INGLESIDE,0,6,2014
458574,Tuesday,RICHMOND,10,11,2008
501636,Friday,PARK,15,4,2008
523800,Tuesday,MISSION,12,12,2007


In [10]:
test_data = get_dataframe("test.csv")
print test_data.head()

   Id               Dates DayOfWeek PdDistrict                   Address  \
0   0 2015-05-10 23:59:00    Sunday    BAYVIEW   2000 Block of THOMAS AV   
1   1 2015-05-10 23:51:00    Sunday    BAYVIEW        3RD ST / REVERE AV   
2   2 2015-05-10 23:50:00    Sunday   NORTHERN    2000 Block of GOUGH ST   
3   3 2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   
4   4 2015-05-10 23:45:00    Sunday  INGLESIDE  4700 Block of MISSION ST   

            X          Y  
0 -122.399588  37.735051  
1 -122.391523  37.732432  
2 -122.426002  37.792212  
3 -122.437394  37.721412  
4 -122.437394  37.721412  


In [14]:
print test_data.head()

  DayOfWeek PdDistrict  Hour  Month  Year
0    Sunday    BAYVIEW    23      5  2015
1    Sunday    BAYVIEW    23      5  2015
2    Sunday   NORTHERN    23      5  2015
3    Sunday  INGLESIDE    23      5  2015
4    Sunday  INGLESIDE    23      5  2015


In [15]:
test_data['Hour'] = (test_data['Dates']).dt.hour
test_data['Month'] = (test_data['Dates']).dt.month
test_data['Year'] = (test_data['Dates']).dt.year
del test_data['Dates']
del test_data['X']
del test_data['Y']
del test_data['Address']
del test_data['Id']

In [12]:
test_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,Hour,Month,Year
0,Sunday,BAYVIEW,23,5,2015
1,Sunday,BAYVIEW,23,5,2015
2,Sunday,NORTHERN,23,5,2015
3,Sunday,INGLESIDE,23,5,2015
4,Sunday,INGLESIDE,23,5,2015


In [16]:
le = preprocessing.LabelEncoder()
days = train_data['DayOfWeek'].unique()
le.fit(days)
train_data['DayOfWeek'] = le.transform(train_data['DayOfWeek']) 

days = dev_data['DayOfWeek'].unique()
le.fit(days)
dev_data['DayOfWeek'] = le.transform(dev_data['DayOfWeek'])

days = test_data['DayOfWeek'].unique()
le.fit(days)
test_data['DayOfWeek'] = le.transform(test_data['DayOfWeek'])


district = train_data['PdDistrict'].unique()
le.fit(district)
train_data['PdDistrict'] = le.transform(train_data['PdDistrict']) 

district = dev_data['PdDistrict'].unique()
le.fit(district)
dev_data['PdDistrict'] = le.transform(dev_data['PdDistrict']) 

district = test_data['PdDistrict'].unique()
le.fit(district)
test_data['PdDistrict'] = le.transform(test_data['PdDistrict']) 


#print list(le.inverse_transform([2, 2, 1]))
#DayOfWeek, PdDistrict, Hour



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
enc = OneHotEncoder()
train_data_onehot = enc.fit_transform(train_data) 
dev_data_onehot = enc.transform(dev_data)
test_data_onehot = enc.fit_transform(test_data)

# print train_data_onehot.toarray()
print train_data_onehot.shape
# print dev_data_onehot.toarray()
print dev_data_onehot.shape
#print test_data_onehot.toarray()
print test_data_onehot.shape
#print enc.n_values_

#print enc.feature_indices_

#print enc.transform([[0, 1, 1]]).toarray()

(702439, 66)
(175610, 66)
(884262, 66)


In [18]:
classifier =  LogisticRegression()
model = classifier.fit(train_data_onehot,train_labels)
predictions = model.predict_proba(dev_data_onehot)
score = model.score(dev_data_onehot,dev_labels)    
print score

0.228910654291


In [19]:
# test data
predictions = model.predict_proba(test_data_onehot)
withId  = np.column_stack((map(str,xrange(test_data_onehot.shape[0])),predictions))
towrite = np.row_stack((["Id"] + sorted(train_labels.unique()),withId))

print(towrite)
write_submission('submission_file_LR.csv', towrite)

[['Id' 'ARSON' 'ASSAULT' ..., 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']
 ['0' '0.00739993175399' '0.125221745312' ..., '0.0936518997177'
  '0.036918163435' '0.0243869221666']
 ['1' '0.00739993175399' '0.125221745312' ..., '0.0936518997177'
  '0.036918163435' '0.0243869221666']
 ..., 
 ['884259' '0.00398567358023' '0.106557850803' ..., '0.124365559681'
  '0.0392961752788' '0.0101461380829']
 ['884260' '0.00812891135191' '0.11095525138' ..., '0.0909335490454'
  '0.0608660724559' '0.0134945389787']
 ['884261' '0.00353023900477' '0.082416785084' ..., '0.102964139636'
  '0.0301920826208' '0.00615703446792']]
