# 1. Read data

In [163]:
import numpy as np
import pandas as pd
# Load the training and test CSVs, parsing the 'Dates' column as datetimes so
# the `.dt` accessors used during feature extraction are available.
train_set, test_set = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('train.csv', 'test.csv')
)

In [164]:
# Preview the first five rows of the training data.
train_set.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [165]:
# Preview the first five rows of the test data.
test_set.head(n=5)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


# 2. Handle categorical data

In [166]:
from sklearn import preprocessing

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` provides the same `train_test_split`.
# Importing it under the old name keeps the `cross_validation.train_test_split`
# call in the split cell working on both old and current releases.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn import cross_validation

# Encode the target crime categories as integer class ids. The fitted encoder
# is kept because `le_category.classes_` labels the prediction columns later.
le_category = preprocessing.LabelEncoder()
category = le_category.fit_transform(train_set.Category)

In [167]:
# One-hot encode the categorical and calendar features of the training set.
train_when = train_set.Dates.dt

day_of_week = pd.get_dummies(train_set.DayOfWeek)
district = pd.get_dummies(train_set.PdDistrict)

# `prefix` produces the 'h_<hour>', 'm_<month>' and 'd_<day>' column labels
# directly, so no manual renaming of the dummy columns is needed.
hour = pd.get_dummies(train_when.hour, prefix='h')
month = pd.get_dummies(train_when.month, prefix='m')
day_of_month = pd.get_dummies(train_when.day, prefix='d')

# Year indicators keep the bare year value as their column label.
year = pd.get_dummies(train_when.year)

# Column-wise concatenation; all pieces share the index of `train_set`.
train = pd.concat(
    [day_of_week, district, hour, month, day_of_month, year],
    axis=1,
)

In [168]:
# Derive the four `rain_*` columns by adding up the matching 'm_<month>'
# indicator columns, left to right.
# NOTE(review): months 4 and 11 are not part of any group — confirm this is intended.
train_rain_groups = (
    ('rain_1', (6, 7, 8, 9)),
    ('rain_2', (5, 10)),
    ('rain_3', (3,)),
    ('rain_4', (1, 2, 12)),
)
for rain_col, group_months in train_rain_groups:
    combined = train['m_%d' % group_months[0]]
    for month_no in group_months[1:]:
        combined = combined + train['m_%d' % month_no]
    train[rain_col] = combined

In [169]:
# Attach the integer-encoded target produced by `le_category` as the
# 'Category' column, so features and label are split together later.
train['Category'] = category

In [170]:
# One-hot encode the test set with the same recipe used for the training set.
# The intermediate names (day_of_week, district, hour, ...) are deliberately
# rebound to the test-set versions here.
test_when = test_set.Dates.dt

day_of_week = pd.get_dummies(test_set.DayOfWeek)
district = pd.get_dummies(test_set.PdDistrict)

# `prefix` yields the 'h_<hour>', 'm_<month>' and 'd_<day>' column labels.
hour = pd.get_dummies(test_when.hour, prefix='h')
month = pd.get_dummies(test_when.month, prefix='m')
day_of_month = pd.get_dummies(test_when.day, prefix='d')

# Year indicators keep the bare year value as their column label.
year = pd.get_dummies(test_when.year)

test = pd.concat(
    [day_of_week, district, hour, month, day_of_month, year],
    axis=1,
)

In [171]:
# Derive the four `rain_*` columns for the test set by adding up the matching
# 'm_<month>' indicator columns, left to right.
# NOTE(review): months 4 and 11 are not part of any group — confirm this is intended.
test_rain_groups = (
    ('rain_1', (6, 7, 8, 9)),
    ('rain_2', (5, 10)),
    ('rain_3', (3,)),
    ('rain_4', (1, 2, 12)),
)
for rain_col, group_months in test_rain_groups:
    combined = test['m_%d' % group_months[0]]
    for month_no in group_months[1:]:
        combined = combined + test['m_%d' % month_no]
    test[rain_col] = combined

In [172]:
# Keep 70% of the labelled rows for fitting and the remainder for validation.
# A fixed `random_state` makes the shuffle, and therefore the validation
# log-loss reported below, reproducible across kernel restarts.
train_data, validation_data = cross_validation.train_test_split(
    train, train_size=0.7, random_state=0
)

In [173]:
# Model inputs are every column of the training frame except the target.
features = list(train_data.columns)
del features[features.index('Category')]

# 3. Using BernoulliNB

In [174]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import log_loss

# Fit on plain arrays rather than DataFrames: the year indicator columns have
# integer labels while every other column label is a string, and recent
# scikit-learn releases refuse DataFrames whose column names mix types.
clf = BernoulliNB()
clf.fit(train_data[features].values, train_data['Category'].values)

# predict_proba already returns an ndarray whose columns follow clf.classes_.
predicted = clf.predict_proba(validation_data[features].values)

# Pass the classifier's classes explicitly so the probability columns stay
# aligned even if a rare category is missing from the validation split
# (`labels=` requires scikit-learn >= 0.18).
log_loss(validation_data['Category'].values, predicted, labels=clf.classes_)

ValueError: cannot label index with a null key

In [120]:
# Align the test columns with the training feature list: same order, with any
# indicator column absent from the test set filled with zeros (that level never
# occurs there). Plain `test[features]` would raise KeyError in that case.
test_features = test.reindex(columns=features, fill_value=0)

# Use a plain array for the same reason as at fit time: the column labels mix
# strings and integers.
predicted = clf.predict_proba(test_features.values)

# One probability column per original category name, keyed by the real `Id`
# column of the test set instead of the positional row number.
result = pd.DataFrame(predicted, columns=le_category.classes_, index=test_set['Id'])
result.to_csv('sfcrime_v2_result.csv', index=True, index_label='Id')