# Loading Library

In [163]:
# pandas
import pandas as pd

# numpy
import numpy as np

# defaultcit
from collections import defaultdict

# plot with folium
import folium
from IPython.core.display import HTML

# parsing time
from datetime import datetime, timedelta

# plotting with matplotlib
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

# plotting with pylab
import pylab as P

# Clustering
from sklearn.cluster import MiniBatchKMeans, KMeans
import time
import json

# change prediction categories into labels
from sklearn import preprocessing
from sklearn import cross_validation 
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
import re

from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

## Loading Training/Testing Data

** subset to only the fields we need and used in submission (Dates, DayOfWeek, PdDistrict, Address, X, Y, Category) **

In [164]:
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")
# train_df = train_df.sample(n = 100000, random_state = 666)

In [165]:
train_df = pd.DataFrame(train_df)
test_df = pd.DataFrame(test_df)
# train_df = train_df.reset_index(drop=True)
train_df = train_df[['Dates','DayOfWeek','PdDistrict','Address','X','Y','Category']]
train_df.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y,Category
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,WARRANTS
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,OTHER OFFENSES
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,OTHER OFFENSES
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873,LARCENY/THEFT
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541,LARCENY/THEFT


In [166]:
test_df.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [167]:
#Look at the shape
print(train_df.shape)
print(test_df.shape)

(878049, 7)
(884262, 7)


## Feature Creation

** create feature for training data and also for test data **

** difference btw train/test is that training has category data **

In [168]:
def createFeature(train_df):
    #Getting Month of Year, Day of Month, Hour of Day, and Minute of Hour
    month_of_year = []
    day_of_month =[]
    hour_of_day =[]
    min_of_hour =[]
    for i in range(len(train_df.Dates.values)):
        moy = datetime.strptime(train_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").month
        dom = datetime.strptime(train_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").day
        hod = datetime.strptime(train_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").hour
        moh = datetime.strptime(train_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").minute
        month_of_year.append(moy)
        day_of_month.append(dom)
        hour_of_day.append(hod)
        min_of_hour.append(moh)
    train_df['month_of_year'] = month_of_year #Month of the Year feature added
    train_df['day_of_month'] = day_of_month # Day of Month feature added
    train_df['hour_of_day'] = hour_of_day # Hour of Day feature added
    train_df['min_of_hour'] = min_of_hour # Minute of Hour Feature added
    
    #Creating Weeekday/Weekended Feature
    train_df['WeekdayWeeekend'] = train_df['DayOfWeek'].map( {'Monday': 'Weekday', 'Tuesday': 'Weekday', \
                                                          'Wednesday': 'Weekday', 'Thursday': 'Weekday', 'Friday': 'Weekend',\
                                                         'Saturday': 'Weekend', 'Sunday': 'Weekday'} ).astype(object)
    # Creating Midnight/Morning/Afternoon/Night Column
    train_df['TimeOfDay'] = train_df['hour_of_day'].map({0: 'Midnight', 1: 'Midnight', 2:'Midnight', 3:'Midnight', 4:'Morning', \
                                                     5:'Morning', 6:'Morning', 7:'Morning', 8:'Morning', 9:'Morning', \
                                                     10:'Morning', 11:'Morning', 12:'Afternoon', 13:'Afternoon', 14:'Afternoon', \
                                                     15:'Afternoon', 16:'Afternoon', 17:'Afternoon', 18:'Night', 19:'Night', \
                                                     20:'Night', 21:'Night', 22:'Midnight', 23:'Midnight'}).astype(object)
    # Creating Season Feature
    train_df['Season'] = train_df['month_of_year'].map({1: 'Winter', 2: 'Winter', 3:'Spring', 4:'Spring', 5:'Spring', \
                                                     6:'Spring', 7:'Summer', 8:'Summer', 9:'Summer', 10:'Autumn', \
                                                     11:'Autumn', 12:'Winter'}).astype(object)

    
    train_df['District_Type'] = train_df['PdDistrict'].map({'PARK': 'Other', 'CENTRAL': 'Other', 'MISSION': 'Corner', 'NORTHERN': 'Corner', 
                                                              'TENDERLOIN': 'Other', 'INGLESIDE': 'Street', 'TARAVAL': 'Street', 
                                                              'SOUTHERN': 'Other', 'BAYVIEW': 'Other', 'RICHMOND': 'Other'}).astype(object)

        
    #Deleting features not needed
#     train_df = train_df.drop('Descript', 1)
#     train_df = train_df.drop('Resolution', 1)
    train_df = train_df.drop('Address', 1)
    
    #Creating Dummy Variables
    WeekdayWeekend_dummies = pd.get_dummies(train_df.WeekdayWeeekend)
    TimeOfDay_dummies = pd.get_dummies(train_df.TimeOfDay)
    season_dummies = pd.get_dummies(train_df.Season)
    district_dummies = pd.get_dummies(train_df.PdDistrict)
    week_dummies = pd.get_dummies(train_df.DayOfWeek)
#     type_dummies = pd.get_dummies(train_df.Crime_Type)
    corner_dummies = pd.get_dummies(train_df.District_Type)
    
    train_df_new = pd.concat([train_df, WeekdayWeekend_dummies, TimeOfDay_dummies, season_dummies, district_dummies, week_dummies, corner_dummies], axis=1, join_axes=[train_df.index])
    print('Sanity Check')
    print(train_df.shape)
    print(WeekdayWeekend_dummies.shape)
    print(TimeOfDay_dummies.shape)
    print(season_dummies.shape)
    print(district_dummies.shape)
    print(week_dummies.shape)
#     print(type_dummies.shape)
    print(corner_dummies.shape)
    print(train_df_new.shape)
    print('Make sure the total adds up to the last number')
    
    le_crime = preprocessing.LabelEncoder()
    crime = le_crime.fit_transform(train_df_new.Category)
    train_df_new['dummy_Category'] = crime

    return train_df_new
#Calling the feature creation function
train_df_new = createFeature(train_df)
train_df_new.head()

Sanity Check
(878049, 14)
(878049, 2)
(878049, 4)
(878049, 4)
(878049, 10)
(878049, 7)
(878049, 3)
(878049, 44)
Make sure the total adds up to the last number


Unnamed: 0,Dates,DayOfWeek,PdDistrict,X,Y,Category,month_of_year,day_of_month,hour_of_day,min_of_hour,...,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Corner,Other,Street,dummy_Category
0,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599,WARRANTS,5,13,23,53,...,0,0,0,0,0,1,1,0,0,37
1,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599,OTHER OFFENSES,5,13,23,53,...,0,0,0,0,0,1,1,0,0,21
2,2015-05-13 23:33:00,Wednesday,NORTHERN,-122.424363,37.800414,OTHER OFFENSES,5,13,23,33,...,0,0,0,0,0,1,1,0,0,21
3,2015-05-13 23:30:00,Wednesday,NORTHERN,-122.426995,37.800873,LARCENY/THEFT,5,13,23,30,...,0,0,0,0,0,1,1,0,0,16
4,2015-05-13 23:30:00,Wednesday,PARK,-122.438738,37.771541,LARCENY/THEFT,5,13,23,30,...,0,0,0,0,0,1,0,1,0,16


** create feature for test data **

*difference is that Cateogry doesn't exist in test data*

In [169]:
def test_createFeature(test_df):
    #Getting Month of Year, Day of Month, Hour of Day, and Minute of Hour
    month_of_year = []
    day_of_month =[]
    hour_of_day =[]
    min_of_hour =[]
    for i in range(len(test_df.Dates.values)):
        moy = datetime.strptime(test_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").month
        dom = datetime.strptime(test_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").day
        hod = datetime.strptime(test_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").hour
        moh = datetime.strptime(test_df.Dates.values[i], "%Y-%m-%d %H:%M:%S").minute
        month_of_year.append(moy)
        day_of_month.append(dom)
        hour_of_day.append(hod)
        min_of_hour.append(moh)
    test_df['month_of_year'] = month_of_year #Month of the Year feature added
    test_df['day_of_month'] = day_of_month # Day of Month feature added
    test_df['hour_of_day'] = hour_of_day # Hour of Day feature added
    test_df['min_of_hour'] = min_of_hour # Minute of Hour Feature added
    
    #Creating Weeekday/Weekended Feature
    test_df['WeekdayWeeekend'] = test_df['DayOfWeek'].map( {'Monday': 'Weekday', 'Tuesday': 'Weekday', \
                                                          'Wednesday': 'Weekend', 'Thursday': 'Weekend', 'Friday': 'Weekend',\
                                                         'Saturday': 'Weekend', 'Sunday': 'Weekday'} ).astype(object)
    # Creating Midnight/Morning/Afternoon/Night Column
    test_df['TimeOfDay'] = test_df['hour_of_day'].map({0: 'Night', 1: 'Midnight', 2:'Midnight', 3:'Midnight', 4:'Midnight', \
                                                     5:'Midnight', 6:'Midnight', 7:'Midnight', 8:'Morning', 9:'Morning', \
                                                     10:'Morning', 11:'Morning', 12:'Afternoon', 13:'Afternoon', 14:'Afternoon', \
                                                     15:'Afternoon', 16:'Afternoon', 17:'Afternoon', 18:'Afternoon', 19:'Afternoon', \
                                                     20:'Night', 21:'Night', 22:'Night', 23:'Night'}).astype(object)
    # Creating Season Feature
    test_df['Season'] = test_df['month_of_year'].map({1: 'Winter', 2: 'Winter', 3:'Spring', 4:'Spring', 5:'Spring', \
                                                     6:'Spring', 7:'Summer', 8:'Summer', 9:'Summer', 10:'Autumn', \
                                                     11:'Autumn', 12:'Winter'}).astype(object)

    
    
    test_df['District_Type'] = test_df['PdDistrict'].map({'PARK': 'Other', 'CENTRAL': 'Other', 'MISSION': 'Corner', 'NORTHERN': 'Corner', 
                                                              'TENDERLOIN': 'Other', 'INGLESIDE': 'Street', 'TARAVAL': 'Street', 
                                                              'SOUTHERN': 'Other', 'BAYVIEW': 'Other', 'RICHMOND': 'Other'}).astype(object)

        
    #Deleting features not needed
#     test_df = test_df.drop('Descript', 1)
#     test_df = test_df.drop('Resolution', 1)
    test_df = test_df.drop('Address', 1)
    
    #Creating Dummy Variables
    WeekdayWeekend_dummies = pd.get_dummies(test_df.WeekdayWeeekend)
    TimeOfDay_dummies = pd.get_dummies(test_df.TimeOfDay)
    season_dummies = pd.get_dummies(test_df.Season)
    district_dummies = pd.get_dummies(test_df.PdDistrict)
    week_dummies = pd.get_dummies(test_df.DayOfWeek)
#     type_dummies = pd.get_dummies(test_df.Crime_Type)
    corner_dummies = pd.get_dummies(test_df.District_Type)
    
    test_df_new = pd.concat([test_df, WeekdayWeekend_dummies, TimeOfDay_dummies, season_dummies, district_dummies, week_dummies, corner_dummies], axis=1, join_axes=[test_df.index])
    print('Sanity Check')
    print(test_df.shape)
    print(WeekdayWeekend_dummies.shape)
    print(TimeOfDay_dummies.shape)
    print(season_dummies.shape)
    print(district_dummies.shape)
    print(week_dummies.shape)
#     print(type_dummies.shape)
    print(corner_dummies.shape)
    print(test_df_new.shape)
    print('Make sure the total adds up to the last number')
    
#     le_crime = preprocessing.LabelEncoder()
#     crime = le_crime.fit_transform(train_df_new.Category)
#     train_df_new['dummy_Category'] = crime

    return test_df_new
#Calling the feature creation function

test_df_new = test_createFeature(test_df)
test_df_new.head()

Sanity Check
(884262, 14)
(884262, 2)
(884262, 4)
(884262, 4)
(884262, 10)
(884262, 7)
(884262, 3)
(884262, 44)
Make sure the total adds up to the last number


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,X,Y,month_of_year,day_of_month,hour_of_day,min_of_hour,...,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Corner,Other,Street
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,-122.399588,37.735051,5,10,23,59,...,0,0,0,1,0,0,0,0,1,0
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,-122.391523,37.732432,5,10,23,51,...,0,0,0,1,0,0,0,0,1,0
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,-122.426002,37.792212,5,10,23,50,...,0,0,0,1,0,0,0,1,0,0
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,-122.437394,37.721412,5,10,23,45,...,0,0,0,1,0,0,0,0,0,1
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,-122.437394,37.721412,5,10,23,45,...,0,0,0,1,0,0,0,0,0,1


#Exploring with Street Corner Feature

In [170]:
test_df_new.shape

(884262, 44)

In [171]:
train_df_new.columns

Index(['Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y', 'Category',
       'month_of_year', 'day_of_month', 'hour_of_day', 'min_of_hour',
       'WeekdayWeeekend', 'TimeOfDay', 'Season', 'District_Type', 'Weekday',
       'Weekend', 'Afternoon', 'Midnight', 'Morning', 'Night', 'Autumn',
       'Spring', 'Summer', 'Winter', 'BAYVIEW', 'CENTRAL', 'INGLESIDE',
       'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL',
       'TENDERLOIN', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', 'Corner', 'Other', 'Street', 'dummy_Category'],
      dtype='object')

In [172]:
test_df_new.columns

Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y', 'month_of_year',
       'day_of_month', 'hour_of_day', 'min_of_hour', 'WeekdayWeeekend',
       'TimeOfDay', 'Season', 'District_Type', 'Weekday', 'Weekend',
       'Afternoon', 'Midnight', 'Morning', 'Night', 'Autumn', 'Spring',
       'Summer', 'Winter', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
       'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN',
       'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday', 'Corner', 'Other', 'Street'],
      dtype='object')

##Prediction

### Feature Inclusion/Exclusion

In [173]:
# Lat/Long
X_feature = (False, 'X')
Y_feature = (False, 'Y')

# Time
Month_feature = (False, 'month_of_year')
Day_feature = (True, 'day_of_month')
Hour_feature = (True, 'hour_of_day')
Min_feature = (True, 'min_of_hour')

# Day of Week
Friday_feature = (True, 'Friday') 
Monday_feature = (True, 'Monday')
Saturday_feature = (True, 'Saturday')
Sunday_feature = (True, 'Sunday')
Thursday_feature = (True, 'Thursday')
Tuesday_feature = (True, 'Tuesday')
Wednesday_feature = (True, 'Wednesday')

#Weekday/Weekend
Weekday_feature = (False, 'Weekday') 
Weekend_feature = (False, 'Weekend') 

#Season 
Autumn_feature = (False, 'Autumn') 
Spring_feature = (False, 'Spring') 
Summer_feature = (False, 'Summer') 
Winter_feature = (False, 'Winter') 

#Time of Day
Midnight_feature = (True, 'Midnight') 
Morning_feature = (True, 'Morning') 
Afternoon_feature = (True, 'Afternoon') 
Night_feature = (True, 'Night') 

# District
BAYV_feature = (True, 'BAYVIEW')
CENT_feature = (True, 'CENTRAL')
INGL_feature = (True, 'INGLESIDE')
MISS_feature = (True, 'MISSION')
NORT_feature = (True, 'NORTHERN')
PARK_feature = (True, 'PARK')
RICH_feature = (True, 'RICHMOND')
SOUT_feature = (True, 'SOUTHERN')
TARA_feature = (True, 'TARAVAL')
TEND_feature = (True, 'TENDERLOIN')

# # Crime Type
# Blue_feature = (False, 'Blue')
# White_feature = (False, 'White')
# Other_feature = (False, 'Other')

# District Type
Corner_feature = (False, 'Corner')
Street_feature = (False, 'Street')
District_Other_feature = (False, 'Other')

In [174]:
feature_list = [
                X_feature, Y_feature,
                Month_feature, Day_feature, Hour_feature, Min_feature,
                Monday_feature, Tuesday_feature, Wednesday_feature, Thursday_feature, Friday_feature, 
                Saturday_feature, Sunday_feature,
                Weekday_feature, Weekend_feature, 
                Autumn_feature, Spring_feature, Summer_feature, Winter_feature,
                Midnight_feature, Morning_feature, Afternoon_feature, Night_feature, 
                BAYV_feature, CENT_feature, INGL_feature, MISS_feature, NORT_feature, PARK_feature, 
                RICH_feature, SOUT_feature, TARA_feature, TEND_feature,
#                 Blue_feature, White_feature, Other_feature, 
                Corner_feature, Street_feature, District_Other_feature
               ]
features = [str(x[1]) for x in feature_list if x[0]]

In [175]:
# from sklearn.cross_validation import StratifiedShuffleSplit

# X = train_df_new[features]
# y = train_df_new['dummy_Category'].values

# sss = StratifiedShuffleSplit(y, 2, test_size=0.5, random_state=0)

# # for train_index, test_index in sss:
# #     X_train, X_test = X[train_index], X[test_index]
# #     y_train, y_test = y[train_index], y[test_index]

# Trainding and Validation

In [176]:
training, validation = train_test_split(train_df_new, train_size=.60)

In [177]:
# x_train, x_test, y_train, y_test = cross_validation.train_test_split(
#     train_df_new[features], train_df_new['dummy_Category'], test_size=0.2, random_state=0)

# rf = RandomForestRegressor()
# rf.fit(x_train, y_train)
# rf.score(x_test, y_test) 


# # predicted = np.array(rf.predict_proba(validation[features]))
# # log_loss(validation['dummy_Category'], predicted) 

### Logistic Regression

In [178]:
#Logistic Regression for comparison
model = LogisticRegression(C=.1)
model.fit(training[features], training['dummy_Category'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['dummy_Category'], predicted) 

2.5607826330505916

In [None]:
param_grid = {'C': [0.150, 0.125, 0.1, .075, .05] }

param_scores = ['precision', 'recall']



Haven't got the gridsearch on logistic regression working yet (possibly need to use less than full data

In [28]:
for score in param_scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
    clf.fit(training[features], training['dummy_Category'])

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = validation['dummy_Category'], clf.predict(training[features])
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 0.1}

Grid scores on development set:

0.227 (+/-0.015) for {'C': 0.15}
0.226 (+/-0.014) for {'C': 0.125}
0.227 (+/-0.015) for {'C': 0.1}
0.227 (+/-0.013) for {'C': 0.075}
0.227 (+/-0.009) for {'C': 0.05}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.





ValueError: Found arrays with inconsistent numbers of samples: [4000 6000]

## Bernoulli

In [179]:
# training, validation = train_test_split(train_df_new, train_size=.60)
model = BernoulliNB()
model.fit(training[features], training['dummy_Category'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [180]:
predicted = np.array(model.predict_proba(validation[features]))

In [181]:
validation['dummy_Category'].shape

(351220,)

In [182]:
log_loss(validation['dummy_Category'], predicted)

2.5491475085908735

** FOR SUBMISSION **

** need to train on full trainig set and use the reformatted test data to make prediction **

In [398]:
test_df_new = test_df_new.ix[:,1:len(test_df_new)]
test_df_new.shape

(884262, 41)

In [399]:
# training, validation = train_test_split(train_df_new, train_size=.60)
model = BernoulliNB()
model.fit(train_df_new[features], train_df_new['dummy_Category'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [400]:
predicted = np.array(model.predict_proba(test_df_new[features]))

In [402]:
ex_submission = pd.read_csv("kaggle_data/sampleSubmission.csv")

In [403]:
result=pd.DataFrame(predicted, columns=ex_submission.columns[1:len(ex_submission)])
# result.to_csv('testResult.csv', index = True, index_label = 'Id' )

## KNN

In [91]:
knn = KNeighborsClassifier(n_neighbors=50, weights='uniform')
knn.fit(training[features], training['dummy_Category']) 
knn_predicted = np.array(knn.predict_proba(validation[features]))
log_loss(validation['dummy_Category'], knn_predicted)

8.5881297183958729

---

# OLD STUFF THAT STILL MAY BE OF USE

# Bernoulli

In [410]:
Bernoulli_model = BernoulliNB()
Bernoulli_model.fit(x_train, y_train)
# predicted = Bernoulli_model.predict_proba(test_data[features])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [388]:
Bernoulli_model.score(x_test, y_test) 

0.21959455611867207

In [85]:
training, validation = train_test_split(train_df_new, train_size=.60)
model = BernoulliNB()
model.fit(training[features], training['dummy_Category'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['dummy_Category'], predicted) 

3.3551620366218122

In [499]:
training, validation = train_test_split(train_df_new, train_size=.60)
model = BernoulliNB()
model.fit(training[features], training['dummy_Category'])
predicted = np.array(model.predict_proba(test_df_new[features]))
log_loss(validation['dummy_Category'], predicted)  

ValueError: Found arrays with inconsistent numbers of samples: [351220 884262]

In [496]:
result=pd.DataFrame(predicted, columns=ex_submission.columns[1:len(ex_submission)])
# result.to_csv('testResult.csv', index = True, index_label = 'Id' )

In [497]:
len(result)

884262

In [395]:
model.score(validation[features], validation['dummy_Category']) 

0.22071351289789876

## SVM

**1) svm with single fold**

In [98]:
print(x_train.shape, x_test.shape)
print(y_train.shape,y_test.shape)

(8000, 25) (2000, 25)
(8000,) (2000,)


In [99]:
y_train.head()

822262    16
751738    35
590781     4
128821    35
875134    31
Name: dummy_Category, dtype: int64

In [100]:
#KNN - gives lowest error; does well with tons of data; ratio does not go to infiniti; data, ratio, (30 max)
# extra features on day: holiday; weekend; weekday; time of year; but do exploratory analyssi to find best features
# randomforest and decision tree will tell you the importance of feature
# 2D kernel density estimation

svc_model = svm.SVC(kernel = 'rbf', C=20, gamma=.0075).fit(x_train, y_train)

svc_model.score(x_test, y_test) 

0.246

**2) svm with crossfolds**

In [101]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(8000, 25) (8000,)
(2000, 25) (2000,)


In [102]:
x_train.columns

Index(['X', 'Y', 'hour_of_day', 'min_of_hour', 'Monday', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Autumn',
       'Spring', 'Summer', 'Winter', 'BAYVIEW', 'CENTRAL', 'INGLESIDE',
       'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL',
       'TENDERLOIN'],
      dtype='object')

In [103]:
print(train_df_new[features].shape)
print(train_df_new['dummy_Category'].shape)

(10000, 25)
(10000,)


In [104]:
svm_2 = svm.SVC(kernel = 'rbf', C=50, gamma=.001)
scores = cross_validation.cross_val_score(
    svm_2, train_df_new[features], train_df_new['dummy_Category'], cv=5)
scores



array([ 0.24182359,  0.24327019,  0.23611806,  0.2345555 ,  0.25830816])

In [105]:
SVM_tuned_parameters = [{'kernel': ['rbf'],
                     'gamma': [.0075, .005, .001, .00075], 'C': [20, 30, 40, 50, 60, 70]}
                   ]

SVM_scores = ['precision', 'recall']

In [None]:
print(SVM_tuned_parameters)
print(SVM_scores)

In [None]:
for score in SVM_scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.SVC(C=1), SVM_tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(x_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(x_test)
    print(classification_report(y_true, y_pred))
    print()

## KNN

In [None]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    train_df_new[features], train_df_new['dummy_Category'], test_size=0.2, random_state=0)

knn = KNeighborsClassifier(n_neighbors=56, weights='uniform')
knn.fit(x_train, y_train) 
knn.score(x_test, y_test) 

# KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#            metric_params=None, n_jobs=1, n_neighbors=5, p=2,
#            weights='uniform')

# knn.predict(x_test)
# array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])
# y_test
# array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [None]:
KNN_tuned_parameters = [{'n_neighbors': [20, 25, 30, 35, 40, 45]}
                   ]

KNN_scores = ['precision', 'recall']

In [None]:
print(KNN_tuned_parameters)
print(KNN_scores)

In [None]:
for score in KNN_scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(knn, KNN_tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(x_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(x_test)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
n_neighbors, weights=weights