In [35]:
#Import libraries
from datetime import datetime
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [36]:
crime = pd.read_csv("../../data/train.csv")

In [39]:
#Recode weekday variable
weekday_mapping =  {'Monday': 1,
                    'Tuesday': 2,
                    'Wednesday': 3,
                    'Thursday': 4,
                    'Friday': 5,
                    'Saturday': 6,
                    'Sunday': 7}
crime['DayOfWeek']=crime['DayOfWeek'].map(weekday_mapping)

#Recode PdDistrict variable
enc = preprocessing.LabelEncoder()
district = enc.fit_transform(crime.PdDistrict)
address = enc.fit_transform(crime.Address)
crime['District'] = district
crime['AddressCode'] = address

#Scale the coordinates variables
#stdsc = StandardScaler()
#crime['X_stdsc'] = stdsc.fit_transform(crime['X'])
#crime['Y_stdsc'] = stdsc.fit_transform(crime['Y'])


#Extract date and time information from 'Dates' variable
crime['Dates'] = pd.to_datetime(crime['Dates'])
crime['Year'] = crime['Dates'].dt.year
crime['Month'] = crime['Dates'].dt.month
crime['Day'] = crime['Dates'].dt.day
crime['Hour'] = crime['Dates'].dt.hour
crime['Minute'] = crime['Dates'].dt.minute

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

#Shuffle data
crime = crime.reindex(np.random.permutation(crime.index))

In [38]:
#Self-defined multiclass log loss function
def llfun(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

X = crime[['Year', 'Month', 'Day', 'Hour', 'DayOfWeek', 'District', 'AddressCode', 'X','Y']]
Y = crime.Category
test_data, test_labels = X[710000:], Y[710000:]
dev_data, dev_labels = X[700000:710000], Y[700000:710000]
train_data, train_labels = X[:700000], Y[:700000]

def RF():
    #Random forest 
    rf = RandomForestClassifier(n_estimators=100, max_depth=20)
    rf.fit(train_data, train_labels)
    rf_prob = rf.predict_proba(test_data)
    
    #Set labels to dummy format
    test_labels_long = pd.get_dummies(test_labels)
    test_labels_long = pd.DataFrame.as_matrix(test_labels_long)
    logloss_rolled = llfun(test_labels_long, rf_prob)
    logloss = log_loss(test_labels_long, rf_prob)    
    
    #Compare results from self-defined log loss function and one from sklearn.metrics
    print ('Accuracy: ' ,rf.score(test_data,test_labels))
    print ('Log loss(rolled): ', sum(logloss_rolled))
    print ('Log loss: ', logloss)    
    
RF()

Accuracy:  0.322703497194
Log loss(rolled):  3.25323830296
Log loss:  2.35737891382
