In [1]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

## Read in data

In [21]:
# Load the training and test CSVs. "Dates" is parsed into datetimes at read
# time so the .dt accessor can be used on it later.
train = pd.read_csv(
    "data/train.csv",
    parse_dates=['Dates'],
)
test = pd.read_csv(
    "data/test.csv",
    parse_dates=['Dates'],
)

## Parse "Dates" into year, month, day, hour, minute, day of week, and week of year

In [27]:
def feature_engineering(data):
    """Add calendar features derived from the ``Dates`` column.

    The frame is modified in place and is also returned, so the call can be
    used for its side effect or in an assignment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``Dates`` column (the ``.dt`` accessor is used
        on it).

    Returns
    -------
    pandas.DataFrame
        The same object as ``data`` with the columns ``Day``, ``Month``,
        ``Year``, ``Hour``, ``Minute``, ``DayOfWeek`` (Monday=0) and
        ``WeekOfYear`` (ISO week number) added or overwritten.
    """
    dates = data['Dates'].dt
    data['Day'] = dates.day
    data['Month'] = dates.month
    data['Year'] = dates.year
    data['Hour'] = dates.hour
    data['Minute'] = dates.minute
    data['DayOfWeek'] = dates.dayofweek

    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0.
    # Use ``isocalendar()`` where it exists and fall back on older pandas.
    if hasattr(dates, 'isocalendar'):
        week = dates.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert it to a plain
        # numpy dtype (float64 with NaN only when some dates are missing) so
        # downstream estimators receive ordinary numeric data.
        week = week.astype('float64' if week.isna().any() else 'int64')
    else:
        week = dates.weekofyear
    data['WeekOfYear'] = week
    return data

In [34]:
# feature_engineering adds the calendar columns to the frame it is given and
# returns that same frame, so X_train / X_test refer to train / test here.
X_train = feature_engineering(data=train)
X_test = feature_engineering(data=test)

In [35]:
# Keep the response aside; it is removed from the predictors just below.
Y_train = train['Category']

# Drop the response and the columns that are not used as predictors.
X_train = X_train.drop(
    labels=['Descript','Resolution', 'Address', 'Category', 'Dates'],
    axis=1,
)

# The test frame only carries 'Address' and 'Dates' from that list.
X_test = X_test.drop(labels=['Address', 'Dates'], axis=1)

## Several addresses don't have the correct latitude and longitude; these need to be imputed based on the address

In [None]:
## to do

## 1. Convert categorical variables into dummy variables

In [72]:
# Replace each categorical predictor of the training frame with its indicator
# (dummy) columns, appended at the end of the frame.
for column in ('PdDistrict', 'DayOfWeek'):
    one_hot = pd.get_dummies(X_train[column])
    X_train = X_train.drop(column, axis=1).join(one_hot)

In [74]:
# Replace each categorical predictor of the test frame with its indicator
# (dummy) columns, appended at the end of the frame.
for column in ('PdDistrict', 'DayOfWeek'):
    one_hot = pd.get_dummies(X_test[column])
    X_test = X_test.drop(column, axis=1).join(one_hot)

## or 2. Label-encode the predictors (use either the dummy variables from option 1 or this encoding, not both)

In [76]:
# Learn the district -> integer mapping from the training data only and reuse
# that same mapping for the test set. Refitting on the test frame could assign
# different codes to the same district if the two sets of labels differ.
# transform() raises ValueError if the test set contains a district that was
# never seen in training, which makes such a mismatch visible.
enc = LabelEncoder()
X_train['PdDistrict'] = enc.fit_transform(X_train['PdDistrict'])
X_test['PdDistrict'] = enc.transform(X_test['PdDistrict'])

## Encode response variable

In [None]:
# Turn the category names into integer class ids. The fitted encoder is kept
# in `le` so predictions can be mapped back to names later.
le = LabelEncoder()
le.fit(Y_train)
Y_train = le.transform(Y_train)

# To recover the category names from the integer ids:
# Y_train = le.inverse_transform(Y_train)

## Extra Trees or Extremely Randomized Trees

In [None]:
# Ensemble of 100 extremely randomized trees, fitted with 25 parallel jobs.
etr = ExtraTreesClassifier(
    n_estimators=100,
    n_jobs=25,
)
etr.fit(X_train, Y_train)

In [79]:
# Accuracy on the training data itself (an optimistic estimate).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(etr.score(X_train, Y_train))

0.851773648168


In [80]:
# Cross-validated accuracy of the extra-trees model (5 folds, 25 parallel jobs).
# NOTE(review): the recorded output below lists only two fold scores, which
# does not match cv=5 -- re-run this cell to refresh it.
score = cross_val_score(etr, X_train, Y_train, n_jobs=25, cv=5, scoring="accuracy")
# print() with a single argument behaves the same on Python 2 and Python 3.
print(score)
print(np.mean(score))

[ 0.10215656  0.06978545]
0.0859710075596


In [81]:
# Predict from exactly the columns the model was fitted on, selected by name.
# A positional slice such as [:, 1:] would also pick up 'predictions',
# 'Category' and any other column added to X_test by an earlier run, and the
# old `.ix` indexer no longer exists in pandas >= 1.0.
X_test['predictions'] = etr.predict(X_test[list(X_train.columns)])
# Map the integer class ids back to the category names.
X_test['Category'] = le.inverse_transform(X_test['predictions'])

## Random Forest

In [None]:
# Random forest of 100 trees, fitted with 25 parallel jobs.
rf = RandomForestClassifier(n_jobs=25, n_estimators = 100)
rf.fit(X_train, Y_train)
# Accuracy on the training data itself (an optimistic estimate).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(rf.score(X_train, Y_train))

In [None]:
# Cross-validated accuracy of the random forest (2 folds, 25 parallel jobs).
score = cross_val_score(rf, X_train, Y_train, n_jobs=25, cv=2, scoring="accuracy")
# print() with a single argument behaves the same on Python 2 and Python 3.
print(score)
print(np.mean(score))

In [None]:
# Predict from exactly the columns the model was fitted on, selected by name.
# A positional slice such as [:, 1:] would also pick up 'predictions' and
# 'Category' once an earlier prediction cell has added them to X_test, and the
# old `.ix` indexer no longer exists in pandas >= 1.0.
X_test['predictions'] = rf.predict(X_test[list(X_train.columns)])
# Map the integer class ids back to the category names.
X_test['Category'] = le.inverse_transform(X_test['predictions'])

## Output predictions to a csv file that's formatted as required by Kaggle

In [85]:
def field_to_columns(data, field, new_columns):
    """Expand one column into 0/1 indicator columns.

    For every value in ``new_columns`` a column of that name is written to
    ``data``; it holds 1 where ``data[field]`` equals the value and 0
    elsewhere. The frame is modified in place and also returned.
    """
    for column_name in new_columns:
        matches = data[field] == column_name
        data[column_name] = matches.astype(int)
    return data

In [101]:
# One 0/1 indicator column per crime category, in the encoder's class order.
categories = list(le.classes_)
X_test = field_to_columns(X_test, 'Category', categories)

# Select the submission columns by name instead of by a hard-coded position.
# The number of feature columns in X_test depends on which encoding cells were
# run, so a fixed offset can leak feature columns into the file.
# The first column of X_test is kept as the row identifier (presumably the
# test file's id column -- verify against the data).
submission_cols = [X_test.columns[0]] + categories
X_test[submission_cols].to_csv("submission.csv", index = False)