First attempt at pulling data from Kaggle and recoding categorical variables of interest into dummy variables.

In [3]:
%matplotlib inline

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data Structure
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Others
from datetime import datetime



In [4]:
# Load the data and examine it
crime_data = pd.read_csv("./train.csv")
print "Shape of crime_data:", crime_data.shape
crime_data.head()

Shape of crime_data: (878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
test_data = pd.read_csv("./test.csv")
print "Shape of test_data:", test_data.shape
test_data.head()

Shape of test_data: (884262, 7)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [115]:
# At the second step, we want to see the basic info of our data.
crime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [10]:
# Show all available crime lables.
print crime_data.groupby('Category').count()

                              Dates  Descript  DayOfWeek  PdDistrict  \
Category                                                               
ARSON                          1513      1513       1513        1513   
ASSAULT                       76876     76876      76876       76876   
BAD CHECKS                      406       406        406         406   
BRIBERY                         289       289        289         289   
BURGLARY                      36755     36755      36755       36755   
DISORDERLY CONDUCT             4320      4320       4320        4320   
DRIVING UNDER THE INFLUENCE    2268      2268       2268        2268   
DRUG/NARCOTIC                 53971     53971      53971       53971   
DRUNKENNESS                    4280      4280       4280        4280   
EMBEZZLEMENT                   1166      1166       1166        1166   
EXTORTION                       256       256        256         256   
FAMILY OFFENSES                 491       491        491        

In [65]:
# Convert the date into a python datetime object.
crime_data["Dates"] = pd.to_datetime(crime_data["Dates"], format="%Y-%m-%d %H:%M:%S")
test_data["Dates"] = pd.to_datetime(test_data["Dates"], format="%Y-%m-%d %H:%M:%S")

In [66]:
# Add 24 binary features representing the happening hour of the crime.
hour_map = {0:'Zero', 1:'One', 2:'Two', 3:'Three', 4:'Four', 5:'Five', 6:'Six', 7:'Seven', 8:'Eight', 9:'Nine', 
            10:'Ten', 11:'Eleven', 12:'Twelve', 13:'Thirteen', 14:'Fourteen', 15:'Fifteen', 16:'Sixteen', 
            17:'Seventeen', 18:'Eighteen', 19:'Nineteen', 20:'Twenty', 21:'TwentyOne', 22:'TwentyTwo', 
            23:'TwentyThree'}

crime_data["Hour"] = crime_data["Dates"].apply(lambda x: x.hour)
for hour in crime_data["Hour"].unique():
    crime_data[hour_map[hour]] = pd.Series(crime_data["Hour"] == hour, dtype=int)
     
test_data["Hour"] = test_data["Dates"].apply(lambda x: x.hour)
for hour in test_data["Hour"].unique():
    test_data[hour_map[hour]] = pd.Series(test_data["Hour"] == hour, dtype=int)
    
crime_data.iloc[:5,-24:]

Unnamed: 0,TwentyThree,TwentyTwo,TwentyOne,Twenty,Nineteen,Eighteen,Seventeen,Sixteen,Fifteen,Fourteen,...,Nine,Eight,Seven,Six,Five,Four,Three,Two,One,Zero
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Add 12 binary features representing the happening month of the crime.
month_map = {1:'January', 2:'February ', 3:'March', 4:'April', 5:'May', 6:'June', 7:'July', 8:'August', 9:'September',
             10:'October', 11:'November', 12:'December'}

crime_data["Month"] = crime_data["Dates"].apply(lambda x: x.month)
for month in crime_data["Month"].unique():
    crime_data[month_map[month]] = pd.Series(crime_data["Month"] == month, dtype=int)
    
test_data["Month"] = test_data["Dates"].apply(lambda x: x.month)
for month in test_data["Month"].unique():
    test_data[month_map[month]] = pd.Series(test_data["Month"] == month, dtype=int)
    
crime_data.iloc[:5,-12:]

Unnamed: 0,May,April,March,February,January,December,November,October,September,August,July,June
0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0


In [68]:
# Add 12 binary features representing the happening year of the crime.
crime_data["Year"] = crime_data["Dates"].apply(lambda x: x.year)
for year in crime_data["Year"].unique():
    crime_data[year] = pd.Series(crime_data["Year"] == year, dtype=int)
    
test_data["Year"] = test_data["Dates"].apply(lambda x: x.year)
for year in test_data["Year"].unique():
    test_data[year] = pd.Series(test_data["Year"] == year, dtype=int)

crime_data.iloc[:5,-13:]

Unnamed: 0,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003
0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0


In [69]:
# Add 7 binary features representing the happening date of week of the crime.
for day_of_week in crime_data["DayOfWeek"].unique():
    crime_data[day_of_week] = pd.Series(crime_data["DayOfWeek"] == day_of_week, dtype=int)
    
for day_of_week in test_data["DayOfWeek"].unique():
    test_data[day_of_week] = pd.Series(test_data["DayOfWeek"] == day_of_week, dtype=int)
    
# Add 10 binary features representing the district of the crime.
for district in crime_data["PdDistrict"].unique():
    crime_data[district] = pd.Series(crime_data["PdDistrict"] == district, dtype=int)
    
for district in test_data["PdDistrict"].unique():
    test_data[district] = pd.Series(test_data["PdDistrict"] == district, dtype=int)

In [70]:
# Drop the unnecessary feature columns from the train data.
crime_data = crime_data.drop(['Hour', 'Month', 'Year', 'Dates', 'DayOfWeek', 'PdDistrict', 
                              'Descript', 'Resolution', 'Address', 'X', 'Y' ], axis=1)
crime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 67 columns):
Category       878049 non-null object
TwentyThree    878049 non-null int64
TwentyTwo      878049 non-null int64
TwentyOne      878049 non-null int64
Twenty         878049 non-null int64
Nineteen       878049 non-null int64
Eighteen       878049 non-null int64
Seventeen      878049 non-null int64
Sixteen        878049 non-null int64
Fifteen        878049 non-null int64
Fourteen       878049 non-null int64
Thirteen       878049 non-null int64
Twelve         878049 non-null int64
Eleven         878049 non-null int64
Ten            878049 non-null int64
Nine           878049 non-null int64
Eight          878049 non-null int64
Seven          878049 non-null int64
Six            878049 non-null int64
Five           878049 non-null int64
Four           878049 non-null int64
Three          878049 non-null int64
Two            878049 non-null int64
One            878049 non-null int64


In [26]:
# Before we train the models, we need to divide the data into train data and dev data.
shuffle = np.random.permutation(np.arange(crime_data.shape[0]))

crime_data = crime_data.iloc[shuffle]

train_data = crime_data.iloc[:700000].drop('Category', axis=1)
train_labels = crime_data.iloc[:700000]['Category']

dev_data = crime_data.iloc[700000:].drop('Category', axis=1)
dev_labels = crime_data.iloc[700000:]['Category']

mini_train_data = crime_data.iloc[:10000].drop('Category', axis=1)
mini_train_labels = crime_data.iloc[:10000]['Category']

mini_dev_data = crime_data.iloc[10000:11000].drop('Category', axis=1)
mini_dev_labels = crime_data.iloc[10000:11000]['Category']

print crime_data.shape, train_data.shape, train_labels.shape, dev_data.shape, dev_labels.shape
print mini_train_data.shape, mini_train_labels.shape, mini_dev_data.shape, mini_dev_labels.shape

(878049, 67) (700000, 66) (700000,) (178049, 66) (178049,)
(10000, 66) (10000,) (1000, 66) (1000,)


In [33]:
# Now we train some most common models to establish the baseline.

# Start with the Logistic Regression
logreg = LogisticRegression()
logreg.fit(mini_train_data, mini_train_labels)
pred_labels = logreg.predict(mini_dev_data)
logreg_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Logistic Regression model is %.3f" % (logreg_accuracy)

The accuracy of Logistic Regression model is 0.238


In [34]:
# Support Vector Machines
svc = SVC()
svc.fit(mini_train_data, mini_train_labels)
pred_labels = svc.predict(mini_dev_data)
svc_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Support Vector Machines model is %.3f" % (svc_accuracy)

The accuracy of Support Vector Machines model is 0.228


In [35]:
# K Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(mini_train_data, mini_train_labels)
pred_labels = knn.predict(mini_dev_data)
knn_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of K Nearest Neighbors model is %.3f" % (knn_accuracy)

The accuracy of K Nearest Neighbors model is 0.123


In [36]:
# Naive Bayes
gaussian = GaussianNB()
gaussian.fit(mini_train_data, mini_train_labels)
pred_labels = gaussian.predict(mini_dev_data)
nb_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Naive Bayes model is %.3f" % (nb_accuracy)

The accuracy of Naive Bayes model is 0.003


In [37]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(mini_train_data, mini_train_labels)
pred_labels = decision_tree.predict(mini_dev_data)
dt_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Decision Tree model is %.3f" % (dt_accuracy)

The accuracy of Decision Tree model is 0.125


In [38]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(mini_train_data, mini_train_labels)
pred_labels = random_forest.predict(mini_dev_data)
rf_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Random Forest model is %.3f" % (rf_accuracy)

The accuracy of Random Forest model is 0.183


In [41]:
# Neural Network
ann = MLPClassifier()
ann.fit(mini_train_data, mini_train_labels)
pred_labels = ann.predict(mini_dev_data)
ann_accuracy = np.mean(mini_dev_labels == pred_labels)
print "The accuracy of Neural Network model is %.3f" % (ann_accuracy)

The accuracy of Neural Network model is 0.179




In [42]:
# Display the rank of the models
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Neural Network', 'Decision Tree'],
    'Accuracy': [svc_accuracy, knn_accuracy, logreg_accuracy, 
              rf_accuracy, nb_accuracy, ann_accuracy, dt_accuracy]})
models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Model
2,0.238,Logistic Regression
0,0.228,Support Vector Machines
3,0.183,Random Forest
5,0.179,Neural Network
6,0.125,Decision Tree
1,0.123,KNN
4,0.003,Naive Bayes


In [44]:
# We will use the Logistic Regression to establish the baseline.

# Search for the optimal C value
cv = GridSearchCV(LogisticRegression(), {'C' : [0.001, 0.01, 0.1, 0.5, 1.0, 2.0]})
cv.fit(mini_train_data, mini_train_labels)
optimal_c = cv.best_params_['C']
print 'The optimal C value is: ', optimal_c

# Retrain the Logistic Regression model with full train data
logreg = LogisticRegression(C=optimal_c)
logreg.fit(train_data, train_labels)

The optimal C value is:  0.1


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
# Display the prediction accuracy with full dev data
pred_labels = logreg.predict(dev_data)
logreg_accuracy = np.mean(dev_labels == pred_labels)
print "The accuracy of Logistic Regression model is %.3f" % (logreg_accuracy)

The accuracy of Logistic Regression model is 0.228


In [71]:
# Import the test data and use the selected model to predict
test_pred = logreg.predict(test_data.drop(['Hour', 'Month', 'Year', 'Dates', 'DayOfWeek', 'PdDistrict', 
                            'Id', 'Address', 'X', 'Y' ], axis=1))