In [1]:
#import libraries
import numpy as np
import scipy as scp
import matplotlib as mpl
from sklearn import tree
from sklearn import cross_validation
from sklearn import metrics
from sklearn import calibration
import matplotlib.pyplot as plt
import pandas
from pandas import DataFrame
import gzip
import re
from sklearn.externals.six import StringIO
%matplotlib inline

In [3]:
# Load the raw training and test sets.
# Forward slashes are used in the relative paths so that the notebook runs on
# Windows as well as on POSIX systems (a backslash is not a path separator there).
train_data_frame = pandas.read_csv("data/train.csv", sep=",", quotechar='"')
test = pandas.read_csv("data/test.csv", sep=",", quotechar='"')

# Extract year, month and hour and create "Year", "Month", "Hour" columns in train_data_frame

def parse_date(date):
    """
    Extracts the year, month and hour from a timestamp string.
    :param date: date as formatted in training set, i.e. "YYYY-MM-DD HH:MM:SS"
    :return: list of three ints, [Year, Month, Hour]
    :raises ValueError: if date does not match the expected format
    """
    mo = re.search(r'^([0-9]{4})-([0-9]{2})-[0-9]{2}\s+([0-9]{2}):[0-9]{2}:[0-9]{2}$', date)
    if mo is None:
        # Fail with an explicit message instead of an AttributeError on None
        raise ValueError("Unexpected date format: %r" % (date,))
    # Return a concrete list rather than a lazy map object, which can only be
    # consumed once and cannot be indexed or compared.
    return [int(mo.group(1)), int(mo.group(2)), int(mo.group(3))]

# Extract 'Year', 'Month' and 'Hour' columns for later use
train_data_frame['Year'],train_data_frame['Month'],train_data_frame['Hour'] = zip(*train_data_frame.loc[:,'Dates'].map(parse_date))
test['Year'],test['Month'],test['Hour'] = zip(*test.loc[:,'Dates'].map(parse_date))

# Clean data : remove outliers (rows with Y >= 38) from the dataframe.
# .copy() makes the filtered frame independent of train_data_frame, so the
# column assignment below does not raise a SettingWithCopyWarning.
train_data_frame_ret = train_data_frame[train_data_frame.Y < 38].copy()
#train_data_frame_ret.shape

# Create dictionary mapping each day of week (key) to an integer index (value)
dayOfWeekList = np.unique(train_data_frame_ret.DayOfWeek)
indexDayList = list(range(len(dayOfWeekList)))
dayOfWeekDict = dict(zip(dayOfWeekList, indexDayList))

# Create dictionary mapping each police district (key) to an integer index (value)
districtList = np.unique(train_data_frame_ret.PdDistrict)
indexDistrictList = list(range(len(districtList)))
districtDict = dict(zip(districtList, indexDistrictList))

# Encode DayOfWeek as integers. Every value has a key in dayOfWeekDict because
# the dictionary was built from this very column.
train_data_frame_ret["DayOfWeek"] = train_data_frame_ret["DayOfWeek"].map(dayOfWeekDict)
# PdDistrict is left as strings here so that pandas.get_dummies can one-hot encode it:
# train_data_frame_ret["PdDistrict"] = train_data_frame_ret["PdDistrict"].map(districtDict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  regex=regex)


In [4]:
def convertToSubmittable(prediction):
    """
    One-hot encodes a sequence of predicted Crime Category indices.
    Each row of the result is all 0s except for a single 1 placed in the column
    whose position equals the predicted index for that row.
    :param prediction: 1 dimensional ndarray of integer category indices
                       (valid column positions of crimeCategoryList)
    :return submittable: 2 dimensional ndarray with {columns : Crime Categories, rows : Predictions}
    """
    n_rows = len(prediction)
    n_categories = len(crimeCategoryList)
    one_hot = np.zeros((n_rows, n_categories))
    for row in range(n_rows):
        # flag the predicted category of this row
        one_hot[row][prediction[row]] = 1
    return one_hot

    
def prepend(filename, line):
    """
    Inserts text at the very beginning of a file, in place.
    Trailing carriage returns / newlines are stripped from line and no separator
    is added, so the text lands in front of the file's first cell.
    :type filename: String
    :type line: String
    :param filename: path to csv file
    :param line: String to be added
    """
    # 'r+' opens the existing file for both reading and writing
    with open(filename, 'r+') as handle:
        existing = handle.read()
        # rewind to the start before writing the new prefix followed by the old content
        handle.seek(0)
        handle.write(line.rstrip('\r\n') + existing)
        
def tocsv(submission,filename):
    """
    Writes a 2d array of per-category values to a csv file.
    Columns are named after the unique values of train_data_frame["Category"],
    paired in order with the columns of submission.
    :type submission: ndarray
    :param submission: array with one column per Crime Category
    :param filename: path to csv file
    """
    category_names = np.unique(train_data_frame["Category"])
    # submission.T iterates over the columns of submission
    named_columns = {name: column for name, column in zip(category_names, submission.T)}
    frame = DataFrame(named_columns)
    frame.to_csv(filename)


def gzipcompress(filename):
    """
    Writes a gzip-compressed copy of a file in the same folder.
    The output path is filename with its last three characters replaced by "gz".
    :param filename: path to csv file
    """
    with open(filename, 'rb') as raw, gzip.open(filename[:-3]+"gz", 'wb') as packed:
        # stream the source into the archive line by line
        for chunk in raw:
            packed.write(chunk)

In [18]:
# Summary statistics of the numeric columns of the outlier-filtered training frame
train_data_frame_ret.describe()

Unnamed: 0,DayOfWeek,X,Y,Year,Month,Hour
count,877982.0,877982.0,877982.0,877982.0,877982.0,877982.0
mean,2.990223,-122.422763,37.767035,2008.712378,6.436416,13.412737
std,2.026094,0.025285,0.024165,3.631126,3.428998,6.549521
min,0.0,-122.513642,37.707879,2003.0,1.0,0.0
25%,1.0,-122.432952,37.752427,2006.0,3.0,9.0
50%,3.0,-122.41642,37.775421,2009.0,6.0,14.0
75%,5.0,-122.406959,37.784368,2012.0,9.0,19.0
max,6.0,-122.364937,37.819975,2015.0,12.0,23.0


**Create a maximum size decision tree model**

features : Hour, PdDistrict, Month, Year and DayOfWeek

criterion : entropy

In [None]:
# One-hot encode the non-numeric columns (PdDistrict); numeric columns pass through unchanged
df=pandas.get_dummies(train_data_frame_ret[["Hour","PdDistrict","Month","Year","DayOfWeek"]])
print("dummies created !")

# min_samples_split=2 grows the same maximum-size tree as 1 (a node holding a
# single sample cannot be split) and is the smallest integer accepted by
# recent scikit-learn versions.
treeClassifier = tree.DecisionTreeClassifier(min_samples_split=2, criterion="entropy")
treeClassifier.fit(df,train_data_frame_ret["Category"])

**Compute feature importance**

In [9]:
# Print the encoded feature names, the fitted tree's per-feature importances
# and the underlying low-level tree object
print(df.columns,treeClassifier.feature_importances_,treeClassifier.tree_)

Index(['Hour', 'Month', 'Year', 'DayOfWeek', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object') [ 0.19176748  0.37144943  0.13494443  0.25034721  0.00617428  0.00413198
  0.00477434  0.0063097   0.00367454  0.00170108  0.00139294  0.0052901
  0.00155421  0.01648829] <sklearn.tree._tree.Tree object at 0x000000000AA51030>


**Create a tree where min_samples_split is limited**

features : Hour, Month, PdDistrict, Year, DayOfWeek

min_samples_split : 1000

criterion : entropy

In [14]:
# One-hot encode the non-numeric columns of the selected features
df = pandas.get_dummies(
    train_data_frame_ret.loc[:, ["Hour", "Month", "PdDistrict", "Year", "DayOfWeek"]]
)

# Entropy-based tree that refuses to split nodes holding fewer than 1000 samples
treeClassifier = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=1000)
treeClassifier.fit(df, train_data_frame_ret["Category"].values)

dummies created !
tree fitted 1000


**Infer feature importances**

In [15]:
# Print the encoded feature names, the fitted tree's per-feature importances
# and the underlying low-level tree object
print(df.columns, treeClassifier.feature_importances_, treeClassifier.tree_)

Index(['Hour', 'Month', 'Year', 'DayOfWeek', 'PdDistrict_BAYVIEW',
       'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE', 'PdDistrict_MISSION',
       'PdDistrict_NORTHERN', 'PdDistrict_PARK', 'PdDistrict_RICHMOND',
       'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN'],
      dtype='object') [ 0.27211998  0.06482294  0.19912504  0.06810939  0.0497262   0.03173351
  0.03814956  0.05105646  0.02895051  0.01152143  0.00351809  0.04280615
  0.00494173  0.133419  ] <sklearn.tree._tree.Tree object at 0x000000000AA51030>


## Evolution of feature importances with max_depth

**Compute feature importances for various max_depth**

features : X, Y, Hour, Year, Month, DayOfWeek, PdDistrict

criterion : gini (scikit-learn default; the cell below does not set `criterion`)

In [27]:
# Build the feature matrix once, outside the loop. PdDistrict holds district
# names, which a decision tree cannot consume directly, so it is encoded with
# the integer indices from districtDict (values that are already numeric are
# left untouched by replace).
depth_features = train_data_frame_ret[["X","Y","Hour","Year","Month","DayOfWeek","PdDistrict"]].replace(
    {"PdDistrict": districtDict})

# Fit one tree per maximum depth (5, 15, ..., 95) and print its feature
# importances, in the column order of depth_features.
for i in range(5,100,10):
    treeClassifier = tree.DecisionTreeClassifier(max_depth = i)
    treeClassifier.fit(depth_features, train_data_frame_ret.Category)
    print(i, treeClassifier.feature_importances_)

5 [ 0.26366765  0.37196852  0.11979448  0.04772893  0.          0.
  0.19684042]
15 [ 0.29893681  0.30681342  0.11880525  0.10159854  0.07088091  0.05469953
  0.04826553]
25 [ 0.17471949  0.1773567   0.156609    0.13936903  0.19256236  0.14714102
  0.0122424 ]
35 [ 0.16859475  0.17185583  0.15906207  0.1391979   0.19871552  0.15191511
  0.01065881]
45 [ 0.16889298  0.17195007  0.15911528  0.13940831  0.19779193  0.15228168
  0.01055976]
55 [ 0.16876011  0.17181752  0.15919254  0.13941448  0.19815171  0.1520592
  0.01060443]
65 [ 0.16873902  0.17200152  0.15900586  0.13937397  0.19870958  0.15147672
  0.01069334]
75 [ 0.16846581  0.17188707  0.15884646  0.13938028  0.19846211  0.15238177
  0.0105765 ]
85 [ 0.16849553  0.1719533   0.15895956  0.13892878  0.19853714  0.15252678
  0.01059891]
95 [ 0.1690646   0.17190049  0.15949324  0.13916997  0.19806917  0.15175061
  0.01055191]


**Compute feature importances for various max_depth**

features : X, Y, Hour, Year, Month, DayOfWeek, PdDistrict as Dummy

criterion : gini (scikit-learn default; the cell below does not set `criterion`)

In [36]:
# One-hot encode only PdDistrict. `columns` expects a list of column names;
# the remaining columns are kept as they are and come first in the result.
df = pandas.get_dummies(train_data_frame_ret[["PdDistrict","Hour","DayOfWeek","Month","Year","X","Y"]],
                        columns=["PdDistrict"])
# Fit one tree per maximum depth (5, 15, ..., 95) and print its feature
# importances, in the column order of df.
for i in range(5,100,10):
    treeClassifier = tree.DecisionTreeClassifier(max_depth = i)
    treeClassifier.fit(df, train_data_frame_ret.Category)
    print(i,treeClassifier.feature_importances_)

5 [ 0.11979448  0.          0.          0.04772893  0.26366765  0.37196852
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.19684042]
15 [  1.18512971e-01   5.46754541e-02   7.10303606e-02   1.02438469e-01
   2.98354837e-01   3.06109460e-01   1.19253563e-04   6.79598694e-04
   2.02627927e-03   1.40682909e-03   8.73004409e-04   1.22401552e-03
   2.07334755e-03   7.10283159e-04   1.63436504e-04   3.96024001e-02]
25 [ 0.15718548  0.14689882  0.19226618  0.13852346  0.17466444  0.17677614
  0.00042757  0.00067943  0.00073478  0.00093186  0.00103542  0.00090624
  0.00070371  0.00107265  0.00037338  0.00682044]
35 [ 0.1590048   0.15134692  0.19810379  0.1395856   0.16841679  0.17132439
  0.00050762  0.00063315  0.00079811  0.00094083  0.00110004  0.00102358
  0.00060647  0.0010288   0.00038919  0.00518993]
45 [ 0.15906699  0.15169752  0.19800967  0.13914546  0.16803888  0.17157593
  0.00051106  0.00062926  0.0008449   0.00096525  0.0011

**Compute feature importances for various max_depth without location redundancy (X,Y)**

features : Hour, Year, Month, DayOfWeek, PdDistrict as Dummy

criterion : gini (scikit-learn default; the cell below does not set `criterion`)

In [37]:
# One-hot encode only PdDistrict. `columns` expects a list of column names;
# the remaining columns are kept as they are and come first in the result.
df = pandas.get_dummies(train_data_frame_ret[["PdDistrict","Hour","DayOfWeek","Month","Year"]],
                        columns=["PdDistrict"])
# Fit one tree per maximum depth (5, 15, ..., 95) and print its feature
# importances, in the column order of df.
for i in range(5,100,10):
    treeClassifier = tree.DecisionTreeClassifier(max_depth = i)
    treeClassifier.fit(df,train_data_frame_ret.Category)
    print(i,treeClassifier.feature_importances_)

5 [ 0.18154729  0.00114275  0.          0.1111282   0.14141142  0.
  0.11677021  0.12640096  0.          0.          0.          0.          0.
  0.32159916]
15 [ 0.21686309  0.17797901  0.21754149  0.16599987  0.03308995  0.01080746
  0.02732396  0.02957753  0.00769772  0.009619    0.00529578  0.01158329
  0.01136839  0.07525346]
25 [ 0.19617245  0.25228653  0.35405694  0.14093757  0.0070332   0.00354632
  0.00580765  0.00628664  0.00286501  0.00371425  0.00263617  0.00450607
  0.00415622  0.01599497]
35 [ 0.19446952  0.25095945  0.36078556  0.1403522   0.00651257  0.00338136
  0.00537775  0.00582128  0.0027704   0.00358363  0.00255195  0.0046158
  0.00400757  0.01481096]
45 [ 0.19459826  0.25099091  0.36055143  0.1404782   0.00651257  0.00327987
  0.00537775  0.00582128  0.0026191   0.00369956  0.00273462  0.00453201
  0.00399348  0.01481096]
55 [ 0.19445603  0.25113916  0.36049872  0.14047571  0.00651257  0.00332055
  0.00537775  0.00582128  0.00263045  0.0037381   0.00260461  0.004