# Predicting crime in San Francisco with machine learning (Kaggle 2015)

In [51]:
import zipfile

import matplotlib as mpl
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn_pandas import DataFrameMapper

In [52]:
%matplotlib inline

In [53]:
DATADIR = "./data/"
CLIP_BOX = (-122.3366, -122.5247, 37.699, 37.8299)

In [54]:
train, test = None, None

with zipfile.ZipFile(DATADIR + "train.csv.zip", "r") as zf:
        train = pd.read_csv(zf.open("train.csv"), parse_dates=['Dates'])
        
with zipfile.ZipFile(DATADIR + "test.csv.zip", "r") as zf:
        test = pd.read_csv(zf.open("test.csv"), parse_dates=['Dates'])

In [55]:
for dataset in (train, test):
    dataset.columns = dataset.columns.str.lower()
    print "--- rawdata ---"
    print "columns: %s" % ", ".join(dataset.columns)
    print "items: %d, lat: %f/%f long: %f/%f\n" % (dataset.size, dataset["x"].max(), dataset["x"].min(), dataset["y"].max(), dataset["y"].min())
    
    # Mangling data
    dataset["x"] = dataset[dataset.x<CLIP_BOX[0]].x
    dataset["x"] = dataset[dataset.x>CLIP_BOX[1]].x
    dataset["y"] = dataset[dataset.y>CLIP_BOX[2]].y
    dataset["y"] = dataset[dataset.y<CLIP_BOX[3]].y
    
    dataset["hour"] = dataset["dates"].map(lambda x: x.hour)
    dataset["weekday"] = dataset["dates"].map(lambda x: x.weekday())
    dataset["day"] = dataset["dates"].map(lambda x: x.day)
    dataset["month"] = dataset["dates"].map(lambda x: x.month)
    dataset["year"] = dataset["dates"].map(lambda x: x.year)

    print "--- data ---"
    print "columns: %s" % ", ".join(dataset.columns)
    print "items: %d, lat: %f/%f long: %f/%f\n" % (dataset.size, dataset["x"].max(), dataset["x"].min(), dataset["y"].max(), dataset["y"].min())

--- rawdata ---
columns: dates, category, descript, dayofweek, pddistrict, resolution, address, x, y
items: 7902441, lat: -120.500000/-122.513642 long: 90.000000/37.707879

--- data ---
columns: dates, category, descript, dayofweek, pddistrict, resolution, address, x, y, hour, weekday, day, month, year
items: 12292686, lat: -122.364937/-122.513642 long: 37.819975/37.707879

--- rawdata ---
columns: id, dates, dayofweek, pddistrict, address, x, y
items: 6189834, lat: -120.500000/-122.513642 long: 90.000000/37.707879

--- data ---
columns: id, dates, dayofweek, pddistrict, address, x, y, hour, weekday, day, month, year
items: 10611144, lat: -122.364751/-122.513642 long: 37.820621/37.707879



In [56]:
train

Unnamed: 0,dates,category,descript,dayofweek,pddistrict,resolution,address,x,y,hour,weekday,day,month,year
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23,2,13,5,2015
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23,2,13,5,2015
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23,2,13,5,2015
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23,2,13,5,2015
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23,2,13,5,2015
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431,23,2,13,5,2015
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,23,2,13,5,2015
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,23,2,13,5,2015
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601,23,2,13,5,2015
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802,23,2,13,5,2015


In [57]:
test

Unnamed: 0,id,dates,dayofweek,pddistrict,address,x,y,hour,weekday,day,month,year
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,23,6,10,5,2015
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,23,6,10,5,2015
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,23,6,10,5,2015
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,23,6,10,5,2015
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412,23,6,10,5,2015
5,5,2015-05-10 23:40:00,Sunday,TARAVAL,BROAD ST / CAPITOL AV,-122.459024,37.713172,23,6,10,5,2015
6,6,2015-05-10 23:30:00,Sunday,INGLESIDE,100 Block of CHENERY ST,-122.425616,37.739351,23,6,10,5,2015
7,7,2015-05-10 23:30:00,Sunday,INGLESIDE,200 Block of BANKS ST,-122.412652,37.739750,23,6,10,5,2015
8,8,2015-05-10 23:10:00,Sunday,MISSION,2900 Block of 16TH ST,-122.418700,37.765165,23,6,10,5,2015
9,9,2015-05-10 23:10:00,Sunday,CENTRAL,TAYLOR ST / GREEN ST,-122.413935,37.798886,23,6,10,5,2015


In [58]:
train_view = train[["category", "x", "y", "hour", "weekday", "day", "month", "year"]]
test_view = test[["id", "x", "y", "hour", "weekday", "day", "month", "year"]]

In [59]:
train_mapper = DataFrameMapper([
    ("category", sk.preprocessing.LabelBinarizer()),
    ("x", None),
    ("y", None),
    ("hour", None),
    ("weekday", None),
    ("day", None),
    ("month", None),
    ("year", None)
])

train_mapper.fit_transform(train_view.copy())

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.30000000e+01,   5.00000000e+00,   2.01500000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.30000000e+01,   5.00000000e+00,   2.01500000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.30000000e+01,   5.00000000e+00,   2.01500000e+03],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.00000000e+00,   1.00000000e+00,   2.00300000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.00000000e+00,   1.00000000e+00,   2.00300000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.00000000e+00,   1.00000000e+00,   2.00300000e+03]])