# San Francisco Crime Classification

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shapefile
import time

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report



## Import Data
Import the data and check the shape of the train and test sets.
We want to determine how to split the train set into train and dev sets.

In [2]:
import csv
from datetime import datetime
from time import time
def _read_csv_rows(path):
    """Return every row of the CSV file at `path` as a list of lists of strings."""
    # newline='' hands newline handling to the csv module, so a quoted field
    # that contains a line break is parsed as one field instead of being split.
    with open(path, 'rt', newline='') as f:
        return list(csv.reader(f))

ff = "./Data/train.csv" # you will need to edit this directory
train_data = _read_csv_rows(ff)
ff = "./Data/test.csv" # you will need to edit this directory
test_data = _read_csv_rows(ff)

#Convert to pandas data frame for better analysis
# The first row of each file is used as the header; all cell values are still
# strings at this point because they come straight from csv.reader.
train_data_full = pd.DataFrame(train_data[1:], columns = train_data[0])
test_data = pd.DataFrame(test_data[1:], columns = test_data[0])
print ("Shape of train data", train_data_full.shape)
print ("Shape of test data", test_data.shape)

Shape of train data (878049, 9)
Shape of test data (884262, 7)


Run basic totals

In [3]:
# Number of training records per crime category.
category_count = train_data_full.groupby(['Category'])['Dates'].count()

# Sort once per direction and reuse the result in the reports below.
most_frequent_first = category_count.sort_values(ascending=False)
least_frequent_first = category_count.sort_values(ascending=True)

print ("Training Data: Top 5 crimes")
print (most_frequent_first.head(5))
print ("")

print ("Training Data: Lowest 5 crimes")
print (least_frequent_first.head(5))
print ("")

print ("Training Data: All Category Counts")
print (most_frequent_first)
print ("")

print ("Training Data: Cumulative Sum of Counts for Top 5 Crimes")
print (most_frequent_first.head(5).cumsum())

# Observation: the top 5 crimes make up over 50% of the training data set.
# Focusing on predicting these as accurately as possible may give us a better
# accuracy score overall.

Training Data: Top 5 crimes
Category
LARCENY/THEFT     174900
OTHER OFFENSES    126182
NON-CRIMINAL       92304
ASSAULT            76876
DRUG/NARCOTIC      53971
Name: Dates, dtype: int64

Training Data: Lowest 5 crimes
Category
TREA                           6
PORNOGRAPHY/OBSCENE MAT       22
GAMBLING                     146
SEX OFFENSES NON FORCIBLE    148
EXTORTION                    256
Name: Dates, dtype: int64

Training Data: All Category Counts
Category
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SEC

Converting X and Y coordinates into San Francisco neighborhoods using Zillow's neighborhood-boundary database, which is distributed as shapefiles

Shapefiles can be read via the pyshp package (https://pypi.python.org/pypi/pyshp)

https://www.zillow.com/howto/api/neighborhood-boundaries.htm

In [4]:
#read in the shapefile provided by zillow for california
sf = shapefile.Reader("./Data/ZillowNeighborhoods-CA.shp")
# Skip the first field descriptor so the remaining names line up with the
# values in each record (presumably pyshp's deletion-flag entry -- confirm
# against the pyshp documentation).
fields = sf.fields[1:]
field_names = [field[0] for field in fields]
nhood_list = []

#iterate through the shapefile records and keep, for every San Francisco
#neighborhood, its attribute dict together with its bounding box (bbox).
#bbox: lower left (x,y) and upper right (x,y) corners of a box drawn around
#all points of the shape. For a Null shape (shapeType == 0) accessing bbox
#raises an AttributeError.
#
#The bbox is later used to decide whether an X/Y coordinate from the training
#data falls inside a neighborhood.
for r in sf.shapeRecords():
    atr = dict(zip(field_names, r.record))
    if atr['City'] != 'San Francisco':
        continue
    # Only touch the bbox of records we actually keep, so a Null shape that
    # belongs to some other California city cannot abort the whole load.
    bbox = r.shape.bbox
    nhood_list.append(dict(properties=atr, bbox=bbox))
        
def coord_in_bbox(bbox, X, Y):
    """Tell whether the point (X, Y) lies inside `bbox`, edges included.

    bbox is indexed as (min_x, min_y, max_x, max_y).
    Returns True or False.
    """
    return bool(bbox[0] <= X <= bbox[2] and bbox[1] <= Y <= bbox[3])

def neighborhood(nhood_list, X, Y):
    """Return the name of the first neighborhood whose bbox contains (X, Y).

    nhood_list: list of dicts with a 'bbox' entry and a 'properties' dict
        that has a 'Name' key.
    X, Y: coordinates; anything float() accepts (numbers or numeric strings).

    Entries are tested in list order, so if several bounding boxes contain
    the point the earliest entry wins. Returns None when no box matches.
    """
    # Convert once up front rather than on every loop iteration.
    X = float(X)
    Y = float(Y)
    for n in nhood_list:
        if coord_in_bbox(n['bbox'], X, Y):
            return n['properties']['Name']
    return None

#Add neighborhood to train data
# Walk the two coordinate columns in lockstep instead of using a row-wise
# DataFrame.apply(axis=1), which has to build a Series object for every row.
# The resulting values are the same, one per row in row order.
train_data_full['Neighborhood'] = [
    neighborhood(nhood_list, x_coord, y_coord)
    for x_coord, y_coord in zip(train_data_full['X'], train_data_full['Y'])
]
        



Neighborhood Counts

In [5]:
# Number of training records per assigned neighborhood.
neighborhood_count = train_data_full.groupby(['Neighborhood'])['Dates'].count()

# Sort once per direction and reuse the result in the reports below.
busiest_first = neighborhood_count.sort_values(ascending=False)
quietest_first = neighborhood_count.sort_values(ascending=True)

print ("Training Data: Top 5 Criminal Neighborhoods")
print (busiest_first.head(5))
print ("")

print ("Training Data: Lowest 5 Criminal Neighborhoods")
print (quietest_first.head(5))
print ("")

print ("Training Data: All Neighborhood Counts")
print (busiest_first)
print ("")

print ("Training Data: Cumulative Sum of Counts for Top 5 Neighborhoods")
print (busiest_first.head(5).cumsum())

# Observation: the top 5 neighborhoods make up almost 50% of the training data set.
# Next step: bring in the category counts by neighborhood to see how the two relate.

Training Data: Top 5 Criminal Neighborhoods
Neighborhood
South of Market    174414
Downtown            90894
Mission             83617
South Beach         36298
Bayview             32651
Name: Dates, dtype: int64

Training Data: Lowest 5 Criminal Neighborhoods
Neighborhood
Monterey Heights     14
Clarendon Heights    15
Buena Vista Park     72
Balboa Terrace       78
Treasure Island      93
Name: Dates, dtype: int64

Training Data: All Neighborhood Counts
Neighborhood
South of Market                             174414
Downtown                                     90894
Mission                                      83617
South Beach                                  36298
Bayview                                      32651
Bernal Heights                               28829
Hunters Point                                22562
Hayes Valley                                 22519
Western Addition                             21963
Excelsior                                    21074
Haight-Ashbury   

## Generate Date and Time Features
Season  
Month  
Week  
Day / Day of Week  
Hour

In [6]:
# Parse the raw 'Dates' strings into pandas datetimes, stored as 'FinalDate'.
train_data_full['FinalDate'] = pd.to_datetime(
    train_data_full['Dates'],
    format='%Y-%m-%d %H:%M:%S',
)

In [7]:
#create functions to extract the needed datetime features
def season(date):
    """Map a date to a season code based on its day of the year.

    Day-of-year 80-171 -> 2 (spring), 172-263 -> 3 (summer),
    264-354 -> 4 (fall), everything else -> 1 (winter).
    `date` must provide timetuple() (e.g. datetime or pandas Timestamp).
    """
    day_of_year = date.timetuple().tm_yday
    if day_of_year < 80 or day_of_year >= 355:
        return 1 #Winter
    if day_of_year < 172:
        return 2 #Spring
    if day_of_year < 264:
        return 3 #Summer
    return 4 #Fall
    
def getTimeCat(date):
    """Bucket a timestamp into a time-of-day category.

    Categories are decided at minute resolution (seconds are ignored):
        0 -- morning:    04:01 through 10:00
        1 -- midday:     10:01 through 16:00
        2 -- evening:    16:01 through 23:00
        3 -- late night: 23:01 through 04:00 (wraps past midnight)

    The buckets are contiguous, so every time of day gets a category; in
    particular midnight (00:00) and times with non-zero seconds right after
    a boundary minute (e.g. 10:00:30) are classified instead of falling
    through to an undocumented extra value.

    date: an object exposing `hour` and `minute` (datetime, pandas
        Timestamp), or any value whose str() looks like
        'YYYY-MM-DD HH:MM:SS'.
    Returns: int in {0, 1, 2, 3}.
    Raises: ValueError if a non-datetime value does not match the format.
    """
    if not hasattr(date, 'hour'):
        date = datetime.strptime(str(date), '%Y-%m-%d %H:%M:%S')

    # int() makes a NaN hour/minute (as a missing timestamp may report) raise
    # instead of being silently placed in a bucket.
    minute_of_day = int(date.hour) * 60 + int(date.minute)

    if 4 * 60 < minute_of_day <= 10 * 60:
        return 0 #morning
    if 10 * 60 < minute_of_day <= 16 * 60:
        return 1 #midday
    if 16 * 60 < minute_of_day <= 23 * 60:
        return 2 #evening
    return 3 #late night



In [8]:
# Derive each date/time feature from 'FinalDate'. The list order is the order
# in which the new columns are added to the frame.
datetime_features = [
    ('Season', season),
    ('DayOfMonth', lambda stamp: stamp.day),
    ('Week', lambda stamp: stamp.week),
    ('TimeCat', getTimeCat),
    ('Hour', lambda stamp: stamp.hour),
]
for feature_name, extractor in datetime_features:
    train_data_full[feature_name] = train_data_full['FinalDate'].apply(extractor)

In [9]:
# Print the number of records for every value of each new feature,
# with a blank line between consecutive reports.
count_reports = [
    ("Season Counts", 'Season'),
    ("Day of Month Counts", 'DayOfMonth'),
    ("Week # Counts", 'Week'),
    ("Time Category Counts", 'TimeCat'),
    ("Hour Counts", 'Hour'),
]
for position, (heading, feature) in enumerate(count_reports):
    if position > 0:
        print("")
    print(heading)
    print(train_data_full.groupby([feature])['Dates'].count())

Season Counts
Season
1    221171
2    227590
3    218856
4    210432
Name: Dates, dtype: int64

Day of Month Counts
DayOfMonth
1     32167
2     27471
3     28691
4     29905
5     29557
6     29482
7     29685
8     30339
9     29502
10    28395
11    27952
12    28223
13    28580
14    27670
15    28224
16    28146
17    29031
18    29793
19    30012
20    29963
21    30038
22    30589
23    29547
24    27987
25    26932
26    26870
27    27577
28    27269
29    27108
30    26589
31    14755
Name: Dates, dtype: int64

Week # Counts
Week
2     35857
4     35573
6     35564
8     35249
10    36474
12    35881
14    35797
16    35916
18    35545
20    34292
22    32560
24    31276
26    32698
28    32220
30    32674
32    32941
34    34014
36    33507
38    33974
40    34557
42    33911
44    34681
46    32813
48    30867
50    30431
52    28777
Name: Dates, dtype: int64

Time Category Counts
TimeCat
0    132011
1    266188
2    341211
3    138639
Name: Dates, dtype: int64

Hour Counts


Normalize and Standardize Date and X,Y

In [10]:
def normalize(series_field, df, new_field_name):
    """Min-max scale a column to the range [0, 1] and store it on `df`.

    series_field: column to scale; may hold numbers, numeric strings or
        datetimes (datetimes are scaled via their integer representation).
    df: DataFrame that receives the new column (modified in place).
    new_field_name: name of the column to create or overwrite.
    Returns: the same `df` object, for assignment-style chaining.
    """
    # Series.reshape is not available in current pandas, and the scaler needs
    # a numeric 2-D array, so convert explicitly through numpy.
    values = np.asarray(series_field)
    if np.issubdtype(values.dtype, np.datetime64):
        values = values.astype('int64')
    column = values.astype('float64').reshape(-1, 1)

    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    normalized = scaler.fit_transform(column)
    # Flatten the (n, 1) result so it is assigned as an ordinary 1-D column.
    df[new_field_name] = normalized.ravel()
    return df

def standardize(series_field, df, new_field_name):
    """Standardize a column with StandardScaler and store it on `df`.

    series_field: column to scale; may hold numbers, numeric strings or
        datetimes (datetimes are scaled via their integer representation).
    df: DataFrame that receives the new column (modified in place).
    new_field_name: name of the column to create or overwrite.
    Returns: the same `df` object, for assignment-style chaining.
    """
    # Series.reshape is not available in current pandas, and the scaler needs
    # a numeric 2-D array, so convert explicitly through numpy.
    values = np.asarray(series_field)
    if np.issubdtype(values.dtype, np.datetime64):
        values = values.astype('int64')
    column = values.astype('float64').reshape(-1, 1)

    scaler = preprocessing.StandardScaler()
    standardized = scaler.fit_transform(column)
    # Flatten the (n, 1) result so it is assigned as an ordinary 1-D column.
    df[new_field_name] = standardized.ravel()
    return df

# For each source column add a min-max scaled '<prefix>Norm' column followed by
# a standardized '<prefix>Stand' column, in that order.
for source_column, prefix in (('FinalDate', 'Date'), ('X', 'X'), ('Y', 'Y')):
    train_data_full = normalize(
        train_data_full[source_column], train_data_full, prefix + 'Norm')
    train_data_full = standardize(
        train_data_full[source_column], train_data_full, prefix + 'Stand')



In [11]:
#get a list of features and their data types
# (left as the last expression of the cell so the notebook displays it)
train_data_full.dtypes

Dates                   object
Category                object
Descript                object
DayOfWeek               object
PdDistrict              object
Resolution              object
Address                 object
X                       object
Y                       object
Neighborhood            object
FinalDate       datetime64[ns]
Season                   int64
DayOfMonth               int64
Week                     int64
TimeCat                  int64
Hour                     int64
DateNorm               float64
DateStand              float64
XNorm                  float64
XStand                 float64
YNorm                  float64
YStand                 float64
dtype: object

In [12]:
#convert to matrix for model processing
# DataFrame.as_matrix() is no longer available in current pandas; .values
# yields the same numpy array of the frame's contents.
X_full = train_data_full.values

In [13]:
# Display the (rows, columns) shape of the converted matrix.
X_full.shape

(878049, 22)