# Kaggle: San Francisco Crime Classification
## Improvement as part of sdap17 excercise 3

In [16]:
import pandas as pd
import numpy as np
import pprint
import requests

## Exploration of the training data set

In [17]:
train_data = pd.read_csv("../../data/raw/train.csv")
train_data['Dates'] = pd.to_datetime(train_data['Dates'])

test_data = pd.read_csv("../../data/raw/test.csv")
test_data['Dates'] = pd.to_datetime(test_data['Dates'])

In [18]:
len(train_data)

878049

In [19]:
train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [20]:
crimes = train_data['Category'].unique()
pprint.pprint("Crimes: {}, #{}".format(crimes, len(crimes)), indent=2)

("Crimes: ['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' "
 "'VANDALISM'\n"
 " 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'\n"
 " 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'\n"
 " 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'\n"
 " 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'\n"
 " 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'\n"
 " 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'\n"
 " 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'\n"
 " 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT'], #39")


In [21]:
train_data['Category'].value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

# Generate time based features

In [22]:
def get_halfhour(minute):
    if minute < 30:
        return 0
    else:
        return 1

def get_daynight(hour):
    if 5 < hour and hour < 23:
        return 0
    else:
        return 1
    
def generate_time_features(times):
    minute_series = pd.Series([x.minute for x in times], name='minute')
    halfhour_series = pd.Series([get_halfhour(x.minute) for x in times], name='halfhour')
    hour_series = pd.Series([x.hour for x in times], name='hour')
    daynight_series = pd.Series([get_daynight(x.hour) for x in times], name='day_night')
    day_series = pd.Series([x.day for x in times], name='day')
    month_series = pd.Series([x.month for x in times], name='month')
    year_series = pd.Series([x.year for x in times], name='year')
    
    time_features = pd.concat([minute_series, halfhour_series, hour_series, daynight_series, day_series, month_series, year_series], axis=1)
    return time_features

In [23]:
times = train_data["Dates"]

In [24]:
time_features = generate_time_features(times)
print("success")

success


In [25]:
print(time_features)

        minute  halfhour  hour  day_night  day  month  year
0           53         1    23          1   13      5  2015
1           53         1    23          1   13      5  2015
2           33         1    23          1   13      5  2015
3           30         1    23          1   13      5  2015
4           30         1    23          1   13      5  2015
5           30         1    23          1   13      5  2015
6           30         1    23          1   13      5  2015
7           30         1    23          1   13      5  2015
8            0         0    23          1   13      5  2015
9            0         0    23          1   13      5  2015
10          58         1    22          0   13      5  2015
11          30         1    22          0   13      5  2015
12          30         1    22          0   13      5  2015
13           6         0    22          0   13      5  2015
14           0         0    22          0   13      5  2015
15           0         0    22          

## Create grid for sector analysis

In [26]:
# outliers are all at position X = -120.5, Y = 90

def filter_x(x):
    if (x > -122):
        return -122.4483364
    else: 
        return x
    
def filter_y(y):
    if y > 37.9:
        return 37.7563690
    else:
        return y

In [27]:
# take a look at the positions of our train data.
min_x_train = min([filter_x(x) for x in train_data["X"]]) 
max_x_train = max([filter_x(x) for x in train_data["X"]]) 
min_y_train = min([filter_y(y) for y in train_data["Y"]]) 
max_y_train = max([filter_y(y) for y in train_data["Y"]]) 
print("Min_X_train: ", min_x)
print("Max_X_train: ", max_x)
print("Min_Y_train: ", min_y)
print("Max_Y_train: ", max_y)

Min_X_train:  -122.513642064
Max_X_train:  -122.364750704
Min_Y_train:  37.7078790224
Max_Y_train:  37.8206208381


In [28]:
# take a look at the positions of our test data.
min_x_test = min([filter_x(x) for x in test_data["X"]]) 
max_x_test = max([filter_x(x) for x in test_data["X"]]) 
min_y_test = min([filter_y(y) for y in test_data["Y"]]) 
max_y_test = max([filter_y(y) for y in test_data["Y"]]) 
print("Min_X_test: ", min_x)
print("Max_X_test: ", max_x)
print("Min_Y_test: ", min_y)
print("Max_Y_test: ", max_y) 

Min_X_test:  -122.513642064
Max_X_test:  -122.364750704
Min_Y_test:  37.7078790224
Max_Y_test:  37.8206208381


In [66]:
# Final coordinates for grid that covers San Francisco.
min_x = -122.53
max_x = -122.35
min_y = 37.65
max_y = 37.84

dif_x = max_x - min_x
dif_y = max_y - min_y

In [68]:
# grid functions

def get_subregion_mid(subregion_id, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = subregion_id % x_sections
    x_pos = ((x + 1/2) / x_sections) * dif_x + min_x
    y = subregion_id // x_sections
    y_pos = ((y + 1/2) / y_sections) * dif_y + min_y
    return (x_pos, y_pos)

def get_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = pos_x - min_x
    x_sec = int(x_sections * x / dif_x)
    y = pos_y - min_y
    y_sec = int(y_sections * y / dif_y)
    return x_sec + x_sections * y_sec
    
def get_subregion_series(data, min_x, min_y, dif_x, dif_y):
    X_SECTIONS = 20
    Y_SECTIONS = 20
    subregion_list = []
    for i in range(len(data)):
        pos_x = data["X"][i]
        pos_y = data["Y"][i]
        subregion = get_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, X_SECTIONS, Y_SECTIONS)
        subregion_list.append(subregion)
    return pd.Series(subregion_list, name='subregion')

In [69]:
subregion = get_subregion_series(train_data, min_x, min_y, dif_x, dif_y)

In [71]:
# look at the numer of crimes in each subregion
subregion.value_counts()

293       74546
292       53553
273       44346
272       43574
274       39301
252       30585
232       25023
294       18081
313       17712
251       16341
212       15606
271       15095
270       12688
175       12615
233       12410
314       12002
253       11647
291       11593
290       11016
195       10852
248       10356
230        9019
176        8894
332        8889
213        8388
269        8049
250        7950
150        7929
310        7860
311        7708
          ...  
285        1053
131        1050
286         965
188         951
244         785
135         769
243         757
228         752
284         751
152         696
124         696
189         577
156         569
163         553
125         433
261         384
143         263
216         232
162         195
326         110
144         108
357          93
110425       67
236          54
136          50
197          49
283          43
123          24
328           2
142           1
Name: subregion, dtype: 

In [74]:
# highest crime rate arround union square
get_subregion_mid(142 , min_x, min_y, dif_x, dif_y, 20, 20)

(-122.5075, 37.72125)