 # "Predicting Crime Location"
> "We try to predict the possible location of crime given the neighbourhood based on crimetype, date, time and neighbourhood. We use goehashing to deal with latitude and longitude and encode it to single feature."

- author: Bhargav Lad
- toc: false
- badges: true
- comments: true
- image: images/crime_loc.jpeg
- categories: [ jupyter,matplotlib,sklearn,randomforest, KNN, geohashing, feature-engineering]

In [5]:
# import all dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from datetime import datetime,date
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from pyproj import Proj
import folium
warnings.filterwarnings("ignore")

In [6]:
# Read the dataset
df = pd.read_csv('crimedata_csv_all_years.csv')
df.head(10)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2012,12,14,8,52,,Oakridge,491285.0,5453433.0
1,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0
2,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0
3,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0
4,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0
5,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0
6,Break and Enter Commercial,2009,7,1,0,35,10XX ALBERNI ST,West End,491021.385727,5459161.0
7,Break and Enter Commercial,2013,6,1,20,0,10XX ALBERNI ST,West End,491032.270497,5459150.0
8,Break and Enter Commercial,2014,4,17,5,50,10XX ALBERNI ST,West End,491032.270497,5459150.0
9,Break and Enter Commercial,2014,9,1,14,20,10XX ALBERNI ST,West End,491032.270497,5459150.0


In [7]:
df.isnull().sum()

TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR                 0
MINUTE               0
HUNDRED_BLOCK       13
NEIGHBOURHOOD    64574
X                  119
Y                  119
dtype: int64

# Drop all instaces with missing X,Y and Hundred Block

In [8]:
df.dropna(subset=['X','Y','HUNDRED_BLOCK'],inplace=True)

In [9]:
df.isnull().sum()

TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR                 0
MINUTE               0
HUNDRED_BLOCK        0
NEIGHBOURHOOD    64455
X                    0
Y                    0
dtype: int64

# Converting all X,Y from UTM to lat,lng

In [10]:
p = Proj(proj='utm',zone=10,ellps='WGS84', preserve_units=False)

In [11]:
lat_lng = [] 
_ = df[['X','Y']].apply(lambda x:  lat_lng.append(p(x.X,x.Y,inverse=True)),axis=1)

In [12]:
lng = []
lat = []
for i in range(len(lat_lng)):
    lng.append(lat_lng[i][0])
    lat.append(lat_lng[i][1])
df['LAT'] = lat
df['LNG'] = lng

In [13]:
df.head(20)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG
1,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029
2,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649
3,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536
4,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461
5,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461
6,Break and Enter Commercial,2009,7,1,0,35,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461
7,Break and Enter Commercial,2013,6,1,20,0,10XX ALBERNI ST,West End,491032.270497,5459150.0,49.285034,-123.123311
8,Break and Enter Commercial,2014,4,17,5,50,10XX ALBERNI ST,West End,491032.270497,5459150.0,49.285034,-123.123311
9,Break and Enter Commercial,2014,9,1,14,20,10XX ALBERNI ST,West End,491032.270497,5459150.0,49.285034,-123.123311
10,Break and Enter Commercial,2017,11,14,20,0,10XX ALBERNI ST,West End,491051.085574,5459144.0,49.284981,-123.123053


# Using KNN with havesine distance

In [14]:
# Imputing Neighbourhood values

train_df = df.dropna(subset=['NEIGHBOURHOOD'])
test_df=df[df['NEIGHBOURHOOD'].isnull()]

test_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG
38536,Break and Enter Commercial,2007,9,23,18,5,X NK_LOC ST,,492757.48676,5458792.0,49.281843,-123.099582
38537,Break and Enter Commercial,2007,11,1,14,38,X NK_LOC ST,,492757.48676,5458792.0,49.281843,-123.099582
38538,Break and Enter Commercial,2005,5,18,18,30,"X NK_LOC ST ""SQUAMISH""",,492757.48676,5458792.0,49.281843,-123.099582
104648,Break and Enter Residential/Other,2004,6,3,14,20,X NK_LOC ST,,492757.48676,5458792.0,49.281843,-123.099582
104649,Break and Enter Residential/Other,2008,3,13,7,0,X NK_LOC ST,,492757.48676,5458792.0,49.281843,-123.099582


In [15]:
x_train = train_df[['LAT','LNG']]
y_train = train_df['NEIGHBOURHOOD']

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5,metric='haversine')
knn_classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
pred_neig = knn_classifier.predict(test_df[['LAT','LNG']])

In [18]:
test_df['NEIGHBOURHOOD'] = pred_neig
pred_neig[:20]

array(['Central Business District', 'Central Business District',
       'Central Business District', 'Central Business District',
       'Central Business District', 'Central Business District',
       'Marpole', 'Marpole', 'Marpole', 'Marpole', 'Marpole', 'Marpole',
       'Marpole', 'Marpole', 'Marpole', 'Marpole', 'Marpole', 'Marpole',
       'Marpole', 'Marpole'], dtype=object)

In [19]:
# Creating a new DataFrame with no missing values

new_df = df[0:0]
new_df=pd.concat([new_df,train_df],ignore_index=True)


new_df=pd.concat([new_df,test_df],ignore_index=True)
new_df.isnull().sum()

TYPE             0
YEAR             0
MONTH            0
DAY              0
HOUR             0
MINUTE           0
HUNDRED_BLOCK    0
NEIGHBOURHOOD    0
X                0
Y                0
LAT              0
LNG              0
dtype: int64

# Feature engineering

### Adding new date feature

In [20]:
new_df['DATE'] = new_df.apply(lambda r: datetime(year=r.YEAR,month=r.MONTH,day=r.DAY,hour=r.HOUR,minute=r.MINUTE),axis=1)
new_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG,DATE
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029,2019-03-07 02:06:00
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649,2019-08-27 04:12:00
2,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536,2014-08-08 05:13:00
3,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2005-11-14 03:09:00
4,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2006-05-21 04:50:00


### Adding Weekday field

In [21]:
new_df['WEEKDAY'] = new_df['DATE'].dt.dayofweek
new_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG,DATE,WEEKDAY
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029,2019-03-07 02:06:00,3
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649,2019-08-27 04:12:00,1
2,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536,2014-08-08 05:13:00,4
3,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2005-11-14 03:09:00,0
4,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2006-05-21 04:50:00,6


### Adding Holiday field

In [22]:
ca_holidays = holidays.Canada(prov='BC')

new_df['HOLIDAY'] = new_df.apply(lambda r: 1 if r.DATE in ca_holidays else 0,axis=1)
new_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG,DATE,WEEKDAY,HOLIDAY
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029,2019-03-07 02:06:00,3,0
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649,2019-08-27 04:12:00,1,0
2,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536,2014-08-08 05:13:00,4,0
3,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2005-11-14 03:09:00,0,0
4,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2006-05-21 04:50:00,6,0


# Hashing latitude and longitude to geohash

In [26]:
import geohash_hilbert as ghh
new_df['GEOHASH'] = new_df.apply(lambda r: ghh.encode(r.LNG, r.LAT, precision=8,bits_per_char=4),axis=1)
new_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG,DATE,WEEKDAY,HOLIDAY,CRIME_TYPE,GEOHASH
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029,2019-03-07 02:06:00,3,0,B&E,5ed43e02
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649,2019-08-27 04:12:00,1,0,B&E,5ed43e44
2,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536,2014-08-08 05:13:00,4,0,B&E,5ed43e44
3,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2005-11-14 03:09:00,0,0,B&E,5ed43e44
4,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2006-05-21 04:50:00,6,0,B&E,5ed43e44


In [27]:
(np.unique(new_df.NEIGHBOURHOOD))

array(['Arbutus Ridge', 'Central Business District', 'Dunbar-Southlands',
       'Fairview', 'Grandview-Woodland', 'Hastings-Sunrise',
       'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
       'Marpole', 'Mount Pleasant', 'Musqueam', 'Oakridge',
       'Renfrew-Collingwood', 'Riley Park', 'Shaughnessy', 'South Cambie',
       'Stanley Park', 'Strathcona', 'Sunset', 'Victoria-Fraserview',
       'West End', 'West Point Grey'], dtype=object)

In [28]:
new_df_copy = new_df.copy()

In [29]:
for col in list(np.unique(new_df.NEIGHBOURHOOD)):
    print(col,len(new_df[new_df['NEIGHBOURHOOD']==col]),len(new_df.GEOHASH[new_df['NEIGHBOURHOOD']==col].value_counts()))

Arbutus Ridge 6790 47
Central Business District 138630 49
Dunbar-Southlands 8752 65
Fairview 36503 44
Grandview-Woodland 31413 54
Hastings-Sunrise 21126 82
Kensington-Cedar Cottage 28232 70
Kerrisdale 8428 73
Killarney 11799 70
Kitsilano 30441 59
Marpole 77012 61
Mount Pleasant 36105 48
Musqueam 572 19
Oakridge 9219 43
Renfrew-Collingwood 30892 88
Riley Park 14549 54
Shaughnessy 6289 54
South Cambie 5990 26
Stanley Park 4163 35
Strathcona 25565 40
Sunset 19599 63
Victoria-Fraserview 12263 61
West End 48389 28
West Point Grey 6721 47


In [30]:
len(new_df[new_df['NEIGHBOURHOOD']=='Central Business District'].GEOHASH.value_counts())

49

# Categorzing crimes

In [31]:
Severe_crimes = ['Vehicle Collision or Pedestrian Struck (with Fatality)',
                'Homicide','Offence Against a Person','Vehicle Collision or Pedestrian Struck (with Injury)']
Theft = ['Theft from Vehicle','Other Theft','Theft of Vehicle','Theft of Bicycle']



for idx,row in new_df.iterrows():
    if str(row['TYPE']) in Severe_crimes:
        new_df.at[idx,'CRIME_TYPE'] = 'SEVERE'
    elif str(row['TYPE']) in Theft:
        new_df.at[idx,'CRIME_TYPE'] = 'Theft'
    elif str(row['TYPE']) == 'Mischief':
        new_df.at[idx,'CRIME_TYPE'] = 'Mischief'
    else:
        new_df.at[idx,'CRIME_TYPE'] = 'B&E'
new_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,LAT,LNG,DATE,WEEKDAY,HOLIDAY,CRIME_TYPE,GEOHASH
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0,49.266678,-123.129029,2019-03-07 02:06:00,3,0,B&E,5ed43e02
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0,49.285255,-123.123649,2019-08-27 04:12:00,1,0,B&E,5ed43e44
2,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0,49.285181,-123.123536,2014-08-08 05:13:00,4,0,B&E,5ed43e44
3,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2005-11-14 03:09:00,0,0,B&E,5ed43e44
4,Break and Enter Commercial,2006,5,21,4,50,10XX ALBERNI ST,West End,491021.385727,5459161.0,49.285132,-123.123461,2006-05-21 04:50:00,6,0,B&E,5ed43e44


In [32]:
new_df.sort_values(by='DATE',inplace=True)

In [33]:
def get_neighbourhood_data(data_frame,neighbourhood):
    neighbourhood_data = data_frame.groupby('NEIGHBOURHOOD')
    return neighbourhood_data.get_group(neighbourhood)[['CRIME_TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE'
                                                        , 'WEEKDAY', 'HOLIDAY','GEOHASH']]


# Get data from particular neighbourhood

In [99]:
neigh = 'West End'
neigh_df = get_neighbourhood_data(new_df,neigh)

In [100]:
neigh_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48389 entries, 538267 to 109496
Data columns (total 9 columns):
CRIME_TYPE    48389 non-null object
YEAR          48389 non-null int64
MONTH         48389 non-null int64
DAY           48389 non-null int64
HOUR          48389 non-null int64
MINUTE        48389 non-null int64
WEEKDAY       48389 non-null int64
HOLIDAY       48389 non-null int64
GEOHASH       48389 non-null object
dtypes: int64(7), object(2)
memory usage: 3.7+ MB


In [101]:
from sklearn.preprocessing import LabelBinarizer
# one hot encode crimetype field
lb_crimetype = LabelBinarizer()

neigh_df = neigh_df.join(pd.DataFrame(lb_crimetype.fit_transform(neigh_df['CRIME_TYPE']),
                          columns=lb_crimetype.classes_, 
                          index=neigh_df.index))


In [102]:
neigh_df.columns

Index(['CRIME_TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'WEEKDAY',
       'HOLIDAY', 'GEOHASH', 'B&E', 'Mischief', 'SEVERE', 'Theft'],
      dtype='object')

# Split Train and Test

In [103]:
neigh_test = neigh_df[-5:]
neigh_train = neigh_df[:-5]

In [104]:
neigh_train.shape

(48384, 13)

In [105]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(min_samples_leaf=5)



forest.fit(neigh_train.drop(['GEOHASH','CRIME_TYPE'],axis=1), neigh_train['GEOHASH'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [106]:
pos_pred = forest.predict(neigh_test.drop(['GEOHASH','CRIME_TYPE'],axis=1))

In [107]:
pos_prob =forest.predict_proba(neigh_test.drop(['GEOHASH','CRIME_TYPE'],axis=1))

In [108]:
for a,b in zip(pos_pred,neigh_test['GEOHASH']):
    print(a,b)

5ed43e3e 5ed43e3e
5ed43e43 5ed43e4e
5ed43e4e 5ed43e3f
5ed43e43 5ed43e43
5ed43e3e 5ed43e3e


In [109]:
pos_prob.shape,forest.classes_

((5, 28), array(['5ed43db0', '5ed43db1', '5ed43db2', '5ed43db3', '5ed43db7',
        '5ed43dbc', '5ed43dbd', '5ed43dbe', '5ed43dbf', '5ed43dc0',
        '5ed43dc1', '5ed43dc3', '5ed43dc4', '5ed43e31', '5ed43e38',
        '5ed43e3a', '5ed43e3b', '5ed43e3c', '5ed43e3d', '5ed43e3e',
        '5ed43e3f', '5ed43e40', '5ed43e41', '5ed43e42', '5ed43e43',
        '5ed43e44', '5ed43e4e', '5ed43e4f'], dtype=object))

In [110]:
# Predict the top n probable locations where crime could occur
top_n = 3
pred_loc = [np.argsort(prob)[:top_n] for prob in pos_prob]


In [111]:
hashesh = forest.classes_
p = [[ghh.decode(hashesh[i],bits_per_char=4) for i in x] for x in pred_loc]
p

[[(-123.13751220703125, 49.294281005859375),
  (-123.13201904296875, 49.275054931640625),
  (-123.12652587890625, 49.277801513671875)],
 [(-123.13751220703125, 49.294281005859375),
  (-123.13201904296875, 49.288787841796875),
  (-123.13201904296875, 49.277801513671875)],
 [(-123.13751220703125, 49.294281005859375),
  (-123.13201904296875, 49.288787841796875),
  (-123.13201904296875, 49.286041259765625)],
 [(-123.13751220703125, 49.294281005859375),
  (-123.13201904296875, 49.286041259765625),
  (-123.13201904296875, 49.275054931640625)],
 [(-123.13751220703125, 49.294281005859375),
  (-123.13201904296875, 49.275054931640625),
  (-123.14849853515625, 49.291534423828125)]]

In [119]:
# Create a map centered on Vancouver

map_van = folium.Map(location=[49.24, -123.11], zoom_start = 12,tiles='cartodbpositron')
for pnt in pnt_arr:

    folium.Circle(location=[pnt[1], pnt[0]],radius=500,popup='Predicted',fill_color="#3db7e4").add_to(map_van)

map_van.save(f'predicted loaction.html')
map_van