In [57]:
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import pickle
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [43]:
# Load the precinct table; the cells below read its 'Precinct' and 'the_geom' columns
pct_code_df = pd.read_csv('nypp_precinct_code.csv')

In [44]:
# Expand each precinct's `the_geom` string into its individual lon/lat vertices,
# each tagged with the precinct code.

# precinct code -> list of [lon, lat] string pairs
pct_dict = {}
for _, precinct_row in pct_code_df.iterrows():
    precinct_code = int(precinct_row['Precinct'])
    vertices = []
    # The first and last comma-separated chunks are always discarded, and so is
    # any chunk that still carries a parenthesis.
    # NOTE(review): presumably those are the ring-boundary points of a WKT
    # polygon string -- confirm that dropping them is acceptable.
    for chunk in precinct_row['the_geom'].split(',')[1:-1]:
        if '(' in chunk or ')' in chunk:
            continue
        vertices.append(chunk.strip().split(' '))
    pct_dict[precinct_code] = vertices

# Flatten to one [lon, lat, pct_code] record per vertex
test_arr = [
    vertex + [precinct_code]
    for precinct_code, vertices in pct_dict.items()
    for vertex in vertices
]

# Positional columns get names; the coordinate strings are cast to floats
vertex_columns = {0: 'lon', 1: 'lat', 2: 'pct_code'}
pct_lat_lon_df = (
    pd.DataFrame(test_arr)
    .rename(columns=vertex_columns)
    .astype({'lon': 'float', 'lat': 'float'})
)

In [45]:
# Preview the first rows of the expanded lon / lat / pct_code vertex table
pct_lat_lon_df.head()

Unnamed: 0,lon,lat,pct_code
0,-74.043506,40.689687,1
1,-74.042735,40.69005,1
2,-74.042784,40.690121,1
3,-74.042704,40.690155,1
4,-74.042554,40.689963,1


In [46]:
# (rows, columns) of the vertex table: one row per extracted vertex
pct_lat_lon_df.shape

(98289, 3)

In [64]:
# Separate the coordinate features from the precinct label
feature_columns = ['lon', 'lat']
X = pct_lat_lon_df[feature_columns].to_numpy()

# 1-D label vector: one precinct code per vertex row
y = pct_lat_lon_df['pct_code'].to_numpy()

In [65]:
# train test split
# Hold out 33% of the vertices for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [71]:
# Build a 5-nearest-neighbour classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Trailing expression so the fitted estimator is echoed as the cell output
neigh

KNeighborsClassifier()

In [72]:
# test: predict a precinct code for every held-out coordinate pair
y_pred = neigh.predict(X_test)

In [73]:
# Confusion matrix on the held-out split (y_test passed as true labels, y_pred as predictions)
confusion_matrix(y_test, y_pred)

array([[ 374,   15,    3, ...,    0,    0,    0],
       [  15,   30,    0, ...,    0,    0,    0],
       [   5,    0,   46, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,  949,   61,   93],
       [   0,    0,    0, ...,   49,  731,   10],
       [   0,    0,    0, ...,   99,    9, 1145]])

In [74]:
# Per-class F1: average=None returns one score per precinct label instead of a single aggregate
f1_score(y_test, y_pred, average=None)

array([0.95044473, 0.48387097, 0.80701754, 0.65517241, 0.5631068 ,
       0.872     , 0.87387387, 0.35087719, 0.78787879, 0.76190476,
       0.71830986, 0.74074074, 0.21428571, 0.72180451, 0.52631579,
       0.87537994, 0.61386139, 0.34615385, 0.66666667, 0.65656566,
       0.75647668, 0.98889838, 0.71830986, 0.8       , 0.32989691,
       0.83893805, 0.42962963, 0.9788894 , 0.46017699, 0.61676647,
       0.35384615, 0.23668639, 0.77300613, 0.44280443, 0.87292818,
       0.90337284, 0.60810811, 0.95956354, 0.2745098 , 0.23404255,
       0.71770335, 0.96245211, 0.35185185, 0.35714286, 0.736     ,
       0.30909091, 0.8369863 , 0.88607595, 0.20952381, 0.21621622,
       0.29787234, 0.27027027, 0.23703704, 0.66666667, 0.7       ,
       0.83991684, 0.93905192, 0.99904925, 0.99662795, 0.25263158,
       0.29530201, 0.76045627, 0.76086957, 0.78960195, 0.36170213,
       0.84583902, 0.96629866, 0.66276803, 0.84175084, 0.31223629,
       0.92949355, 0.93915344, 0.86605982, 0.821875  , 0.84694

In [75]:
# Single summary F1: per-class scores averaged with weights proportional to each class's support
f1_score(y_test, y_pred, average='weighted')

0.8870899345309137

In [7]:
# export model to file
# The context manager guarantees the handle is flushed and closed as soon as
# the dump finishes; passing a bare open() to pickle.dump left closing the
# file to garbage collection.
model_filepath = 'knn_model.pkl'
with open(model_filepath, mode='wb') as model_file:
    pickle.dump(neigh, model_file)

In [8]:
# read model from file
# Read with pickle so loading mirrors the pickle.dump() call that writes this file.
# NOTE: unpickling can execute arbitrary code -- only load model files produced by this notebook.
with open(model_filepath, mode='rb') as model_file:
    model = pickle.load(model_file)

In [9]:
# predict the precinct for meter info
# Load the meter table; the prediction cell below reads its LONG and LAT columns
nyc_meter_df = pd.read_csv('nyc_meter_with_addr.csv')
nyc_meter_df.head()

Unnamed: 0,MeterType,the_geom,MeterNo,LONG,LAT,Status,Borough,X,Y,loc,Addr,geopt
0,Strada,POINT (-73.84754478941632 40.709875987979544),4522508,-73.847545,40.709876,Active,Queens,1026517.0,197940.238028,"('40.709875988', '-73.8475447894')","105-16, Metropolitan Avenue, Parkside, Queens,...","(40.70975955, -73.8476024)"
1,Strada,POINT (-73.84880089683506 40.71007062987071),4522507,-73.848801,40.710071,Active,Queens,1026168.0,198010.548254,"('40.7100706299', '-73.8488008968')","104-06, Metropolitan Avenue, Parkside, Queens,...","(40.7099507, -73.84882478299046)"
2,Strada,POINT (-73.84904582319449 40.710001740642234),4522506,-73.849046,40.710002,Active,Queens,1026100.0,197985.332754,"('40.7100017406', '-73.8490458232')","10402, Metropolitan Avenue, Parkside, Queens, ...","(40.70996785, -73.848967825)"
3,Strada,POINT (-73.849163601086 40.70998012424439),4522505,-73.849164,40.70998,Active,Queens,1026068.0,197977.401011,"('40.7099801242', '-73.8491636011')","93-53, 71st Drive, Parkside, Queens, New York,...","(40.7100427, -73.8492074)"
4,Strada,POINT (-73.84945799115022 40.71018488626701),4522504,-73.849458,40.710185,Active,Queens,1025986.0,198051.861492,"('40.7101848863', '-73.8494579912')","103-14, Metropolitan Avenue, Parkside, Queens,...","(40.71004755, -73.84946264999999)"


In [10]:
# Add a `pct_code` column holding the predicted precinct for each meter.
# Features are passed as [LONG, LAT], matching the [lon, lat] order used to
# fit the classifier. All rows are scored in one batched predict() call rather
# than one call per row via iterrows(), which gives the same labels far faster.
meter_coords = nyc_meter_df[['LONG', 'LAT']].to_numpy()

# Kept as a list (one label per meter row); an empty frame yields an empty
# list instead of calling predict() with zero samples.
nyc_meter_pct = list(model.predict(meter_coords)) if len(meter_coords) else []
nyc_meter_df['pct_code'] = nyc_meter_pct

In [11]:
# Check that the predicted pct_code column was appended to the meter table
nyc_meter_df.head()

Unnamed: 0,MeterType,the_geom,MeterNo,LONG,LAT,Status,Borough,X,Y,loc,Addr,geopt,pct_code
0,Strada,POINT (-73.84754478941632 40.709875987979544),4522508,-73.847545,40.709876,Active,Queens,1026517.0,197940.238028,"('40.709875988', '-73.8475447894')","105-16, Metropolitan Avenue, Parkside, Queens,...","(40.70975955, -73.8476024)",112
1,Strada,POINT (-73.84880089683506 40.71007062987071),4522507,-73.848801,40.710071,Active,Queens,1026168.0,198010.548254,"('40.7100706299', '-73.8488008968')","104-06, Metropolitan Avenue, Parkside, Queens,...","(40.7099507, -73.84882478299046)",112
2,Strada,POINT (-73.84904582319449 40.710001740642234),4522506,-73.849046,40.710002,Active,Queens,1026100.0,197985.332754,"('40.7100017406', '-73.8490458232')","10402, Metropolitan Avenue, Parkside, Queens, ...","(40.70996785, -73.848967825)",102
3,Strada,POINT (-73.849163601086 40.70998012424439),4522505,-73.849164,40.70998,Active,Queens,1026068.0,197977.401011,"('40.7099801242', '-73.8491636011')","93-53, 71st Drive, Parkside, Queens, New York,...","(40.7100427, -73.8492074)",112
4,Strada,POINT (-73.84945799115022 40.71018488626701),4522504,-73.849458,40.710185,Active,Queens,1025986.0,198051.861492,"('40.7101848863', '-73.8494579912')","103-14, Metropolitan Avenue, Parkside, Queens,...","(40.71004755, -73.84946264999999)",112


In [12]:
# Determine the risk factor for each precinct, with 4 being highest and 1 being lowest

# vehicle-related crimes: count the larceny records filed under each precinct code
vehicle_larceny_loc_df = pd.read_csv('vehicle_larceny_loc.csv')
larceny_counts = (
    vehicle_larceny_loc_df
    .groupby('ADDR_PCT_CD')['ADDR_PCT_CD']
    .count()
    .rename('COUNT')
)

# Back to a flat two-column frame (ADDR_PCT_CD, COUNT), with the precinct code stored as a string
vehicle_larceny_by_pct = larceny_counts.reset_index().astype({"ADDR_PCT_CD": str})

In [13]:
# Preview the per-precinct larceny counts
vehicle_larceny_by_pct.head()

Unnamed: 0,ADDR_PCT_CD,COUNT
0,1,51
1,5,24
2,6,39
3,7,48
4,9,82


In [14]:
# Bucket each precinct's larceny COUNT into a risk factor from 1 (lowest) to 4 (highest):
#   COUNT <= 100 -> 1,  (100, 200] -> 2,  (200, 300] -> 3,  > 300 -> 4
# pd.cut uses right-closed intervals by default, which matches the `<=` upper
# bounds of the thresholds. Binning the whole column at once replaces the
# row-by-row iterrows() loop and its redundant lower-bound checks.
risk_bin_edges = [-np.inf, 100, 200, 300, np.inf]
risk_labels = [1, 2, 3, 4]
risk = pd.cut(
    vehicle_larceny_by_pct['COUNT'],
    bins=risk_bin_edges,
    labels=risk_labels,
).astype(int).tolist()
vehicle_larceny_by_pct['risk_factor'] = risk

In [18]:
# Spot-check 10 random precincts (no seed is set, so the rows differ between runs)
vehicle_larceny_by_pct.sample(10)

Unnamed: 0,ADDR_PCT_CD,COUNT,risk_factor
35,62,70,1
55,94,88,1
23,42,156,2
12,23,45,1
10,19,133,2
33,60,53,1
45,75,407,4
34,61,70,1
47,77,76,1
37,66,63,1


In [19]:
# Persist the per-precinct counts and risk factors.
# NOTE(review): the default index=True also writes the row index; it comes back
# as an `Unnamed: 0` column when this file is re-read later in the notebook.
# Pass index=False if that column is not wanted downstream.
vehicle_larceny_by_pct.to_csv('vehicle_larceny_by_pct_risk.csv')

In [20]:
# Persist the meter table including the predicted pct_code column.
# NOTE(review): the row index is written too (default index=True) and shows up
# as an `Unnamed: 0` column when the file is read back.
nyc_meter_df.to_csv('nyc_meter_with_addr_pct.csv')

In [33]:
# assign meters their risk factors
# Reload both intermediate CSVs; pct_code is renamed to ADDR_PCT_CD so the two
# tables share the join key used by the merge below.
meter_pct = pd.read_csv('nyc_meter_with_addr_pct.csv').rename(columns={'pct_code':'ADDR_PCT_CD'})
risk_pct = pd.read_csv('vehicle_larceny_by_pct_risk.csv')

In [34]:
# Preview the reloaded risk table
risk_pct.head()

Unnamed: 0.1,Unnamed: 0,ADDR_PCT_CD,COUNT,risk_factor
0,0,1,51,1
1,1,5,24,1
2,2,6,39,1
3,3,7,48,1
4,4,9,82,1


In [35]:
# Preview the reloaded meter table (precinct column now named ADDR_PCT_CD)
meter_pct.head()

Unnamed: 0.1,Unnamed: 0,MeterType,the_geom,MeterNo,LONG,LAT,Status,Borough,X,Y,loc,Addr,geopt,ADDR_PCT_CD
0,0,Strada,POINT (-73.84754478941632 40.709875987979544),4522508,-73.847545,40.709876,Active,Queens,1026517.0,197940.238028,"('40.709875988', '-73.8475447894')","105-16, Metropolitan Avenue, Parkside, Queens,...","(40.70975955, -73.8476024)",112
1,1,Strada,POINT (-73.84880089683506 40.71007062987071),4522507,-73.848801,40.710071,Active,Queens,1026168.0,198010.548254,"('40.7100706299', '-73.8488008968')","104-06, Metropolitan Avenue, Parkside, Queens,...","(40.7099507, -73.84882478299046)",112
2,2,Strada,POINT (-73.84904582319449 40.710001740642234),4522506,-73.849046,40.710002,Active,Queens,1026100.0,197985.332754,"('40.7100017406', '-73.8490458232')","10402, Metropolitan Avenue, Parkside, Queens, ...","(40.70996785, -73.848967825)",102
3,3,Strada,POINT (-73.849163601086 40.70998012424439),4522505,-73.849164,40.70998,Active,Queens,1026068.0,197977.401011,"('40.7099801242', '-73.8491636011')","93-53, 71st Drive, Parkside, Queens, New York,...","(40.7100427, -73.8492074)",112
4,4,Strada,POINT (-73.84945799115022 40.71018488626701),4522504,-73.849458,40.710185,Active,Queens,1025986.0,198051.861492,"('40.7101848863', '-73.8494579912')","103-14, Metropolitan Avenue, Parkside, Queens,...","(40.71004755, -73.84946264999999)",112


In [38]:
# Attach each meter's precinct-level COUNT and risk_factor.
# Inner join: meters whose ADDR_PCT_CD has no row in risk_pct are dropped.
meter_pct_risk_df = meter_pct.merge(risk_pct, how="inner", on='ADDR_PCT_CD')

In [39]:
# Persist the merged meter + risk table.
# NOTE(review): the row index is written as well (default index=True).
meter_pct_risk_df.to_csv('meter_pct_risk.csv')

In [40]:
# Preview the merged result
meter_pct_risk_df.head()

Unnamed: 0,Unnamed: 0_x,MeterType,the_geom,MeterNo,LONG,LAT,Status,Borough,X,Y,loc,Addr,geopt,ADDR_PCT_CD,Unnamed: 0_y,COUNT,risk_factor
0,0,Strada,POINT (-73.84754478941632 40.709875987979544),4522508,-73.847545,40.709876,Active,Queens,1026517.0,197940.238028,"('40.709875988', '-73.8475447894')","105-16, Metropolitan Avenue, Parkside, Queens,...","(40.70975955, -73.8476024)",112,68,55,1
1,1,Strada,POINT (-73.84880089683506 40.71007062987071),4522507,-73.848801,40.710071,Active,Queens,1026168.0,198010.548254,"('40.7100706299', '-73.8488008968')","104-06, Metropolitan Avenue, Parkside, Queens,...","(40.7099507, -73.84882478299046)",112,68,55,1
2,3,Strada,POINT (-73.849163601086 40.70998012424439),4522505,-73.849164,40.70998,Active,Queens,1026068.0,197977.401011,"('40.7099801242', '-73.8491636011')","93-53, 71st Drive, Parkside, Queens, New York,...","(40.7100427, -73.8492074)",112,68,55,1
3,4,Strada,POINT (-73.84945799115022 40.71018488626701),4522504,-73.849458,40.710185,Active,Queens,1025986.0,198051.861492,"('40.7101848863', '-73.8494579912')","103-14, Metropolitan Avenue, Parkside, Queens,...","(40.71004755, -73.84946264999999)",112,68,55,1
4,48,Strada,POINT (-73.84960701595227 40.73213265964055),4451773,-73.849607,40.732133,Active,Queens,1025931.0,206048.027586,"('40.7321326596', '-73.849607016')","105-45, 64th Road, Queens, New York, 11375, Un...","(40.7322422, -73.849727)",112,68,55,1
