In [66]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import pandas as pd
import numpy as np

In [67]:
# Read the three feature tables from ../Data/FE.
# NOTE(review): the first column of the `pca` and `vs` files is discarded but
# `std` keeps all of its columns — confirm that asymmetry is intentional.
std = pd.read_csv("../Data/FE/crime_num_fe.csv")
pca = pd.read_csv("../Data/FE/crime_num_dr.csv").iloc[:, 1:]
vs = pd.read_csv("../Data/FE/crime_num_varselect.csv").iloc[:, 1:]

# Target vector, taken from `std` and reused for every feature table
# (presumably the three files are row-aligned — TODO confirm).
y = std.loc[:, 'Num_Crimes']

# Train/Test Split (only the row indices are needed)

In [68]:
# Split once on the `std` frame to obtain a reproducible set of train/test row
# labels; the later cells index each feature table with these same rows.
X = std.drop(columns=['Num_Crimes'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=308
)
train_idx, test_idx = X_train.index, X_test.index

STD

In [69]:
# Feature matrix for the STD run: remove the target and the listed columns.
dropped_cols = ['Num_Crimes', 'Date', 'Day', 'Month', 'Year', 'TS_interval']
X = std.drop(columns=dropped_cols)
# train_idx/test_idx are index *labels* applied positionally here; the two are
# interchangeable only while the frame's index is 0..n-1.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

In [70]:
# transform: route columns to a transformer according to dtype.
# Columns whose dtype matches neither selector end up in neither list, and
# ColumnTransformer's default remainder='drop' then leaves them out of the model.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns are standardized.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
# Categorical columns are one-hot encoded; unknown categories are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# KNN: preprocessing and regressor live in one pipeline so they are fit together.
knn_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

# Candidate neighbour counts: the even numbers 2, 4, ..., 48.
k = range(2, 50, 2)
params = dict(knn__n_neighbors=k)



In [71]:
# Tune the number of neighbours with 5-fold cross-validation on the training
# rows, scoring each candidate by negative mean squared error.
knn_tune = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
)
knn_tune.fit(X_train, y_train)
print(knn_tune.best_params_)

{'knn__n_neighbors': 14}


In [72]:
# validation: score the tuned model on the held-out test rows
preds = knn_tune.predict(X_test)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.92528524210397
Var Red: 0.27758686415704326


In [73]:
# on original data: score on every row of X.
# X contains the rows the model was fit on as well as the test rows, so these
# figures are not a held-out estimate.
preds = knn_tune.predict(X)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.763693011220663
Var Red: 0.38280701237682857


PCA

In [74]:
# Feature matrix for the PCA run: remove the target and the listed columns.
dropped_cols = ['Num_Crimes', 'Date', 'Day', 'Month', 'Year', 'TS_interval']
X = pca.drop(columns=dropped_cols)
# train_idx/test_idx are index *labels* applied positionally here; the two are
# interchangeable only while the frame's index is 0..n-1.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

In [75]:
# transform: route columns to a transformer according to dtype.
# Columns whose dtype matches neither selector end up in neither list, and
# ColumnTransformer's default remainder='drop' then leaves them out of the model.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns are standardized.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
# Categorical columns are one-hot encoded; unknown categories are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# KNN: preprocessing and regressor live in one pipeline so they are fit together.
knn_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

# Candidate neighbour counts: the even numbers 2, 4, ..., 48.
k = range(2, 50, 2)
params = dict(knn__n_neighbors=k)

In [76]:
# Tune the number of neighbours with 5-fold cross-validation on the training
# rows, scoring each candidate by negative mean squared error.
knn_tune = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
)
knn_tune.fit(X_train, y_train)
print(knn_tune.best_params_)

{'knn__n_neighbors': 38}


In [77]:
# validation: score the tuned model on the held-out test rows
preds = knn_tune.predict(X_test)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.789427858563622
Var Red: 0.3175645573094259


In [78]:
# on original data: score on every row of X.
# X contains the rows the model was fit on as well as the test rows, so these
# figures are not a held-out estimate.
preds = knn_tune.predict(X)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.7852303350410255
Var Red: 0.36154902068998734


VS

In [80]:
# Feature matrix for the VS run: remove the target and the listed columns.
dropped_cols = ['Num_Crimes', 'Date', 'Day', 'Month', 'Year', 'TS_interval']
X = vs.drop(columns=dropped_cols)
# train_idx/test_idx are index *labels* applied positionally here; the two are
# interchangeable only while the frame's index is 0..n-1.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

In [81]:
# transform: route columns to a transformer according to dtype.
# Columns whose dtype matches neither selector end up in neither list, and
# ColumnTransformer's default remainder='drop' then leaves them out of the model.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns are standardized.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
# Categorical columns are one-hot encoded; unknown categories are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# KNN: preprocessing and regressor live in one pipeline so they are fit together.
knn_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

# Candidate neighbour counts: the even numbers 2, 4, ..., 48.
k = range(2, 50, 2)
params = dict(knn__n_neighbors=k)

In [82]:
# Tune the number of neighbours with 5-fold cross-validation on the training
# rows, scoring each candidate by negative mean squared error.
knn_tune = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
)
knn_tune.fit(X_train, y_train)
print(knn_tune.best_params_)

{'knn__n_neighbors': 40}


In [83]:
# validation: score the tuned model on the held-out test rows
preds = knn_tune.predict(X_test)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.6726150244663
Var Red: 0.34978813894226896


In [84]:
# on original data: score on every row of X.
# X contains the rows the model was fit on as well as the test rows, so these
# figures are not a held-out estimate.
preds = knn_tune.predict(X)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
rmse = np.sqrt(mean_squared_error(y, preds))
print(f"RMSE: {rmse}") # rmse
# Squared Pearson correlation between observed and predicted values.
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.687790387032204
Var Red: 0.38879451588185465


Save Predictions for the Predictions Plot

In [85]:
# Write whatever `preds` currently holds to CSV. With the cells run in order
# that is the VS model's predictions on the full feature table.
# NOTE(review): the PyPreds/ directory is not created here — it must already exist.
preds_path = "PyPreds/knnpreds.csv"
np.savetxt(preds_path, preds, delimiter=",")