In [22]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import pandas as pd
import numpy as np

In [23]:
std = pd.read_csv("../Data/FE/crime_num_fe.csv")
pca = pd.read_csv("../Data/FE/crime_num_dr.csv")
pca = pca.iloc[:, 1:]
vs = pd.read_csv("../Data/FE/crime_num_varselect.csv")
vs = vs.iloc[:, 1:]

y = std['Num_Crimes']

# Train Test Split (just want idx)

In [5]:
X = std.drop(['Num_Crimes'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=308, test_size=.2)
train_idx = X_train.index
test_idx = X_test.index

STD

In [27]:
X = std.drop(['Num_Crimes', 'Date','Day', 'Month', 'Year','TS_interval'], axis=1)
X_train = X.iloc[train_idx,]
X_test = X.iloc[test_idx,]

In [28]:
# transform
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# RF
rf_pipe = Pipeline([
    ('preprocessor', preprocessor), 
    ('rf', RandomForestRegressor(min))
])
n_trees = [100,250,500]
max_depth = range(1,5,1)
min_samples = range(1,51,10)
params = {
  'rf__n_estimators': n_trees,
  'rf__max_depth': max_depth,
  'rf__min_samples_leaf': min_samples
}


In [29]:
rf_tune = RandomizedSearchCV(rf_pipe, param_distributions=params, scoring='neg_mean_squared_error', cv=5)
rf_tune.fit(X_train, y_train)
print(rf_tune.best_params_)

{'rf__n_estimators': 500, 'rf__min_samples_leaf': 1, 'rf__max_depth': 4}


In [30]:
# validation
preds = rf_tune.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.6327690393391245
Var Red: 0.3607031294209393


In [31]:
# on original data
preds = rf_tune.predict(X)
print(f"RMSE: {mean_squared_error(y, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.68789079093206
Var Red: 0.3863931759807948


Plot for Preds

In [32]:
np.savetxt("PyPreds/rfpreds.csv", preds, delimiter=",")

PCA

In [11]:
X = pca.drop(['Num_Crimes', 'Date','Day', 'Month', 'Year','TS_interval'], axis=1)
X_train = X.iloc[train_idx,]
X_test = X.iloc[test_idx,]

In [12]:
# transform
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# RF
rf_pipe = Pipeline([
    ('preprocessor', preprocessor), 
    ('rf', RandomForestRegressor(min))
])
n_trees = [100,250,500]
max_depth = range(1,5,1)
min_samples = range(1,51,10)
params = {
  'rf__n_estimators': n_trees,
  'rf__max_depth': max_depth,
  'rf__min_samples_leaf': min_samples
}


In [13]:
rf_tune = RandomizedSearchCV(rf_pipe, param_distributions=params, scoring='neg_mean_squared_error', cv=5)
rf_tune.fit(X_train, y_train)
print(rf_tune.best_params_)

{'rf__n_estimators': 500, 'rf__min_samples_leaf': 21, 'rf__max_depth': 4}


In [14]:
# validation
preds = rf_tune.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.792393271923041
Var Red: 0.3153959797356284


In [15]:
# on original data
preds = rf_tune.predict(X)
print(f"RMSE: {mean_squared_error(y, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.855682964752859
Var Red: 0.3416966722882666


VS

In [17]:
X = vs.drop(['Num_Crimes', 'Date','Day', 'Month', 'Year','TS_interval'], axis=1)
X_train = X.iloc[train_idx,]
X_test = X.iloc[test_idx,]

In [18]:
# transform
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# RF
rf_pipe = Pipeline([
    ('preprocessor', preprocessor), 
    ('rf', RandomForestRegressor(min))
])
n_trees = [100,250,500]
max_depth = range(1,5,1)
min_samples = range(1,51,10)
params = {
  'rf__n_estimators': n_trees,
  'rf__max_depth': max_depth,
  'rf__min_samples_leaf': min_samples
}

In [19]:
rf_tune = RandomizedSearchCV(rf_pipe, param_distributions=params, scoring='neg_mean_squared_error', cv=5)
rf_tune.fit(X_train, y_train)
print(rf_tune.best_params_)

{'rf__n_estimators': 500, 'rf__min_samples_leaf': 11, 'rf__max_depth': 4}


In [20]:
# validation
preds = rf_tune.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y_test, preds)[0,1]**2}") # var reduction

RMSE: 4.660555039768128
Var Red: 0.35271974655364396


In [21]:
# on original data
preds = rf_tune.predict(X)
print(f"RMSE: {mean_squared_error(y, preds, squared=False)}") # rmse
print(f"Var Red: {np.corrcoef(y, preds)[0,1]**2}") # var reduction

RMSE: 4.736585100315489
Var Red: 0.37358146814327275
