In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import warnings
import matplotlib.pylab as plt

In [135]:
def preprocess_data(df, numeric_features, categorical_features, label, random_state=7, test_size=0.2):
    X = df[numeric_features + numeric_features_standard]
    le = LabelEncoder()
    y = le.fit_transform(df[label])
    
    # We create the preprocessing pipelines for both numeric and categorical data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)])
    
    X_cv, X_validation, y_cv, y_validation = train_test_split(X, y, test_size=0.33, random_state=20)
    
    return X_cv, X_validation, y_cv, y_validation, preprocessor

In [136]:
df = pd.read_csv("preprocessed_for_grid.csv")

In [137]:
# numeric_features = ['latitude', 'longitude', 'availability_365']
numeric_features = ['minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count', 
                    'time', 'latitude', 'longitude', 'availability_365']
categorical_features = ['neighbourhood_group','neighbourhood', 'room_type','race', 'name']
label = 'price'

In [138]:
X_cv, X_validation, y_cv, y_validation, preprocessor =  preprocess_data(df, numeric_features,
                                                                        categorical_features, label)

In [139]:
classes, counts = np.unique(y_validation,return_counts=True)
print('balance:',np.max(counts/len(y_validation)))

balance: 0.042203767972235996


In [145]:
X_validation

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,time,latitude,longitude,availability_365,minimum_nights.1,number_of_reviews.1,reviews_per_month.1,calculated_host_listings_count.1,time.1
48435,2,0,0.00,7,0,40.69549,-73.94160,150,2,0,0.00,7,0
5850,2,112,1.94,2,386,40.76369,-73.92291,0,2,112,1.94,2,386
24547,1,10,0.42,1,685,40.80760,-73.94252,0,1,10,0.42,1,685
1043,4,33,0.39,1,226,40.70633,-74.00974,0,4,33,0.39,1,226
18475,3,3,0.13,1,619,40.71937,-73.95332,0,3,3,0.13,1,619
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23120,2,3,0.12,1,772,40.74204,-73.90639,0,2,3,0.12,1,772
34014,1,34,3.01,1,22,40.76782,-73.98351,20,1,34,3.01,1,22
25862,3,2,0.80,3,43,40.70408,-73.92045,169,3,2,0.80,3,43
21728,3,0,0.00,1,0,40.71704,-73.95371,0,3,0,0.00,1,0


In [141]:
def cross_validate_model(model, param_grid, preprocessor, X_cv, y_cv, n_splits=5, n_repeats=3, random_state=7, preprocess=True, verbose=5):
    
    keys = list(param_grid.keys())
    for key in keys:
        param_grid['classifier__' + key] = param_grid.pop(key)
    
    if preprocess:
        pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    else:
        pipe = Pipeline(steps=[('classifier', model)])
        
    grid = GridSearchCV(pipe, param_grid, cv=RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state**2), n_jobs=-1, return_train_score=True, 
                                scoring=['accuracy', 'neg_log_loss'], refit='accuracy', verbose=verbose)
    
    grid.fit(X_cv, y_cv)
    
    return grid

In [142]:
model = LogisticRegression(random_state=123)
param_grid = {'C': [.01, .05, .1, .5, 1]}
grid_LR = cross_validate_model(model, param_grid, preprocessor, X_cv, y_cv, n_repeats=3, verbose=0)
# get_cross_validation_results(grid_LR, X_validation, y_validation)



ValueError: 'neighbourhood_group' is not in list