In [4]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [7]:
# Read both CSV files from the working directory.
train = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')

# Target is the 'price' column; every other column is a predictor.
y = train['price']
X = train.drop('price', axis=1)

# Hold out 25% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=307,
)

In [None]:
# NOTE(review): disabled experiment -- Lasso-based feature selection, kept for reference.
# As written it would not run against the cells visible in this notebook:
#   * `preprocessor` is not defined in the visible cells (the ColumnTransformer built
#     further down is named `preprocessing`) -- TODO confirm the intended name.
#   * The mask `coef_ != 0` is applied to `X_train.columns`; presumably that only lines
#     up if the preprocessing step keeps the raw column count and order -- verify
#     before re-enabling.
#lasso = Lasso(alpha=1.0, random_state=307)
#scaler = StandardScaler(with_mean=False)
#model = Pipeline(steps=[('preprocessor', preprocessor),
#                        ('scaler', scaler),
#                        ('lasso', lasso)])

#model.fit(X_train, y_train)
#selected_features = X_train.columns[model.named_steps['lasso'].coef_ != 0]
#X_train_selected = X_train[selected_features]
#X_test_selected = X_test[selected_features]

In [5]:
# Build the preprocessing + random-forest pipeline and the randomized search over it.
# (RandomForestRegressor is imported with the other sklearn imports at the top.)

# Categorical branch: fill missing values, one-hot encode (categories unseen at
# fit time are ignored at transform time), then keep the top-scoring share of the
# encoded columns according to f_regression.
cat_pipe = Pipeline([
  ('impute', SimpleImputer(strategy='most_frequent')),
  ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
  ('select', SelectPercentile(f_regression, percentile=50))
])

# Numeric branch: fill missing values, expand with polynomial terms, standardize.
num_pipe = Pipeline([
  ('impute', SimpleImputer(strategy='mean')),
  ('poly', PolynomialFeatures(degree=2, include_bias=False)),
  ('standardize', StandardScaler())
])

# Route each listed column to its branch; columns not listed here are not passed
# to either branch.
# NOTE(review): 'name' and 'neighborhood_overview' look like free-text fields;
# one-hot encoding them densely may produce a very wide matrix -- confirm this is
# intended.
preprocessing = ColumnTransformer(
  transformers=[
    ('cat', cat_pipe, ['name',
                       'neighborhood_overview',
                       'host_location',
                       'host_acceptance_rate',
                       'host_is_superhost',
                       'host_has_profile_pic',
                       'room_type',
                       'bathrooms_text',
                       'instant_bookable']),
    ('num', num_pipe, ['accommodates',
                      'beds',
                      'availability_365',
                      'number_of_reviews',
                      'review_scores_accuracy'])
  ])

pipe_full = Pipeline([
  ('preprocessing', preprocessing),
  ('model', RandomForestRegressor(random_state=307))
])

# Search space. Keys follow the `<step>__<substep>__<param>` naming of the pipeline.
params = {
  'model__n_estimators': [50, 100, 200, 300],
  'model__max_depth': [None, 5, 10],
  'model__min_samples_split': [2, 5, 10],
  'model__min_samples_leaf': [1, 2, 4],
  # 'auto' is no longer an accepted value in recent scikit-learn releases and makes
  # any candidate that samples it fail; 1.0 (use every feature) is the documented
  # equivalent for regressors.
  'model__max_features': [1.0, 'sqrt', 'log2'],
  'preprocessing__cat__select__percentile':(list(range(10,100,10))),
  'preprocessing__num__poly__degree':(1,2,3),
  'preprocessing__num__impute__strategy':('mean','median'),
  'preprocessing__cat__impute__strategy':('most_frequent','constant')
}

# n_iter=10 is the library default, written out so the search budget is visible.
# random_state pins which candidates are sampled so a re-run reproduces the search.
rs = RandomizedSearchCV(
  pipe_full,
  param_distributions=params,
  n_iter=10,
  scoring='neg_mean_squared_error',
  cv=10,
  random_state=307
)

In [9]:
# Run the randomized hyper-parameter search on the training split.
rs.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

In [8]:
# Score the search object on both splits. `rs` must already be fitted; calling
# predict on an unfitted search raises NotFittedError.
y_train_pred = rs.predict(X_train)
training_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)

yhat = rs.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=yhat)

print("Training MSE:", training_mse)
print("Test MSE:", test_mse)

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.