In [7]:
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV,ElasticNet, ElasticNetCV
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Read both CSVs from the working directory: `train` carries the 'price'
# target used for fitting, `test` is the frame scored at the end of the notebook.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### Choosing Variables

In [29]:
# Separate the target column from the candidate features.
x = train.drop(columns="price")
y = train['price']

# Numeric-only view of the features with two columns excluded by hand.
# NOTE(review): x_num is not referenced by any later cell visible here; it is
# kept in case exploratory cells elsewhere rely on it.
x_num = x.select_dtypes(include='number').drop(columns=['bedrooms', 'host_id'])

# Feature names fed to the model, split by how they are preprocessed.
# These two lists are the single source of truth for the model's inputs.
x_nums = [
    'host_listings_count',
    'accommodates',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_location',
]
x_cat = ['property_type', 'neighbourhood_cleansed', 'host_is_superhost']

# Derive the modelling frame from the lists above instead of retyping every
# column name, so the frame cannot drift from what the preprocessor selects.
x_full = x[x_nums + x_cat]

# Hold out 20% of the labelled rows for validation; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x_full, y, test_size=0.2, random_state=317
)

#### PIPELINES FOR THE WIN

#### KNN Pipeline

In [30]:

# Numeric branch: mean-impute, standardise, then expand to degree-2 polynomial terms.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

# Categorical branch: fill gaps with the mode, one-hot encode (categories unseen
# at fit time are ignored at predict time), then keep the top 20% of dummy
# columns ranked by f_regression score.
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('select', SelectPercentile(score_func=f_regression, percentile=20)),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, x_nums),
    ('cat', cat_pipe, x_cat),
])

# Preprocessing followed by a distance-weighted 60-nearest-neighbour regressor.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=60, weights='distance')),
])

# Fit on the training split.
pipe.fit(x_train, y_train)

# Report RMSE on the held-out split (the metric is root mean squared error, not MSE).
y_preds = pipe.predict(x_test)
root_mean_squared_error(y_true=y_test, y_pred=y_preds)

1271.8671457994592

#### Lasso Pipeline

In [31]:
# Give the Lasso pipeline its own unfitted copy of the preprocessing steps.
# Pipeline.fit fits its steps in place (no cloning when `memory` is unset), so
# passing the shared `preprocessor` object here would refit the very transformer
# that lives inside the already-fitted KNN pipeline.
pipe2 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('lasso', LassoCV(max_iter=10000))  # regularisation strength chosen by built-in CV
])
# Fit pipeline to training data
pipe2.fit(x_train, y_train)

# Report RMSE on the held-out split (root mean squared error, not MSE)
y_preds2 = pipe2.predict(x_test)
root_mean_squared_error(y_test, y_preds2)

1230.5487384321339

### Actual Test Set :)

In [32]:
# Select the feature columns from the same lists the preprocessor was built
# with (x_nums / x_cat), rather than retyping them, so the scored columns
# always match the training columns.
test_small = test[x_nums + x_cat]

# Score the test frame with the fitted Lasso pipeline.
predictions = pipe2.predict(test_small)

# Pair each prediction with its row identifier.
output = pd.DataFrame({
    'Id': test["Id"],
    "Price": predictions
})

# Save the DataFrame to CSV without the pandas index column
output.to_csv('JH-predicted_prices.csv', index=False)