In [2]:
import re
import time
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV,ElasticNet, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import root_mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
import matplotlib.pyplot as plt

test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

### Choosing Variables

In [3]:
x = train.drop(columns = "price")
y = train['price'] 

# Get numeric variables :)
x_num = x.select_dtypes(include = 'number')

# Get rid of dumb variables
x_num = x_num.drop(columns = ['bedrooms', 'host_id'])

x_nums = ['host_listings_count', 'accommodates', 'number_of_reviews', 'review_scores_rating', 'review_scores_location']

x_cat = ['property_type', 'neighbourhood_cleansed', 'host_is_superhost']

x_full = x[['host_listings_count', 'accommodates', 'number_of_reviews', 'review_scores_rating', 'review_scores_location','property_type', 'neighbourhood_cleansed', 'host_is_superhost']]

# Split training data into train/test :)
x_train, x_test, y_train, y_test = train_test_split(x_full, y, test_size= 0.2, random_state=317)

#### PIPELINES FOR THE WIN

#### KNN Pipeline

In [4]:

# Build Pipeline
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy= 'mean')),
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse_output= False, handle_unknown='ignore')),
    ('select', SelectPercentile(f_regression, percentile = 20))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, x_nums),
    ('cat', cat_pipe, x_cat)
])

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors= 60, weights='distance'))
])
# Fit pipeline to training data
pipe.fit(x_train, y_train)

# Report test MSE
y_preds = pipe.predict(x_test)
root_mean_squared_error(y_test, y_preds)

1271.8671457994592

#### Lasso Pipeline

In [8]:
pipe2 = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', LassoCV(max_iter=10000))
])
# Fit pipeline to training data
pipe2.fit(x_train, y_train)

# Report test MSE
y_preds2 = pipe2.predict(x_test)
root_mean_squared_error(y_test, y_preds2)

1230.5487384321339

In [14]:
# After fitting LassoCV model
pipe2.named_steps['lasso'].coef_

array([ 1.74380287e+02,  4.32487166e+01, -1.88839852e+02, -3.00433289e+01,
        4.39650330e+01, -8.84783862e+01, -2.41918863e+00, -4.21327613e+02,
        1.48840780e+01, -1.26226609e+01,  1.08906002e+01, -4.91136660e-01,
        4.35827671e+01, -9.95082772e+00,  8.99359986e+00,  1.44556746e+02,
       -1.90940529e+01, -5.19797631e+00, -4.18181472e+00,  2.06887234e+00,
       -1.50955317e+01,  1.99166536e+01,  1.83236529e+02,  0.00000000e+00,
        8.44027853e+01,  7.39652401e+02, -0.00000000e+00, -6.08885655e+00,
       -6.54485825e+00, -4.64580690e+01, -1.86468423e+02, -0.00000000e+00,
        0.00000000e+00, -7.90409263e+00,  1.45118837e+03,  1.09002972e+02,
       -3.76328389e+01, -6.51392563e+02, -1.99780332e+01, -0.00000000e+00,
        3.89560553e+02, -0.00000000e+00,  1.47858347e+02, -6.85852995e+01,
        4.10809461e+01, -2.17999507e+01,  4.49557134e+02,  9.11097073e+02,
       -0.00000000e+00,  5.70380634e+02,  1.08119191e+03, -1.81109527e+01,
       -1.20584718e+02, -

In [13]:
x_train.columns

Index(['host_listings_count', 'accommodates', 'number_of_reviews',
       'review_scores_rating', 'review_scores_location', 'property_type',
       'neighbourhood_cleansed', 'host_is_superhost'],
      dtype='object')

### Actual Test Set :)

In [6]:
test_small = test[['host_listings_count', 'accommodates', 'number_of_reviews', 'review_scores_rating', 'review_scores_location','property_type', 'neighbourhood_cleansed', 'host_is_superhost']]
predictions = pipe2.predict(test_small)

output = pd.DataFrame({
    'Id': test["Id"],
    "Price": predictions
})

# Save the DataFrame to CSV
output.to_csv('JH-predicted_prices.csv', index=False)

output.head()

Unnamed: 0,Id,Price
0,PSJEN,-94.857539
1,PVZW7,83.672983
2,EJLAM,1004.647578
3,SDHPB,-595.160199
4,MJGYX,443.447546
