In [36]:
import re
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV,ElasticNet, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt

test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

### EDA 

In [14]:
x = train.drop(columns = "price")
y = train['price']

# Picked some variables I want to test for effect
x_vars = train[['host_id', 'host_is_superhost', 'host_listings_count', 'latitude', 'longitude', 'accommodates', 'has_availability', 'review_scores_rating', 'review_scores_location']]

# Got just the unit id
ids = train['Id']

# Changed my boolean vars to 1's and 0's
x_vars.loc[:, 'host_is_superhost'] = x_vars['host_is_superhost'].map({'f': 0, 't': 1})
x_vars.loc[:, 'has_availability'] = x_vars['has_availability'].map({'f': 0, 't': 1})
x_vars['host_is_superhost'].fillna(0, inplace = True)
x_vars['host_listings_count'].fillna(0, inplace = True)
x_vars['review_scores_rating'].fillna(train['review_scores_rating'].median(), inplace=True)
x_vars['review_scores_location'].fillna(train['review_scores_location'].median(), inplace=True)

x_vars.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_vars['host_is_superhost'].fillna(0, inplace = True)
  x_vars['host_is_superhost'].fillna(0, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_vars['host_is_superhost'].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.m

Unnamed: 0,host_id,host_is_superhost,host_listings_count,latitude,longitude,accommodates,has_availability,review_scores_rating,review_scores_location
0,4211733,0.0,1.0,37.815091,-122.237531,4,1,4.93,4.93
1,1257432,1.0,32.0,37.770767,-122.427483,2,1,5.0,5.0
2,9219277,1.0,10.0,37.75927,-122.48258,2,1,5.0,5.0
3,13149124,1.0,3.0,37.41096,-122.12144,1,1,5.0,4.86
4,501999278,0.0,1781.0,37.37019,-122.04374,4,1,4.75,4.88


### Feature Selection

In [17]:
selector = SelectKBest(score_func=f_regression, k = 'all')

selector.fit(x_vars,y)

# Get results
var_summary = pd.DataFrame({
    'feature' : x_vars.columns,
    'score' : selector.scores_,
    'p-value' : selector.pvalues_
})
var_summary

Unnamed: 0,feature,score,p-value
0,host_id,1.34489,0.2461915
1,host_is_superhost,29.622354,5.33055e-08
2,host_listings_count,1.166578,0.2801226
3,latitude,10.539054,0.001171302
4,longitude,28.208798,1.10435e-07
5,accommodates,68.365908,1.468959e-16
6,has_availability,1.773796,0.1829328
7,review_scores_rating,9.309839,0.002283176
8,review_scores_location,4.449744,0.03492271


#### Variables to Include :)
* host_is_superhost
* Latitude
* Longitude
* Accomodates
* review_scores_rating
* review_scores_location


In [30]:
x1 = x_vars[['host_is_superhost', 'latitude', 'longitude', 'accommodates', 'review_scores_rating', 'review_scores_location']]

#### Fit KNN and predict

In [45]:
# Standardizing some stuff :)
scalar = StandardScaler()
x1_scale = scalar.fit_transform(x1)

# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x1_scale, y, test_size= 0.2, random_state=317)

knn = KNeighborsRegressor(n_neighbors= 10)
knn.fit(x_train, y_train)

# Predict and calc RMSE
yhat = knn.predict(x_test)

root_mean_squared_error(y_test, yhat)

1317.2068038783077

### Actual Test Set :)

In [None]:
x_test = test.drop(columns = "price")
x1_test = x_test[['host_is_superhost', 'latitude', 'longitude', 'accommodates', 'review_scores_rating', 'review_scores_location']]
x1_test.loc[:, 'host_is_superhost'] = x_vars['host_is_superhost'].map({'f': 0, 't': 1})
x1_test['host_is_superhost'].fillna(0, inplace = True)
x_vars['review_scores_rating'].fillna(train['review_scores_rating'].median(), inplace=True)
x_vars['review_scores_location'].fillna(train['review_scores_location'].median(), inplace=True)