# Introduction to K-Nearest Neighbors

In [1]:
import pandas as pd
import numpy as np

In [2]:
dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings.head(2)

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC


In [3]:
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3723 entries, 0 to 3722
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   host_response_rate    3289 non-null   object 
 1   host_acceptance_rate  3109 non-null   object 
 2   host_listings_count   3723 non-null   int64  
 3   accommodates          3723 non-null   int64  
 4   room_type             3723 non-null   object 
 5   bedrooms              3702 non-null   float64
 6   bathrooms             3696 non-null   float64
 7   beds                  3712 non-null   float64
 8   price                 3723 non-null   object 
 9   cleaning_fee          2335 non-null   object 
 10  security_deposit      1426 non-null   object 
 11  minimum_nights        3723 non-null   int64  
 12  maximum_nights        3723 non-null   int64  
 13  number_of_reviews     3723 non-null   int64  
 14  latitude              3723 non-null   float64
 15  longitude            

## What is the similarity metric
## How to  choose the k value
***
* The similarity metric works by comparing a fixed set of numerical **features**, another word for attributes, between 2 observations, or living spaces.
* For prediction of continuous like price, the main similarity metric that's used is **Euclidean distance**.
\begin{equation}
d = \sqrt{(q_1 - p_1)^2 + (q_2 - p_2)^2 + \cdots + (q_n - p_n)^2}
\end{equation}
** If onlu using one feature, it's known as the **univariate case**
\begin{equation}
d = \sqrt{(q_1 - p_1)^2}
\\
\text{The square root simplifies to}
\\
d = |q_1 - p_1|
\end{equation}

In [5]:
first_distance = abs(3-4)

In [7]:
distance = abs(dc_listings.accommodates - 3)
dc_listings['distance'] = distance
distance.value_counts()

1     2294
2      503
0      461
3      279
5       73
4       35
7       22
6       17
9       12
13       8
8        7
12       6
11       4
10       2
Name: accommodates, dtype: int64

In [9]:
dc_listings.loc[dc_listings.distance==0, 'accommodates']

26      3
34      3
36      3
40      3
44      3
       ..
3675    3
3697    3
3707    3
3714    3
3722    3
Name: accommodates, Length: 461, dtype: int64

In [12]:
np.random.seed(1)
random_values = np.random.permutation(dc_listings.shape[0])
random_values

array([ 574, 1593, 3091, ..., 1096,  235, 1061])

In [14]:
np.random.permutation(5)

array([3, 0, 2, 4, 1])

In [16]:
dc_listings = dc_listings.loc[random_values]
dc_listings

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state,distance
574,100%,100%,1,2,Private room,1.0,1.0,1.0,$125.00,,$300.00,1,4,149,38.913548,-77.031981,Washington,20009,DC,1
1593,87%,100%,2,2,Private room,1.0,1.5,1.0,$85.00,$15.00,,1,30,49,38.953431,-77.030695,Washington,20011,DC,1
3091,100%,,1,1,Private room,1.0,0.5,1.0,$50.00,,,1,1125,1,38.933491,-77.029679,Washington,20010,DC,2
420,58%,51%,480,2,Entire home/apt,1.0,1.0,1.0,$209.00,$150.00,,4,730,2,38.904054,-77.051991,Washington,20037,DC,1
808,100%,95%,3,12,Entire home/apt,5.0,2.0,5.0,$215.00,$135.00,$100.00,2,1825,34,38.906118,-76.988873,Washington,20002,DC,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2763,100%,100%,1,1,Entire home/apt,0.0,1.0,1.0,$75.00,,,4,20,1,38.924325,-77.034065,Washington,20009,DC,2
905,100%,25%,6,1,Entire home/apt,0.0,1.0,1.0,$95.00,$50.00,,5,1125,0,38.905723,-77.057786,Washington,20007,DC,2
1096,92%,99%,23,2,Shared room,1.0,0.0,1.0,$100.00,,,1,1125,15,38.907376,-77.044780,Washington,20036,DC,1
235,100%,63%,2,8,Entire home/apt,2.0,1.0,4.0,$194.00,$89.00,,1,1125,8,38.899648,-77.017537,Washington,20001,DC,5


In [17]:
dc_listings.sort_values('distance', inplace=True)
dc_listings.iloc[:10]

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state,distance
577,98%,52%,49,3,Private room,1.0,1.0,2.0,$185.00,,,2,14,1,38.908356,-77.028146,Washington,20005,DC,0
2166,100%,89%,2,3,Entire home/apt,1.0,1.0,1.0,$180.00,,$100.00,1,14,10,38.905808,-77.000012,Washington,20002,DC,0
3631,98%,52%,49,3,Entire home/apt,1.0,1.0,2.0,$175.00,,,3,14,1,38.889065,-76.993576,Washington,20003,DC,0
71,100%,94%,1,3,Entire home/apt,1.0,1.0,1.0,$128.00,$40.00,,1,1125,9,38.87996,-77.006491,Washington,20003,DC,0
1011,,,1,3,Entire home/apt,0.0,1.0,1.0,$115.00,,,1,1125,0,38.907382,-77.035075,Washington,20005,DC,0
380,58%,51%,480,3,Entire home/apt,,1.0,1.0,$219.00,,,4,1125,0,38.90082,-77.052956,Washington,20037,DC,0
943,,,1,3,Private room,1.0,1.0,1.0,$125.00,$25.00,,1,1125,3,38.913882,-77.038468,Washington,20009,DC,0
3107,,,1,3,Entire home/apt,1.0,1.0,1.0,$250.00,,,1,1125,0,38.934027,-77.035193,Washington,20010,DC,0
1499,62%,77%,1,3,Entire home/apt,0.0,1.0,2.0,$94.00,,,1,1125,2,38.87803,-77.019914,Washington,20024,DC,0
625,100%,0%,1,3,Entire home/apt,1.0,1.0,1.0,$150.00,,,1,1125,5,38.907031,-77.02951,Washington,20005,DC,0


In [19]:
dc_listings.price = dc_listings.price.str.replace('$','').str.replace(',','').astype(float)

In [21]:
mean_price = dc_listings.price.iloc[:5].mean()
mean_price

156.6

In [22]:
dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings.price = dc_listings['price'].str.replace(',','').str.replace('$', '').astype('float')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]

In [23]:
def predict_price(new_listing):
    temp_df = dc_listings.copy()
    temp_df['distance'] = abs(temp_df['accommodates'] - new_listing)
    temp_df = temp_df.sort_values('distance')
    mean_price = temp_df.iloc[:5]['price'].mean()
    return mean_price

acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)
print(acc_one, acc_two, acc_four)

57.6 102.6 195.8


In [24]:
import math
def euclidean_distance(x,y):
    return math.sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))
print(euclidean_distance([0,3,4,5], [7,6,3,-1]))

9.746794344808963


# Evaluating Model Perfomance
## train/test validation

In [26]:
train_df = dc_listings.iloc[:2792]
test_df = dc_listings.iloc[2792:]

In [27]:
def predict_price(new_listing):
    ## DataFrame.copy() performs a deep copy
    temp_df = train_df.copy()
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbor_prices = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbor_prices.mean()
    return predicted_price

In [28]:
test_df['predicted_price'] = test_df.accommodates.apply(predict_price)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Error metric: 
A class of metric that quantifies how good the predictions were on teh test set.
## Mean absolute error
\begin{equation}
MAE = \frac{1}{n}\sum_{k=1}^n|(actual_1 - predicted_1)| + \cdots + |(actual_n - predicted_n)|
\end{equation}

In [33]:
mae = (np.absolute(test_df.price - test_df.predicted_price)).mean()
mae

64.88485499462966

In [34]:
test_df['error'] = np.absolute(test_df.predicted_price - test_df.price)
test_df['error'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


64.88485499462966

## Mean squared error:
To penalize predicted values that are further away from the actual value far more than those closer to the actual value.
* **MSE** makes the gap between the predicted and actual values more clear.
\begin{equation}
MSE = \frac{1}{n} \sum_{k=1}^{n} (actual_1 - predicted_1)^{2} + \cdots + (actual_n - predicted_n)^{2}
\end{equation}

In [35]:
mse = np.square(test_df.error).mean()
mse

10791.289495166504

In [36]:
def predict_price(new_listing):
    ## DataFrame.copy() performs a deep copy
    temp_df = train_df.copy()
    temp_df['distance'] = temp_df['bathrooms'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbor_prices = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbor_prices.mean()
    return predicted_price

In [37]:
test_df['predicted_price'] = test_df.bathrooms.apply(predict_price)
test_df['squared_error'] = (test_df['predicted_price'] - test_df['price']) ** 2
mse = test_df['squared_error'].mean()
mse

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


10690.458861439267

In [38]:
## RMSE
rmse = np.sqrt(mse)
rmse

103.39467520834556

## Different error metrics
\begin{equation}
MAE = \frac{1}{n} \sum_{k=1}^{n} \lvert (actual_1 - predicted_1) \rvert + \cdots + \lvert (actual_n - predicted_n) \rvert
\\
RMSE = \sqrt { \frac{ \sum_{k=1}^{n} (actual_1 - predicted_1)^2 + \cdots + (actual_n - predicted_n)^2 } {n} }
\end{equation}
## MAE grow linearly
## Looking at the ratio of MAE to RMSE can help us understand if there are large but infrequent errors

In [41]:
np.random.seed(1)
dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings = dc_listings.loc[np.random.permutation(dc_listings.shape[0])]
dc_listings['price'] = dc_listings['price'].str.replace(',','').str.replace('$','').astype('float')
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   host_response_rate    3289 non-null   object 
 1   host_acceptance_rate  3109 non-null   object 
 2   host_listings_count   3723 non-null   int64  
 3   accommodates          3723 non-null   int64  
 4   room_type             3723 non-null   object 
 5   bedrooms              3702 non-null   float64
 6   bathrooms             3696 non-null   float64
 7   beds                  3712 non-null   float64
 8   price                 3723 non-null   float64
 9   cleaning_fee          2335 non-null   object 
 10  security_deposit      1426 non-null   object 
 11  minimum_nights        3723 non-null   int64  
 12  maximum_nights        3723 non-null   int64  
 13  number_of_reviews     3723 non-null   int64  
 14  latitude              3723 non-null   float64
 15  longitude          

### Remove non-numerical values: room_type, city, state, numerical but non-ordinal values: latitude, longitude, zipcode, also columns describe the host and not the living space itself: host_response_rate, host_acceptance_rate, host_listings_count

In [42]:
dc_listings.drop(['room_type', 'city', 'state', 'latitude', 'longitude', 'zipcode', 'host_response_rate',
                 'host_acceptance_rate', 'host_listings_count'], axis=1, inplace=True)
dc_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews
574,2,1.0,1.0,1.0,125.0,,$300.00,1,4,149
1593,2,1.0,1.5,1.0,85.0,$15.00,,1,30,49
3091,1,1.0,0.5,1.0,50.0,,,1,1125,1
420,2,1.0,1.0,1.0,209.0,$150.00,,4,730,2
808,12,5.0,2.0,5.0,215.0,$135.00,$100.00,2,1825,34


In [43]:
dc_listings.isnull().sum()

accommodates            0
bedrooms               21
bathrooms              27
beds                   11
price                   0
cleaning_fee         1388
security_deposit     2297
minimum_nights          0
maximum_nights          0
number_of_reviews       0
dtype: int64

* 3 columns have a few missing values (less than 1%): bedrooms, bathrooms, beds, can drop the rows with null valus
* 2 columns have a large number of missing values: cleaning_fee, security_deposit, revove these two columns

In [44]:
dc_listings = dc_listings.drop(['cleaning_fee', 'security_deposit'], axis = 1)

In [46]:
rows_to_drop = dc_listings[dc_listings.bedrooms.isnull() | dc_listings.bathrooms.isnull() | 
                          dc_listings.beds.isnull()]
rows_to_drop.sample(5)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
3474,6,3.0,,,2000.0,4,4,0
2087,4,,1.0,2.0,325.0,4,7,0
2290,4,,1.0,1.0,149.0,4,365,34
1972,4,1.0,,,275.0,1,1125,0
1749,2,1.0,,,190.0,2,14,0


In [47]:
dc_listings.drop(rows_to_drop.index, inplace = True)
# dc_listings.dropna(inplace=True) also works.
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3671 entries, 574 to 1061
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       3671 non-null   int64  
 1   bedrooms           3671 non-null   float64
 2   bathrooms          3671 non-null   float64
 3   beds               3671 non-null   float64
 4   price              3671 non-null   float64
 5   minimum_nights     3671 non-null   int64  
 6   maximum_nights     3671 non-null   int64  
 7   number_of_reviews  3671 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 258.1 KB


In [48]:
dc_listings.isnull().sum()

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

In [49]:
dc_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,2,1.0,1.0,1.0,125.0,1,4,149
1593,2,1.0,1.5,1.0,85.0,1,30,49
3091,1,1.0,0.5,1.0,50.0,1,1125,1
420,2,1.0,1.0,1.0,209.0,4,730,2
808,12,5.0,2.0,5.0,215.0,2,1825,34


In [50]:
dc_listings.describe()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
count,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0,3671.0
mean,3.195587,1.209752,1.257695,1.64778,148.843639,2.235358,588519.4,15.106783
std,2.00419,0.840801,0.586803,1.184549,137.550045,3.618777,35443910.0,29.236563
min,1.0,0.0,0.0,1.0,10.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,85.0,1.0,120.0,1.0
50%,2.0,1.0,1.0,1.0,115.0,2.0,1125.0,4.0
75%,4.0,1.0,1.0,2.0,165.0,3.0,1125.0,16.0
max,16.0,10.0,8.0,16.0,2822.0,180.0,2147484000.0,362.0


* To prevent any single column from having too much of an impact on the distance, we can **normalize** all of the columns to have a mean of 0 and a standard deviation of 1.
   * from each value, substract the mean of the column
   * divide each value by the standard deviation of the column.
   \begin{equation}
   x = \frac{x-\mu}{\sigma}
   \end{equation}
   x is a value in a specific column, $\mu$ is the mean of all the values in the column, $\sigma$ is the standard deviation ofall the value in the column

In [51]:
normalized_col = (dc_listings.maximum_nights - dc_listings.maximum_nights.mean()) / dc_listings.maximum_nights.std()

In [54]:
columns_except_price = dc_listings.loc[:, dc_listings.columns != 'price'].copy()
normalized_listings = (columns_except_price - columns_except_price.mean()) / columns_except_price.std()
normalized_listings['price'] = dc_listings.price
normalized_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,minimum_nights,maximum_nights,number_of_reviews,price
574,-0.596544,-0.249467,-0.439151,-0.546858,-0.341375,-0.016604,4.57965,125.0
1593,-0.596544,-0.249467,0.412923,-0.546858,-0.341375,-0.016603,1.159275,85.0
3091,-1.095499,-0.249467,-1.291226,-0.546858,-0.341375,-0.016573,-0.482505,50.0
420,-0.596544,-0.249467,-0.439151,-0.546858,0.487635,-0.016584,-0.448301,209.0
808,4.393004,4.507903,1.264998,2.829956,-0.065038,-0.016553,0.646219,215.0


In [56]:
columns = dc_listings.copy()
normalized_listings = (columns - columns.mean()) /columns.std()
normalized_listings['price'] = dc_listings.price
normalized_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,-0.596544,-0.249467,-0.439151,-0.546858,125.0,-0.341375,-0.016604,4.57965
1593,-0.596544,-0.249467,0.412923,-0.546858,85.0,-0.341375,-0.016603,1.159275
3091,-1.095499,-0.249467,-1.291226,-0.546858,50.0,-0.341375,-0.016573,-0.482505
420,-0.596544,-0.249467,-0.439151,-0.546858,209.0,0.487635,-0.016584,-0.448301
808,4.393004,4.507903,1.264998,2.829956,215.0,-0.065038,-0.016553,0.646219


## scipy.spatial diantance.euclidean()

In [57]:
from scipy.spatial import distance
first_list = [0,1]
second_list = [3,4]
distance.euclidean(first_list, second_list)

4.242640687119285

In [60]:
first = normalized_listings.iloc[0][['accommodates', 'bathrooms']]
fifth = normalized_listings.iloc[4][['accommodates', 'bathrooms']]
first_fifth_distance = distance.euclidean(first, fifth)
first_fifth_distance

5.272543124668404

## Scikit-learn workflow:
* instantiate the specific machine learning model you want to use
* fit the model to the training data
* use the model to make predictions
* evaluate the accuracy of the predictions
***
## KNeighborsRegressor class


In [61]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
# default parameters: n_neighbors: the number of neghbors, is set to 5; algorightm: for computing nearest neghbors,
# is set to auto; p: set to 2, corresponding to Euclidean distance.

In [62]:
knn = KNeighborsRegressor(algorithm='brute')

## Fit the model to the data using the fit method. For all models, the fit method takes in 2 required parameters:
* matrix-like object, containning the feature columns we want to use from the training set
* list-like object, containing correct target values.

In [64]:
# Split full dataset into train and test sets.
train_df = normalized_listings.iloc[0:2792]
test_df = normalized_listings.iloc[2792:]

# Matrix-like object, containing just the 2 columns of interest from training set
train_features = train_df[['accommodates', 'bathrooms']]

# List-like object, containing just the target column, 'price'
train_target = train_df['price']
# Pass everything into the fit model
knn.fit(train_features, train_target)

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [65]:
predictions = knn.predict(test_df[['accommodates', 'bathrooms']])

In [69]:
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[['accommodates', 'bathrooms']], train_df['price'])
predictions = knn.predict(test_df[['accommodates', 'bathrooms']])

In [70]:
from sklearn.metrics import mean_squared_error
two_features_mse = mean_squared_error(test_df.price, predictions)
two_features_rmse = np.sqrt(two_features_mse)
print(two_features_mse, two_features_rmse)

15660.39795221843 125.14151170662127


In [72]:
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
train_features = train_df[['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']]
train_target = train_df.price
knn.fit(train_features, train_target)
four_predictions = knn.predict(test_df[['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']])

four_mse = mean_squared_error(test_df.price, four_predictions)
four_rmse = four_mse ** (1/2)

print(four_mse, four_rmse)

13320.230625711036 115.41330350402


In [73]:
train_df.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,-0.596544,-0.249467,-0.439151,-0.546858,125.0,-0.341375,-0.016604,4.57965
1593,-0.596544,-0.249467,0.412923,-0.546858,85.0,-0.341375,-0.016603,1.159275
3091,-1.095499,-0.249467,-1.291226,-0.546858,50.0,-0.341375,-0.016573,-0.482505
420,-0.596544,-0.249467,-0.439151,-0.546858,209.0,0.487635,-0.016584,-0.448301
808,4.393004,4.507903,1.264998,2.829956,215.0,-0.065038,-0.016553,0.646219


In [74]:
features = train_df.columns[train_df.columns != 'price']
features

Index(['accommodates', 'bedrooms', 'bathrooms', 'beds', 'minimum_nights',
       'maximum_nights', 'number_of_reviews'],
      dtype='object')

In [75]:
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[features], train_df['price'])

all_features_predictions = knn.predict(test_df[features])

all_features_mse = mean_squared_error(test_df.price, all_features_predictions)
all_features_rmse = np.sqrt(all_features_mse)

print(all_features_mse, all_features_rmse)

15455.275631399316 124.31924883701363


Interestingly enough, the RMSE value actually increased to 125.1 when we used all of the features available to us. This means that selecting the right features is important and that using more features doesn't automatically improve prediction accuracy. We should re-phrase the lever we mentioned earlier from:

    * increase the number of attributes the model uses to calculate similarity when ranking the closest neighbors

to:

    * select the relevant attributes the model uses to calculate similarity when ranking the closest neighbors

The process of selecting features to use in a model is known as feature selection.