# 2. MULTIVARIARE K-NEAREST NEIGHBORS
---

## 1. Introducing the Data

In [1]:
import pandas as pd
import numpy as np

file = 'data/dc_airbnb.csv'
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price', 
        'minimum_nights', 'maximum_nights', 'number_of_reviews']
dc = pd.read_csv(file)
dc = dc[cols]
dc.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,4,1.0,1.0,2.0,$160.00,1,1125,0
1,6,3.0,3.0,3.0,$350.00,2,30,65
2,1,1.0,2.0,1.0,$50.00,2,1125,1
3,2,1.0,1.0,1.0,$95.00,1,1125,0
4,4,1.0,1.0,1.0,$50.00,7,1125,0


In [2]:
dc.shape

(3723, 8)

In [30]:
dc.describe()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,minimum_nights,maximum_nights,number_of_reviews
count,3723.0,3702.0,3696.0,3712.0,3723.0,3723.0,3723.0
mean,3.195004,1.210157,1.256358,1.643319,2.250067,580306.9,15.306742
std,2.012216,0.839851,0.585539,1.182117,3.622879,35195520.0,29.645586
min,1.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,1.0,120.0,1.0
50%,2.0,1.0,1.0,1.0,2.0,1125.0,4.0
75%,4.0,1.0,1.0,2.0,3.0,1125.0,16.0
max,16.0,10.0,8.0,16.0,180.0,2147484000.0,362.0


In [3]:
dc.dtypes

accommodates           int64
bedrooms             float64
bathrooms            float64
beds                 float64
price                 object
minimum_nights         int64
maximum_nights         int64
number_of_reviews      int64
dtype: object

## 2. Train-Test Splitting

In [26]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dc, test_size=0.2, random_state=42)
train.shape, test.shape

((2978, 8), (745, 8))

## 3. Cleaning Price Column

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

class PriceCleaner(BaseEstimator, TransformerMixin):    
    def __init__(self):
        self  
    def fit(self, df, y = None):               
        return self
    def transform(self, df):               
        df = df.copy()
        no_comma = df['price'].str.replace(',', '')
        no_dollar = no_comma.str.replace('$', '')
        df['price'] = no_dollar.astype('float')
        return df
    
pricer = PriceCleaner()
train_clean = pricer.transform(train)
train_clean.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
2196,4,1.0,1.0,2.0,129.0,2,1125,15
209,4,1.0,1.0,1.0,295.0,1,1125,20
3368,2,1.0,1.0,1.0,140.0,1,1125,9
2771,5,2.0,2.0,3.0,217.0,3,730,19
1588,6,2.0,1.0,2.0,150.0,2,35,15


In [28]:
train_clean.describe()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
count,2978.0,2957.0,2957.0,2969.0,2978.0,2978.0,2978.0,2978.0
mean,3.203828,1.210348,1.256848,1.640283,150.057085,2.291807,4172.699,15.553056
std,2.028788,0.841491,0.58646,1.156521,146.110268,3.97521,183242.1,29.587263
min,1.0,0.0,0.0,1.0,20.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,85.0,1.0,120.0,1.0
50%,2.0,1.0,1.0,1.0,115.0,2.0,1125.0,4.0
75%,4.0,1.0,1.0,2.0,165.0,3.0,1125.0,17.0
max,16.0,10.0,8.0,16.0,2822.0,180.0,9999999.0,338.0


In [29]:
train_clean['price'].dtypes

dtype('float64')

## 4. Imputing Missing Data

In [44]:
train_clean.isnull().mean().sort_values(ascending=True)

accommodates         0.000000
price                0.000000
minimum_nights       0.000000
maximum_nights       0.000000
number_of_reviews    0.000000
beds                 0.003022
bedrooms             0.007052
bathrooms            0.007052
dtype: float64

In [45]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
imputer.fit(train_clean)
train_imputed = imputer.transform(train_clean)
pd.DataFrame(train_imputed).isnull().mean()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
dtype: float64

## 4. Data Standardization

In [46]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_imputed)
train_scaled = pd.DataFrame(train_scaled)
train_scaled.columns = cols
train_scaled.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.392503,-0.249075,-0.436195,0.313088,-0.144142,-0.073419,-0.016635,-0.018695
1,0.392503,-0.249075,-0.436195,-0.552628,0.992177,-0.32502,-0.016635,0.150325
2,-0.593473,-0.249075,-0.436195,-0.552628,-0.068844,-0.32502,-0.016635,-0.22152
3,0.885491,0.943443,1.274127,1.178805,0.458244,0.178182,-0.018791,0.116521
4,1.378479,0.943443,-0.436195,0.313088,-0.000391,-0.073419,-0.022584,-0.018695


In [47]:
scaler.mean_

array([3.20382807e+00, 1.20886501e+00, 1.25503694e+00, 1.63834788e+00,
       1.50057085e+02, 2.29180658e+00, 4.17269879e+03, 1.55530557e+01])

In [48]:
scaler.scale_

array([2.02844748e+00, 8.38561475e-01, 5.84685184e-01, 1.15511254e+00,
       1.46085734e+02, 3.97454206e+00, 1.83211282e+05, 2.95822953e+01])

In [49]:
scaler.n_samples_seen_

2978

In [50]:
train_scaled.describe()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
count,2978.0,2978.0,2978.0,2978.0,2978.0,2978.0,2978.0,2978.0
mean,9.081609e-17,-4.3320320000000005e-17,1.521431e-16,9.357488000000001e-17,8.240927000000001e-17,2.654768e-16,-4.012605e-16,-4.615366e-17
std,1.000168,1.000168,1.000168,1.000168,1.000168,1.000168,1.000168,1.000168
min,-1.086461,-1.441594,-2.146517,-0.5526283,-0.8902792,-0.3250202,-0.02276988,-0.5257555
25%,-0.5934726,-0.2490754,-0.4361953,-0.5526283,-0.445335,-0.3250202,-0.02212036,-0.4919515
50%,-0.5934726,-0.2490754,-0.4361953,-0.5526283,-0.2399761,-0.07341892,-0.01663489,-0.3905395
75%,0.3925031,-0.2490754,-0.4361953,0.3130882,0.1022887,0.1781824,-0.01663489,0.04891251
max,6.308358,10.48359,11.53606,12.43312,18.29024,44.71161,54.55901,10.9


## 5. Training KNN on All Features

In [52]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

X_train = train_scaled.drop('price', axis=1)
y_train = train_scaled['price']

knn = KNeighborsRegressor(algorithm='brute')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print('MSE_train:', mse)
print('RMSE_train:', rmse)

MSE_train: 0.4822721804508388
RMSE_train: 0.6944581920107493


In [37]:

test_clean = pricer.transform(train)
train_clean.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,minimum_nights,maximum_nights,number_of_reviews
0,0.392503,-0.250013,-0.438038,0.311086,-0.073419,-0.016635,-0.018695
1,0.392503,-0.250013,-0.438038,-0.553722,-0.32502,-0.016635,0.150325
2,-0.593473,-0.250013,-0.438038,-0.553722,-0.32502,-0.016635,-0.22152
3,0.885491,0.938555,1.267396,1.175894,0.178182,-0.018791,0.116521
4,1.378479,0.938555,-0.438038,0.311086,-0.073419,-0.022584,-0.018695


In [38]:
y_train.head()

0   -0.144142
1    0.992177
2   -0.068844
3    0.458244
4   -0.000391
Name: price, dtype: float64

In [39]:
X_train.shape, y_train.shape

((2978, 7), (2978,))