# Readme

Uses scikit-learn's k-nearest-neighbors (KNN) regressor to predict housing prices.

***

First, a mostly hardcoded version that uses two feature columns.
Then, a parametrized version that starts with four feature columns and is reused for other feature sets.

# General code

## Import

In [1]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

## Get data (train + test)

In [2]:
# Load the listings table; the path is relative to the notebook's location.
normalized_listings = pd.read_csv('../data/normalized_listings.csv')
normalized_listings.info()

# Split settings, named so the seed and the ratio are visible in one place.
# NOTE(review): 0.7605... x 3671 rows is roughly 2792, so this fraction
# presumably targets a 2792/879 train/test split -- confirm before changing it.
SPLIT_RANDOM_STATE = 1
TRAIN_FRACTION = 0.760555706891855

train_df, test_df = train_test_split(
    normalized_listings,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_RANDOM_STATE,
)

# Jupyter only renders a bare expression when it is the last statement of the
# cell; placed mid-cell, the summary statistics were computed and discarded.
normalized_listings.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3671 entries, 0 to 3670
Data columns (total 9 columns):
Unnamed: 0           3671 non-null int64
accommodates         3671 non-null float64
bedrooms             3671 non-null float64
bathrooms            3671 non-null float64
beds                 3671 non-null float64
price                3671 non-null float64
minimum_nights       3671 non-null float64
maximum_nights       3671 non-null float64
number_of_reviews    3671 non-null float64
dtypes: float64(8), int64(1)
memory usage: 258.2 KB


***

# Hardcoded version with two features

## Fit the model for two features

In [3]:
# Configuration for the hardcoded two-feature run.
target_column = 'price'                          # column the model predicts
feature_columns = ['accommodates', 'bathrooms']  # columns used as predictors

In [4]:
# 5-nearest-neighbours regressor using the brute-force neighbour search.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# Kept as the cell's final expression so the fitted estimator's repr is shown.
knn.fit(X=train_df[feature_columns], y=train_df[target_column])

KNeighborsRegressor(algorithm='brute')

## Predict

In [5]:
# Predict the target for the held-out rows from the same feature columns
# the model was fitted on.
predicted_values = knn.predict(X=test_df[feature_columns])

## Evaluate

In [6]:
true_values = test_df[target_column]

# Compute the MSE once and derive the RMSE from it rather than calling
# mean_squared_error a second time on the same inputs.
two_features_mse = mean_squared_error(true_values, predicted_values)
two_features_rmse = two_features_mse ** 0.5
two_features_mae = round(mean_absolute_error(true_values, predicted_values), 2)

# Sanity-check the metrics against manual NumPy computations.
# - Compared with a tolerance: exact `==` on floats can fail on rounding noise.
# - Deliberately avoids mean_squared_error(..., squared=False): that keyword is
#   deprecated and later removed in newer scikit-learn releases.
residuals = np.asarray(true_values) - np.asarray(predicted_values)
assert np.isclose(two_features_rmse, np.sqrt(np.mean(residuals ** 2)))
# The MAE above is rounded to 2 decimals, so allow half a unit in the last place.
assert np.isclose(two_features_mae, np.mean(np.abs(residuals)), atol=0.005)

print(two_features_mse)
print(two_features_rmse)
print(two_features_mae)

13074.825620022753
114.34520374734899
60.32


***

# Parametrized version
* KNN with k = 5
* algo = 'brute'
* features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
* target_column = 'price'
* assign predicted price to four_predictions
* calc mse + rmse

In [7]:
def get_eval(train_data, test_data, feature_columns, target_column,
             n_neighbors=5, algorithm='brute'):
    """Fit a KNN regressor on the training data and score it on the test data.

    Parameters
    ----------
    train_data
        Table used for fitting; indexed by column label with
        ``feature_columns`` and ``target_column``.
    test_data
        Table used for scoring; indexed the same way as ``train_data``.
    feature_columns
        List of column labels used as model inputs.
    target_column
        Label of the column to predict.
    n_neighbors
        Number of neighbours passed to ``KNeighborsRegressor`` (default 5).
    algorithm
        Neighbour-search algorithm passed to ``KNeighborsRegressor``
        (default ``'brute'``).

    Returns
    -------
    tuple
        ``(mse, rmse)``: the mean squared error on the test data and its
        square root.
    """
    # The inputs are only read (column selection), so no defensive copies
    # of the two tables are made.
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, algorithm=algorithm)
    knn.fit(train_data[feature_columns], train_data[target_column])

    # Name kept from the exercise brief; it holds the predictions for whatever
    # feature set was passed in, not only the four-feature case.
    four_predictions = knn.predict(test_data[feature_columns])

    mse = mean_squared_error(test_data[target_column], four_predictions)
    return mse, mse ** 0.5

In [8]:
# Label of the column the evaluations in this section predict.
target_column: str = 'price'

## Predict and compare

### Four Features

In [9]:
# Score the four-feature model; get_eval returns the pair (MSE, RMSE).
mse, rmse = get_eval(
    train_data=train_df,
    test_data=test_df,
    feature_columns=['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews'],
    target_column=target_column,
)

print(mse, rmse)

12668.449647326508 112.55420759494737


### One feature (accommodates)

In [10]:
# Score a model that uses only 'accommodates'; get_eval returns (MSE, RMSE).
mse, rmse = get_eval(
    train_data=train_df,
    test_data=test_df,
    feature_columns=['accommodates'],
    target_column=target_column,
)

print(mse, rmse)

15169.445733788392 123.16430381319253


### One feature (bathrooms)

In [11]:
# Score a model that uses only 'bathrooms'; get_eval returns (MSE, RMSE).
mse, rmse = get_eval(
    train_data=train_df,
    test_data=test_df,
    feature_columns=['bathrooms'],
    target_column=target_column,
)

print(mse, rmse)

13440.215199089875 115.93194210005228


### Two features

In [12]:
# Score the two-feature model through get_eval, which returns (MSE, RMSE).
mse, rmse = get_eval(
    train_data=train_df,
    test_data=test_df,
    feature_columns=['accommodates', 'bathrooms'],
    target_column=target_column,
)

print(mse, rmse)

13074.825620022753 114.34520374734899


### All features

In [18]:
# Use every column as a feature except the target and 'Unnamed: 0'.
# - The target is excluded via `target_column` rather than a repeated 'price'
#   literal, so changing the target cannot leak it into the feature set.
# - Index.drop raises KeyError when a label is missing, so an unexpected schema
#   still fails loudly, as the previous list.remove() calls did.
# NOTE(review): 'Unnamed: 0' is presumably a row index that was written into
# the CSV -- confirm, and consider dropping it at load time instead.
features = train_df.columns.drop([target_column, 'Unnamed: 0']).tolist()

mse, rmse = get_eval(train_df, test_df, features, target_column)

print(mse, rmse)

11195.285415244598 105.80777577874227
