# Readme

* Find model that best predicts the housing prices for dc_airbnb_test/dc_airbnb_train
* Visualize KNN-neighbors and MSE
* Two feature sets
    * features variant 1: features = ['accommodates', 'bathrooms']
    * features variant 2: features = ['accommodates', 'bathrooms', 'bedrooms']
* Save the best KNN-neighbors/MSE combination for each feature set in the dicts two_hyp_mse (variant 1) and three_hyp_mse (variant 2)
* KNN-neighbors from 1 to 20

# Import modules

In [74]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Get data

In [75]:
# Load the two CSV files: train_df is used for fitting, test_df for scoring.
train_df, test_df = (
    pd.read_csv('../data/dc_airbnb_train.csv'),
    pd.read_csv('../data/dc_airbnb_test.csv'),
)

# Setup

In [76]:
# Candidate neighbor counts (k) to evaluate: 1 through 20 inclusive.
hyper_params = list(range(1, 21))

# Name of the column the regressors are trained to predict.
target_column = 'price'

# Feature-column combinations to compare; each inner list is one variant.
feature_sets = [
    ['accommodates', 'bathrooms'],
    ['accommodates', 'bathrooms', 'bedrooms'],
]

# Calc mse based on two feature sets and k from 1 to 20

In [94]:
# Fit one brute-force KNN regressor per (feature set, k) pair on train_df
# and score it with mean squared error on test_df.
#
# Scores are gathered in plain lists and the DataFrame is built once at the
# end. This avoids growing an empty frame cell by cell, and it keeps the 'k'
# column as integers instead of letting it be upcast to float.
#
# Resulting `errors` frame: index labels are the k values, column 'k' holds
# k itself, and there is one MSE column per feature set named by joining the
# feature names with '_' (e.g. 'accommodates_bathrooms').
mse_columns = {}

for features in feature_sets:
    column_name = '_'.join(features)
    column_scores = []

    for k in hyper_params:
        knn = KNeighborsRegressor(n_neighbors=k, algorithm='brute')
        knn.fit(train_df[features], train_df[target_column])
        predictions = knn.predict(test_df[features])
        mse = mean_squared_error(test_df[target_column], predictions)
        column_scores.append(mse)

    mse_columns[column_name] = column_scores

# 'k' goes first so the column order matches feature_sets order after it.
errors = pd.DataFrame({'k': hyper_params, **mse_columns}, index=hyper_params)

## Get best values

In [89]:
# For each feature set, keep only k and that set's MSE column, order the rows
# by ascending MSE, and renumber them from 0 so row 0 is the lowest-MSE k.
two_col = 'accommodates_bathrooms'
three_col = 'accommodates_bathrooms_bedrooms'

two_features = (
    errors[['k', two_col]]
    .sort_values(by=[two_col])
    .reset_index(drop=True)
)
three_features = (
    errors[['k', three_col]]
    .sort_values(by=[three_col])
    .reset_index(drop=True)
)

In [95]:
# Row label 0 of each ranked frame is read as the best (k, MSE) pair and
# stored as a one-entry dict of the form {k: mse}.
best_k_two = two_features.at[0, 'k']
best_mse_two = two_features.at[0, 'accommodates_bathrooms']
two_hyp_mse = {best_k_two: best_mse_two}

best_k_three = three_features.at[0, 'k']
best_mse_three = three_features.at[0, 'accommodates_bathrooms_bedrooms']
three_hyp_mse = {best_k_three: best_mse_three}

# Print the two-feature result first, then the three-feature result.
for best_pair in (two_hyp_mse, three_hyp_mse):
    print(best_pair)

{5.0: 14790.314266211606}
{7.0: 13518.769009310208}
