In [104]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split

# Get the data

In [4]:
listings = pd.read_csv('../data/dc_airbnb.csv')
stripped_commas = listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
listings['price'] = stripped_dollars.astype('float')
listings = listings[['price', 'accommodates', 'bathrooms']]

# Version 1
No evaluation

In [None]:
def predict_price(bedrooms_new_listing, original_data, feature, metric_column_name='euclidean_distance'):
    """
    new_listing int: describes the number of bedrooms
    """
    df = original_data.copy()
    df[metric_column_name] = abs(df[feature] - bedrooms_new_listing)
    df.sort_values(by=[metric_column_name], inplace=True)
    df.reset_index(drop=True, inplace=True)
    predicted_price = df[[prediction_column]].loc[0:k-1][prediction_column].mean()
    return predicted_price

In [None]:
k = 5
bedrooms_new_listing = 3
feature = 'accommodates'
metric_column_name = 'euclidean_distance'
prediction_column = 'price'


acc_one = predict_price(1, listings, feature)
acc_two = predict_price(2, listings, feature)
acc_four = predict_price(4, listings, feature)

# Version 2
* Based on Version 1 but with evaluation
* Predict house prices in test set based on k in train

## Steps
1. Get train + test data
2. Predict
3. Evaluate

## Define functions

In [None]:
def predict_price(new_entry_feature_value):
    # get similary
    df = test_data.copy()
    df[sim_metric] = abs(df[feature_column] - new_entry_feature_value)

    # sort and get price based on top k
    df.sort_values(by=[sim_metric], inplace=True)
    df.reset_index(drop=True, inplace=True)
    predicted_price = round(df.loc[0:k-1][prediction_column].mean())
    
    
    return predicted_price

## Prepare the data

In [None]:
# split the data
train_data, test_data = train_test_split(listings, test_size=0.25)
train_data = train_data.copy()
test_data = test_data.copy()

## Setups

In [None]:
# setups
feature_column = 'accommodates'
new_entry_feature_value = 3
k = 5
sim_metric = 'distance'
prediction_column = 'price'
target_column = 'predicted_price'

## Performn the prediction

In [None]:
test_data[target_column] = test_data[feature_column].apply(predict_price)

## Evaluate the prediction

* Using MAE and MSE

In [None]:
actual_values = test_data[prediction_column]
predicted_values = test_data[target_column]

In [None]:
mae = abs(actual_values - predicted_values).mean()
print(mae)

In [None]:
mse = ((actual_values - predicted_values)**2).mean()
print(mse)

# Run different model using bathrooms as feature

In [61]:
listings.head()

Unnamed: 0,price,accommodates,bathrooms
0,160.0,4,1.0
1,350.0,6,3.0
2,50.0,1,2.0
3,95.0,2,1.0
4,50.0,4,1.0


## Get the data

In [87]:
# split the data
train_data, test_data = train_test_split(listings, test_size=0.25)
train_data = train_data.copy()
test_data = test_data.copy()
train_data_temp = train_data.head().copy()

## Train the model

In [79]:
def predict_price(new_value, similarity_metric_column, data, target_column):
    # calculate similarity metric
    # train_data = data.copy()
    train_data[similarity_metric_column] = train_data[feature_column].apply(lambda x: abs(x - new_value))
    
    # sort by similarity metric
    train_data.sort_values(by=[similarity_metric_column], inplace=True)
    train_data.reset_index(drop=True, inplace=True)
    
    # get mean of k
    predicted_price = train_data.loc[0:k-1, target_column].mean()
    
    
    return predicted_price

In [91]:
def get_name_for_predicted_column(target_column, feature_name):
    return target_column + '_predicted_' + feature_name

In [92]:
k = 5
target_column = 'price'

features = ['bathrooms', 'accommodates']

for feature in features:
    print('using ' + feature)
    
    predicted_column = get_name_for_predicted_column(target_column, feature)
    
    test_data[predicted_column] = test_data[feature_column].apply(
        predict_price
        , args=('euclidean_distance', train_data, target_column)
    )

using bathrooms
using accommodates


In [93]:
test_data.head()

Unnamed: 0,price,accommodates,bathrooms,price_predicted_bathrooms,price_predicted_accommodates
424,329.0,9,2.5,293.4,277.6
3478,94.0,3,1.0,148.8,212.2
812,72.0,2,1.0,88.8,120.6
534,45.0,2,1.0,122.6,117.6
2997,145.0,3,1.0,134.0,149.0


## Evaluate
* MAE
* MSE

In [105]:
evaluations = pd.DataFrame()

for idx, feature_name in enumerate(features):
    actual_values = test_data[target_column]
    predicted_values = test_data[get_name_for_predicted_column(target_column, feature_name)]
    
    
    mse = ((actual_values - predicted_values)**2).mean()
    evaluations.loc[idx, 'feature'] = feature_name
    evaluations.loc[idx, 'mae'] = abs(actual_values - predicted_values).mean()
    evaluations.loc[idx, 'mse'] = mse
    evaluations.loc[idx, 'rmse'] = math.sqrt(mse)