# AirBnB listings from DC area

In [141]:
import pandas as pd
import numpy as np

In [142]:
dc_listings = pd.read_csv("dc_airbnb.csv")

In [143]:
stripped_commas = dc_listings["price"].str.replace(",", "") #get rid of commas
stripped_dollars = stripped_commas.str.replace("$", "") #get rid of dollar signs
dc_listings["price"] = stripped_dollars.astype("float") #convert string to float
#np.random.permutation(int n) returns an array of n randomly-ordered integers
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))] 

## Using K-Nearest Neighbors to predict the price of AirBnb listing based on its attributes
<br>
-find a few similar listings <br>
-calculate the average price of those listings <br>
-set the average price as the price for our listing

## 1. Without Scikit Learn

1. calculates distance by taking the absolute difference between the accommodates value and new_listing <br>
2. sorts the dataframe in ascending order of distance <br>
3. takes the average of first 5 listings' price (k=5)

In [144]:
def predict_price(new_listing):
    temp_df = dc_listings.copy()
    temp_df["distance"] = temp_df["accommodates"].apply(lambda x: np.abs(new_listing - x))
    temp_df = temp_df.sort_values("distance", ascending=True)
    suggested_price = temp_df.head(5)["price"].mean()
    return(suggested_price)

In [145]:
accommodates_one = predict_price(1)
accommodates_two = predict_price(2)
accommodates_three = predict_price(3)

In [146]:
print(accommodates_one)
print(accommodates_two)
print(accommodates_three)

57.8
113.6
205.6


### Error metrics
Mean Squared Error: (actual1 - predicted1)^2 + (actual2 - predicted2)^2 + .... / n <br>
Root Mean Squared Error: np.sqrt(mean_squared_error)

In [147]:
train_df = dc_listings.iloc[0:2792]
test_df = dc_listings.iloc[2792:]

def predict_price(new_listing):
    temp_df = train_df.copy()
    temp_df["distance"] = temp_df["accommodates"].apply(lambda x: x-new_listing)
    temp_df.sort_values("distance")
    return temp_df.head(5)["price"].mean()
test_df["predicted_prices"] = test_df["accommodates"].apply(lambda x: predict_price(x))
test_df["difference"] = test_df["price"] - test_df["predicted_prices"]
mean_squared_error = test_df["difference"].apply(lambda x: x**2).mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [148]:
mean_squared_error

21887.090053705582

In [149]:
root_mean_squared_error = np.sqrt(mean_squared_error)

In [150]:
root_mean_squared_error

147.94286077302135

## With Scikit Learn
### Using KNeighborsRegressor from neighbors family

In [151]:
from sklearn.neighbors import KNeighborsRegressor

### Remove columns whose values 
1. contain non-numerical values <br>
2. contain non-ordinal values <br>
3. contain too many missing values

### Remove rows who have missing values for the rest of the columns

In [152]:
dc_listings = dc_listings.drop(["room_type", "city", "state", "latitude", "longitude", "zipcode", "host_response_rate", "host_acceptance_rate", "host_listings_count"], axis=1)

In [153]:
dc_listings = dc_listings.drop(["cleaning_fee", "security_deposit"], axis=1)

In [154]:
dc_listings = dc_listings.dropna(axis=0)

 ### Normalize all the columns except for the target column (price column)
x - x.mean() / x.std()

In [155]:
normalized_listings = (dc_listings - dc_listings.mean()) / dc_listings.std()

In [156]:
normalized_listings["price"] = dc_listings["price"]

### Split into train_df and test_df using train_test_split

In [157]:
from sklearn.model_selection import train_test_split

In [158]:
columns = list(normalized_listings.columns)
columns.remove("price")
X = normalized_listings[columns]

In [159]:
Y = normalized_listings["price"]

In [160]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

### Following the general scikit-learn workflow <br>
1. instantiate the specific machine learning model you want to use
2. fit the model to the training data
3. use the model to make predictions
4. evaluate the accuracy of the predictions

In [161]:
knn = KNeighborsRegressor()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)

### Error Metrics using Sklearn.metrics

In [162]:
from sklearn.metrics import mean_squared_error

In [163]:
mse = mean_squared_error(predictions, Y_test)

In [164]:
rmse = np.sqrt(mse)

In [165]:
print(mse)
print(rmse)

18340.4812704
135.427033012


## Hyperparameter Optimization
Hyperparameters: values that affect the behavior and performance of a model that are unrelated to the data used (ex: k value in KNeighbors Regressor) <br>
Using grid search: <br>
1. select a subset of the possible hyperparameter values <br>
2. train a model using each of these hyperparameter values <br>
3. evaluate each model's performance <br>
4. select the hyperparameter value that results in the lowest error value 

In [187]:
hyperparameters = [x for x in range(1,21)]
mse_values = list()
features = ["accommodates", "bedrooms", "bathrooms", "number_of_reviews"]

In [188]:
X_hp = normalized_listings[features]
Y_hp = normalized_listings["price"]
X_train_hp, X_test_hp, Y_train_hp, Y_test_hp = train_test_split(X_hp, Y_hp, test_size=0.3) 

In [189]:
for param in hyperparameters:
    knn = KNeighborsRegressor(algorithm="brute", n_neighbors=param)
    knn.fit(X_train_hp, Y_train_hp)
    predictions = knn.predict(X_test_hp)
    mse_values.append(mean_squared_error(predictions, Y_test_hp))

In [190]:
print(np.argmin(mse_values)) #error becomes lowest when k = 5
print(np.min(mse_values))

19
12466.1928652


In [191]:
mse_values

[25175.951905626134,
 20506.639065335752,
 16752.028332325066,
 14199.467729128857,
 13872.252849364791,
 13543.392039725752,
 13664.387625467611,
 13481.72260662432,
 13357.099471219557,
 12981.378393829404,
 12691.601378410405,
 12633.646116908651,
 12611.073191292859,
 12711.540927441756,
 12691.238552127446,
 12662.408387448957,
 12666.469608575788,
 12543.526548251211,
 12515.786708628482,
 12466.192865245008]

### K-fold Cross Validation
1. Split the full dataset into k equal length partitions <br>
2. Select the k-1 partitions as training set and the remaining 1 partition as test set <br>
3. Train the model on the training set
4. Use the trained model to predict labels on the test fold
5. Compute the test fold's error metric
6. Repeat the steps k-1 times until each partition has been used as the test set
7. calculate the mean of the k error values

### Hold-out validation
= 2-fold cross validation

In [192]:
from sklearn.model_selection import cross_val_score, KFold

In [198]:
knn = KNeighborsRegressor()
kf = KFold(n_splits=10, shuffle=True) #usually k=10 is the standard k value
mses = cross_val_score(estimator = knn, X=normalized_listings[features], y=normalized_listings["price"], scoring="neg_mean_squared_error", cv=kf)
rmses = np.sqrt(np.abs(mses))
avg_rmse = rmses.mean()

In [199]:
avg_rmse

114.62880951226593