### Import data

In [1]:
# scikit-learn deprecated its vendored copy of joblib in 0.21 and removed it in
# 0.23, so prefer the standalone package and fall back to the vendored one only
# on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# Read the three pickled objects stored under ../Predicting_Airbnb_Prices into
# features_og, features_transformed and target (in that order).
# NOTE: joblib.load unpickles the files, so only load artifacts you trust.
features_og, features_transformed, target = (
    joblib.load(pickle_path)
    for pickle_path in (
        '../Predicting_Airbnb_Prices/features_og.pickle',
        '../Predicting_Airbnb_Prices/features_transformed.pickle',
        '../Predicting_Airbnb_Prices/target.pickle',
    )
)

In [3]:
# Sanity check: show the dimensions of each loaded object, one per line.
print(features_og.shape, features_transformed.shape, target.shape, sep='\n')

(31253, 11)
(31253, 18)
(31253,)


### 4. Train-test-split data

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_transformed,
    target,
    test_size=0.3,
    random_state=42,
)

### 5. Import algorithm

In [6]:
# Baseline benchmark: a DummyRegressor ignores the features and always predicts
# the mean of the training target (this is a regressor, so there is no
# "most frequent class"). Any real model should beat this baseline.
from sklearn.dummy import DummyRegressor
dc = DummyRegressor(strategy='mean')  # 'mean' is the default; spelled out for clarity
dc.fit(X_train, y_train)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [7]:
# Report the baseline's score() (R^2 for scikit-learn regressors) on the
# training split and on the held-out test split, one line each.
print(
    'Dummy Regressor train score: {}'.format(dc.score(X_train, y_train)),
    'Dummy Regressor test score: {}'.format(dc.score(X_test, y_test)),
    sep='\n',
)

Dummy Regressor train score: 0.0
Dummy Regressor test score: -0.00010857855372292669


In [8]:
# First real model: a k-nearest-neighbours regressor left at scikit-learn's
# defaults (k = 5 neighbours), fitted on the training split.
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor(n_neighbors=5)
knr.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [9]:
# Report the KNN model's score() on the training and test splits so the gap
# between the two (over-fitting) is visible at a glance.
print('\n'.join([
    'KNeighbors Regressor train score: {}'.format(knr.score(X_train, y_train)),
    'KNeighbors Regressor test score: {}'.format(knr.score(X_test, y_test)),
]))

KNeighbors Regressor train score: 0.5009947211744834
KNeighbors Regressor test score: 0.3818596671133175


In [10]:
# Use the fitted KNeighborsRegressor to predict a value for every row of
# X_test, then display the first five predictions.
test_preds = knr.predict(X_test)
test_preds[:5]

array([  55.8,   80.6,  330.4,   66. ,  127.2])

In [11]:
# Show the first five true target values from the test split so they can be
# compared with the model's predictions.
y_test.head(5)

13520     25
21367    115
27533    135
14999     69
24357    130
Name: price, dtype: int64

The mean_squared_error() function takes in two inputs:

* A list-like object, representing the true values.
* A second list-like object, representing the predicted values using the model.

The **root mean squared error (RMSE)** is the square root of the mean squared error. Because each error is squared before averaging, predictions that are far from the actual value are penalized much more heavily than those that are close to it. The larger the RMSE, the greater the error.
* The units for RMSE are the same as the value we are predicting, which makes it easy to understand the scale of our error.

In [37]:
# Measure the prediction error on the test set. The RMSE comes out at about $349.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, test_preds)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
print(rmse)

349.467685292
