In [None]:
import pandas as pd
import numpy as np
from src.helpers import build_data_frame
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
import shap
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build the listings table with the project helper imported from src.helpers.
listings = build_data_frame()

In [None]:
# Preview the first rows of the freshly loaded listings.
listings.head()

In [None]:
# Keep only the listings priced below 500; everything at or above is dropped.
listings = listings.loc[listings['price'].lt(500)]

In [None]:
# drop_cols = ['latitude', 
#              'longitude',
#              'neighbourhood_cleansed_Havering',
#              '"Espresso machine"',
#              'neighbourhood_cleansed_Bexley',
#              'neighbourhood_cleansed_Hillingdon',
#              '"Security system"',
#              '"Balcony"',
#              '"Dining area for 8 people"',
#              '"Terrace"',
#              '"Chef\'s kitchen"',
#              '"Ironing board"']
# for col in drop_cols:
#     try:
#         listings.pop(col)
#     except:
#         continue

In [None]:
# Preview the listings again after the price filter above.
listings.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) before modelling.
listings.info()

In [None]:
# Separate the target (price) from the features, working on a copy so that
# `listings` itself is left untouched.
listings_copy = listings.copy()
y = listings['price']
X = listings_copy.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    shuffle=True,
    random_state=0,
)
X_train.shape, X_test.shape

In [None]:
# Random forest baseline: 1000 unrestricted-depth trees, sqrt(n_features)
# candidates per split, trained on all available cores with a fixed seed.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_features='sqrt',
    max_depth=None,
    n_jobs=-1,
    random_state=1,
)

rf.fit(X_train, y_train)

In [None]:
# Feature importances reported by the fitted forest, in column order.
importance = rf.feature_importances_
# One line per feature: its positional index and its importance score.
for i, v in enumerate(importance):
    print(f'Feature: {i:0d}, Score: {v:.5f}')

In [None]:
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
# Score the random forest on the held-out rows and report the RMSE.
rf_test_pred = rf.predict(X_test)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
#print('Test MSE:', round(rf_test_mse, 2))
print('Test RMSE: $', round(np.sqrt(rf_test_mse), 2))

In [None]:
# rf 
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# Random forest: predicted price against the true price, one dot per test row.
fig, ax = plt.subplots()
ax.plot(y_test, rf_test_pred, marker='.', linestyle='None')
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')

In [None]:
# Random forest: actual vs. predicted price with the identity line as reference.
plt.figure(figsize=(10, 8))
plt.plot([0, 400], [0, 400], color='navy')
plt.scatter(rf_test_pred, y_test, color='deepskyblue', alpha=0.4)
sns.despine()
plt.xticks(np.arange(0, 500, 100))
plt.xlabel('Predicted Price', fontsize=15)
plt.ylabel('Actual Price', fontsize=15)
plt.title('Random Forest', fontsize=18);

In [None]:
# Base estimator for the grid search; its hyper-parameters come from `parameters` below.
xgb1 = XGBRegressor()
# Hyper-parameter grid explored by the grid search. Single-element lists pin a
# setting to one value, the others span the candidates to compare.
parameters = {'nthread': [4],  # with hyper-threading enabled, xgboost may become slower
              'objective': ['reg:squarederror'],
              'learning_rate': [.02, .03],  # the so-called `eta` value
              'max_depth': [6, 7, 8],
              'min_child_weight': [3, 4],
              # `silent` is deprecated/removed in XGBoost; `verbosity=0` is the
              # supported way to keep training quiet.
              'verbosity': [0],
              'subsample': [0.6, 0.7],
              'colsample_bytree': [0.6, 0.7],
              'n_estimators': [300, 500]}


In [None]:
# Exhaustive search over `parameters` with 2-fold cross-validation on all cores.
# No `scoring` is passed, so the estimator's own score method ranks candidates.
xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
)

In [None]:
# xgb_grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the winning parameter set.
# `best_score_` / `best_params_` only exist after `xgb_grid.fit(...)` has run;
# the fit is currently commented out, so guard against the AttributeError
# that would otherwise break a top-to-bottom run of the notebook.
if hasattr(xgb_grid, 'best_params_'):
    print(xgb_grid.best_score_)
    print(xgb_grid.best_params_)
else:
    print('Grid search has not been fitted; run xgb_grid.fit(X_train, y_train) first.')

In [None]:
# XGBoost regressor with hand-picked hyper-parameters.
# `verbosity=0` replaces the deprecated/removed `silent=1` flag.
gbm = xgb.XGBRegressor(colsample_bytree=0.7,
                       n_estimators=500,
                       max_depth=7,
                       objective="reg:squarederror",
                       learning_rate=.03,
                       subsample=0.8,
                       min_child_weight=4,
                       nthread=4,
                       verbosity=0)

# Early stopping selects the number of boosting rounds from the LAST entry of
# `eval_set`. Monitoring the test set there would leak test information into
# model selection and make the reported test error optimistic, so a validation
# split is carved out of the training data instead. The fixed seed keeps the
# split reproducible.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train,
                                              test_size=.2,
                                              shuffle=True,
                                              random_state=0)

eval_set = [(X_fit, y_fit), (X_val, y_val)]

# NOTE(review): newer XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() - confirm
# against the installed version.
xgb_reg = gbm.fit(
                    X_fit, y_fit,
                    eval_set=eval_set,
                    eval_metric='rmse',
                    early_stopping_rounds=20)

In [None]:
# Score the boosted model on the held-out rows: MSE and its square root.
xgb_test_pred = xgb_reg.predict(X_test)
xgb_test_mse = mean_squared_error(y_test, xgb_test_pred)
print('Test MSE:', round(xgb_test_mse, 4))
print('Test RMSE:', round(np.sqrt(xgb_test_mse), 4))

In [None]:
# XGBoost: predicted price against the true price, one dot per test row.
fig, ax = plt.subplots()
ax.plot(y_test, xgb_test_pred, marker='.', linestyle='None')
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')

In [None]:
# XGBoost: actual vs. predicted price with the identity line as reference.
plt.figure(figsize=(10, 8))
plt.plot([0, 400], [0, 400], color='navy')
plt.scatter(xgb_test_pred, y_test, color='deepskyblue', alpha=0.4)
sns.despine()
plt.xticks(np.arange(0, 500, 100))
plt.xlabel('Predicted Price', fontsize=15)
plt.ylabel('Actual Price', fontsize=15)
plt.title('XGBoost', fontsize=18);

In [None]:
# Plain linear regression as a reference model.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on both splits (the training predictions are kept for inspection).
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)

# R^2 on the training rows and on the held-out rows.
lr_train_r2 = lr.score(X_train, y_train)
lr_test_r2 = lr.score(X_test, y_test)
print(f'Linear Regression train R^2: {lr_train_r2}')
print(f'Linear Regression val R^2: {lr_test_r2}')

# Error on the held-out rows; the training counterparts are left disabled.
lr_test_mse = mean_squared_error(y_test, lr_test_pred)
#print("\nTraining MSE:", round(mean_squared_error(y_train, lr_train_pred),4))
print("\n \nTest MSE:", round(lr_test_mse, 4))

# print("\nTraining RMSE:", round(np.sqrt(mean_squared_error(y_train, lr_train_pred)),4))
print("Test RMSE:", round(np.sqrt(lr_test_mse), 4))

In [None]:
# Linear regression: actual vs. predicted price with the identity line as reference.
plt.figure(figsize=(10, 8))
plt.plot([0, 400], [0, 400], color='navy')
plt.scatter(lr_test_pred, y_test, color='deepskyblue', alpha=0.4)
sns.despine()
plt.xticks(np.arange(0, 500, 100))
plt.xlabel('Predicted Price', fontsize=15)
plt.ylabel('Actual Price', fontsize=15)
plt.title('Linear Regression', fontsize=18);
#plt.savefig('actual_plot', bbox_inches ='tight', dpi = 400)

In [None]:
# Pair every feature name with its fitted coefficient, ordered from the most
# negative to the most positive weight.
coef_list = sorted(zip(X_train.columns, lr.coef_), key=lambda pair: pair[1])

x_values = [name for name, _ in coef_list]
y_values = [weight for _, weight in coef_list]

# Horizontal bars: coefficient value on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.barplot(x=y_values, y=x_values, palette="husl", ax=ax)
ax.set_title('Regression Coefficients', fontsize=18)
ax.set_xlabel('Price', fontsize=15)
ax.set_ylabel('Features', fontsize=15)
#plt.savefig('features', bbox_inches ='tight', dpi = 400)

In [None]:
# Random forest feature importances with their spread across the individual trees.
# Re-read the importances here so the cell does not depend on an earlier cell.
importance = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
# Ascending order, so the most important feature ends up at the top of the chart.
indices = np.argsort(importance)
n_features = len(indices)

# Plot the feature importances of the forest.
# Fixes: seaborn has no `barh` (use the matplotlib Axes method), the stray
# barplot of the regression coefficients is removed, and the y ticks carry the
# feature names instead of bare column positions.
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title("Feature importances", size=18)
ax.barh(range(n_features), importance[indices],
        color="r", xerr=std[indices], align="center")
ax.set_yticks(range(n_features))
ax.set_yticklabels(X_train.columns[indices])
ax.set_ylim([-1, n_features])
ax.set_xlabel('Importance', size=15)
ax.set_ylabel('Features', size=15)
plt.show()

In [None]:
# Percentage of the listings priced below 150. The notebook never defines `df`;
# the frame in scope is `listings` (already restricted to prices below 500).
(listings['price'] < 150).mean() * 100