# Preprocessing and Modelling

In [12]:
# import all libraries here

import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

In [13]:
# loading the visualisation dataframe
# (joblib dump read from a path relative to the notebook's working directory)

df_vis = joblib.load('../data/jlib_files/dataframes/complete_df_2_modelling.jlib')

In [14]:
# loading the adapted dataframe for modelling
# (joblib dump read from a path relative to the notebook's working directory;
# this is the frame every feature matrix below is derived from)

df = joblib.load('../data/jlib_files/dataframes/modelling_df_2_modelling.jlib')

# Without text variables

### Ridge, Lasso and Elastic Net

In [15]:
# Free-text columns are left out of this first feature set.
text_columns = ['name', 'description', 'neighborhood_overview', 'host_about']

# Features: a new frame with the text columns removed (df itself is untouched).
X = df.drop(columns=text_columns)

# Target: price, removed from the feature frame.
y = X.pop('price')

In [20]:
# setting up train and test split
# 20% of the rows are held out for testing; rows are shuffled with a fixed random_state.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, shuffle=True)

In [21]:
# Display the names of the feature columns left in X.
X.columns

Index(['host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'instant_bookable', 'calculated_host_listings_count',
       'property_type_basic', 'air_conditioning', 'bbq', 'baby_facilities',
       'balcony_or_patio', 'bath', 'bed_linen', 'cable_tv', 'child_friendly',
       'coffee_maker', 'cooking_facilities', 'dishwasher', 'garden',
       'has_workspace', 'host_greets_you', 'long_term_stays',
       'luggage_dropoff', 'lock_on_bedroom_door', 'luxury_facilities',
       'private_entrance', 'toiletries', 'tumble_dryer', 'tv',
       'distance_from_center', 'station_distance', 'mean_monthly_rent',
       'serviced_property', 'description_provided', 'host_about_provided',
       'name_length', 'description_length', 'neighborhood_overview_length',
       'host_about_length'],
      dtype='object')

In [41]:
# Preprocessing + ElasticNetCV pipeline for the feature set without text columns.

# Columns to one-hot encode; all remaining columns are passed through unchanged.
categorical_variables = [
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'neighbourhood',
    'room_type',
    'property_type_basic',
]

# Dense one-hot encoding with drop='first'.
one_hot = OneHotEncoder(sparse=False, drop='first')
col_trans = ColumnTransformer(
    transformers=[('dummy', one_hot, categorical_variables)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Standardise (centre and scale) every column produced by the column transformer.
scaler = StandardScaler(with_mean=True, with_std=True)

# Search grid: 10 log-spaced alphas from 1e-4 to 1e4 and 8 l1 ratios, 5-fold CV.
alpha_grid = np.logspace(-4, 4, 10)
l1_ratio_grid = np.array([0.00001, .1, .5, .7, .9, .95, .99, 1])
model = ElasticNetCV(alphas=alpha_grid, l1_ratio=l1_ratio_grid, cv=5, max_iter=100000)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [42]:
# Fit the ElasticNetCV pipeline on the training split.
pipe.fit(X_train, y_train)

# Each entry pairs an output template with the value it reports.
# The first value is pipe.score evaluated on the training split.
score_report = [
    ("Elastic CV Score: {}", pipe.score(X_train, y_train)),
    ("Best Alpha: {}", model.alpha_),
    ("Best l1_ratio: {}", model.l1_ratio_),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

Elastic CV Score: 0.4078694938065287
Best Alpha: 0.046415888336127774
Best l1_ratio: 0.99


In [44]:
# RidgeCV over 10 log-spaced alphas (1e-4 to 1e4) with 5-fold CV,
# reusing the existing column transformer and scaler objects.
ridge_alphas = np.logspace(-4, 4, 10)
model = RidgeCV(alphas=ridge_alphas, cv=5)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])
pipe.fit(X_train, y_train)

# The reported score is pipe.score evaluated on the training split.
train_score = pipe.score(X_train, y_train)
print("RidgeCV Score: {}".format(train_score))
print("Best Alpha: {}".format(model.alpha_))

RidgeCV Score: 0.4079238913860872
Best Alpha: 166.81005372000558


In [45]:
# LassoCV over 10 log-spaced alphas (1e-4 to 1e4) with 5-fold CV,
# reusing the existing column transformer and scaler objects.
model = LassoCV(alphas=np.logspace(-4, 4, 10), cv=5)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

pipe.fit(X_train, y_train)

# The reported score is pipe.score evaluated on the training split.
# The label names the estimator actually fitted here (LassoCV, not ElasticNetCV).
print("LassoCV Score: {}".format(pipe.score(X_train, y_train)))
print("Best Alpha: {}".format(model.alpha_))

Elastic CV Score: 0.4078708872518245
Best Alpha: 0.046415888336127774


Refitting a plain ElasticNet using the parameters obtained from the ElasticNetCV search.

In [None]:
# Pipeline that refits a plain ElasticNet with the hyper-parameters selected by
# the ElasticNetCV search.

# Categorical columns of X to one-hot encode. These must be columns that exist
# in X, otherwise the ColumnTransformer cannot select them.
categorical_variables = ['host_response_time', 'host_response_rate', 'host_acceptance_rate',
                         'neighbourhood', 'room_type', 'property_type_basic']

# Dense one-hot encoding; categories unseen during fit are ignored at transform time.
one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')
scaler = StandardScaler(with_mean=True, with_std=True)

col_trans = ColumnTransformer(
    [('dummy', one_hot, categorical_variables)],
    remainder='passthrough',
    sparse_threshold=0)

# alpha and l1_ratio are the "Best Alpha" / "Best l1_ratio" values printed by
# the ElasticNetCV search cell.
model = ElasticNet(alpha=0.046415888336127774, l1_ratio=0.99, max_iter=10000)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Predicted (y axis) against actual (x axis) price on the training split.
fig, ax = plt.subplots(figsize=(12, 12))
train_predictions = pipe.predict(X_train)
sns.scatterplot(x=y_train, y=train_predictions, color='b', ax=ax)

# Red reference diagonal spanning the full price range found in df.
price_min, price_max = df.price.min(), df.price.max()
ax.plot([price_min, price_max], [price_min, price_max], lw=2, c='r')
plt.show()

### Linear Regression with Polynomial Features

In [46]:
from sklearn.preprocessing import PolynomialFeatures

In [47]:
# Polynomial feature transformer of degree 2, with the bias term included.
pf = PolynomialFeatures(degree=2, include_bias=True)

In [48]:
# Pipeline with a polynomial expansion between the column transformer and the scaler.
# Every step is a (name, estimator) tuple.
# NOTE(review): `model` is whichever estimator was bound most recently; the
# execution counts suggest the LassoCV from In [45] — confirm this is intended.
pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('pf', pf),
    ('scaler', scaler),
    ('model', model),
])

In [49]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


KeyboardInterrupt: 

In [None]:
# Predicted (y axis) against actual (x axis) price on the training split,
# coloured by tfl_zone, with a red diagonal over df's price range.
# NOTE(review): the X.columns listing recorded earlier in this notebook has no
# 'tfl_zone' column, so X_train.tfl_zone may fail here — confirm which feature
# frame this cell is meant to run against.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(y=pipe.predict(X_train), x=y_train,hue=X_train.tfl_zone, color='b', ax=ax)
ax.plot([df.price.min(), df.price.max()], [
        df.price.min(), df.price.max()], lw=2, c='r')
plt.show()

In [None]:
# Predicted (y axis) against actual (x axis) price on the training split,
# coloured by room_type.
fig, ax = plt.subplots(figsize=(12, 12))
train_predictions = pipe.predict(X_train)
sns.scatterplot(x=y_train, y=train_predictions, hue=X_train.room_type, color='b', ax=ax)

# Red reference diagonal spanning the full price range found in df.
price_min, price_max = df.price.min(), df.price.max()
ax.plot([price_min, price_max], [price_min, price_max], lw=2, c='r')
plt.show()

In [None]:
# Predicted (y axis) against actual (x axis) price on the test split,
# coloured by tfl_zone, with a red diagonal over df's price range.
# NOTE(review): the X.columns listing recorded earlier in this notebook has no
# 'tfl_zone' column, so X_test.tfl_zone may fail here — confirm which feature
# frame this cell is meant to run against.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(y=pipe.predict(X_test), x=y_test,hue=X_test.tfl_zone, color='b', ax=ax)
ax.plot([df.price.min(), df.price.max()], [
        df.price.min(), df.price.max()], lw=2, c='r')
plt.show()

In [None]:
# Predicted (y axis) against actual (x axis) price on the test split,
# coloured by room_type.
fig, ax = plt.subplots(figsize=(12, 12))
test_predictions = pipe.predict(X_test)
sns.scatterplot(x=y_test, y=test_predictions, hue=X_test.room_type, color='b', ax=ax)

# Red reference diagonal spanning the full price range found in df.
price_min, price_max = df.price.min(), df.price.max()
ax.plot([price_min, price_max], [price_min, price_max], lw=2, c='r')
plt.show()

### Decision Tree Regressor

In [None]:
# Regression tree limited to depth 5, behind the existing column transformer and scaler.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree and report the same scores.
decision_tree = DecisionTreeRegressor(max_depth=5, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('decision_tree', decision_tree),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Random Forest Regressor

In [None]:
# Random forest of 100 trees (max depth 40) behind the existing column transformer and scaler.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=40, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('random_forest', random_forest),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

# All non-NLP features

In [None]:
# Display the neighborhood_overview text column of df.
df.neighborhood_overview

In [None]:
# Correlation of each column with price, as a one-column frame.
price_corr = df.corr()['price']
df_corr = pd.DataFrame(price_corr)

# Absolute value of the correlation, used only for ranking.
df_corr['av_correlation'] = price_corr.abs()

# Show the 50 columns most strongly correlated with price.
df_corr.sort_values('av_correlation', ascending=False).head(50)

In [None]:
# Display every column name of df as a plain list.
list(df.columns)

In [None]:
# Column groups used to build the feature matrix for this section.

# Columns dropped from the feature matrix (together with the text columns).
variables_discard = ['id','listing_url', 'latitude', 'longitude', 'has_availability', 'availability_60',
                     'availability_30', 'availability_90', 'availability_365', 'number_of_reviews',
                     'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review',
                     'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                      'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                     'review_scores_value','calculated_host_listings_count','calculated_host_listings_count_entire_homes',
                     'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms',
                     'reviews_per_month', 'nearest_station'
                     ]

# Free-text columns; dropped here and re-attached later for the text-vectorizer pipelines.
variables_nlp = ['name', 'description', 'neighborhood_overview', 'host_about']

# Numeric columns; only used in the column-count check that follows.
variables_continuous = ['host_since', 'host_listings_count', 'accommodates', 'bathrooms',
                       'bedrooms', 'beds', 'distance_from_center', 'station_distance',
                       'mean_monthly_rent', 'name_length', 'description_length', 'neighborhood_overview_length',
                       'host_about_length']

# Categorical columns passed to the one-hot encoder.
variables_dummify = ['neighbourhood', 'property_type', 'room_type', 'rail_network',
                    'tfl_zone', 'postcode']

In [None]:
# Number of columns in df, for comparison with the size of the variable groups.
len(df.columns)

In [None]:
# Total number of column names across the four variable groups.
len(variables_continuous+variables_discard+variables_dummify+variables_nlp) 

In [None]:
# Features: a new frame without the discarded and free-text columns (df itself is untouched).
excluded_columns = variables_discard + variables_nlp
X = df.drop(columns=excluded_columns)

# Target: price, removed from the feature frame.
y = X.pop('price')

### Transforming the data

In [None]:
# Preprocessing + ElasticNetCV pipeline over the non-text features.

# Dense one-hot encoding of the categorical columns; categories unseen during
# fit are ignored at transform time. All other columns are passed through.
one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')
col_trans = ColumnTransformer(
    transformers=[('dummy', one_hot, variables_dummify)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Standardise (centre and scale) every column produced by the column transformer.
scaler = StandardScaler(with_mean=True, with_std=True)

# Default search settings apart from the iteration cap.
model = ElasticNetCV(max_iter=10000)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# setting up train and test split
# 20% of the rows are held out for testing; rows are shuffled with a fixed random_state.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, shuffle=True)

In [None]:
# Fit the ElasticNetCV pipeline, then cross-validate the whole pipeline
# (5 folds) on the training split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# The first value is pipe.score evaluated on the training split.
print("Elastic CV Score: {}".format(pipe.score(X_train, y_train)))
print("Best Alpha: {}".format(model.alpha_))
print("Best l1_ratio: {}".format(model.l1_ratio_))

# Report the cross-validation result as well, so the nested CV computed above
# is not thrown away.
print("CV Scores: {}".format(cv_scores))
print("CV Mean Score: {}".format(cv_scores.mean()))

In [None]:
# Pipeline that refits a plain ElasticNet with the hyper-parameters found by
# the search currently bound to `model`.

# Read the selected hyper-parameters before `model` is rebound below.
# A fitted search exposes alpha_ / l1_ratio_. If this cell is re-run, `model`
# is already the plain ElasticNet, which only has alpha / l1_ratio, so fall
# back to those and keep the cell re-runnable.
if hasattr(model, 'alpha_'):
    best_alpha, best_l1_ratio = model.alpha_, model.l1_ratio_
else:
    best_alpha, best_l1_ratio = model.alpha, model.l1_ratio

# Dense one-hot encoding; categories unseen during fit are ignored at transform time.
one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')
scaler = StandardScaler(with_mean=True, with_std=True)

col_trans = ColumnTransformer(
    [('dummy', one_hot, variables_dummify)],
    remainder='passthrough',
    sparse_threshold=0)

# Carry over both selected values, not just alpha.
model = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, max_iter=10000)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Decision Tree Regressor

In [None]:
# Regression tree limited to depth 5, behind the existing column transformer and scaler.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree and report the same scores.
decision_tree = DecisionTreeRegressor(max_depth=5, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('decision_tree', decision_tree),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Random Forest Regressor

In [None]:
# Random forest of 100 trees (max depth 50) behind the existing column transformer and scaler.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=50, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('random_forest', random_forest),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

# Four predicted-vs-actual scatter plots: training then test split, each
# coloured first by tfl_zone and then by room_type.
plot_specs = [
    (X_train, y_train, 'tfl_zone'),
    (X_train, y_train, 'room_type'),
    (X_test, y_test, 'tfl_zone'),
    (X_test, y_test, 'room_type'),
]
for features, target, hue_column in plot_specs:
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(y=pipe.predict(features), x=target,
                    hue=getattr(features, hue_column), color='b', ax=ax)
    # Red reference diagonal spanning the full price range found in df.
    price_min, price_max = df.price.min(), df.price.max()
    ax.plot([price_min, price_max], [price_min, price_max], lw=2, c='r')
    plt.show()

## Including NLP


### Count Vectorizer

#### Linear Regression (Lasso)

In [None]:
from sklearn.feature_extraction import text

In [None]:
# scikit-learn's built-in English stop-word collection.
english_stop_words = text.ENGLISH_STOP_WORDS

# Stop-word list for the vectorizers: the built-in words plus the token 'null'.
custom_stop_words = list(english_stop_words) + ['null']

In [None]:
# Re-attach the free-text columns to the feature matrix so the vectorizers can
# use them. The column list is `variables_nlp`, the name under which it is
# defined in this notebook.
for column in variables_nlp:
    X[column] = df[column]

In [None]:
# setting up train and test split
# 20% of the rows are held out for testing; rows are shuffled with a fixed random_state.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, shuffle=True)

### Attempt 1

In [None]:
# Count-vectorizer + Lasso pipeline (unigrams, at most 4000 terms per text column).

cvec = CountVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 1),
    min_df=10,
    max_df=0.95,
    max_features=4000,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('cvec_name', cvec, 'name'),
    ('cvec_description', cvec, 'description'),
    ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
    ('cvec_host_about', cvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

model = Lasso(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Attempt 2

In [None]:
# Count-vectorizer + Lasso pipeline (1- to 3-grams, at most 4000 terms per text column).

cvec = CountVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 3),
    min_df=10,
    max_df=0.95,
    max_features=4000,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('cvec_name', cvec, 'name'),
    ('cvec_description', cvec, 'description'),
    ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
    ('cvec_host_about', cvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

model = Lasso(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Tf-idf Vectorizer

In [None]:
# Tf-idf + Lasso pipeline (unigrams, no max_features cap).

tvec = TfidfVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 1),
    min_df=10,
    max_df=0.95,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('tvec_name', tvec, 'name'),
    ('tvec_description', tvec, 'description'),
    ('tvec_neighbourhood_overview', tvec, 'neighborhood_overview'),
    ('tvec_host_about', tvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

model = Lasso(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds, n_jobs=-2) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=-2)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Tf-idf + Lasso pipeline (unigrams, at most 4000 terms per text column).

tvec = TfidfVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 1),
    min_df=10,
    max_df=0.95,
    max_features=4000,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('tvec_name', tvec, 'name'),
    ('tvec_description', tvec, 'description'),
    ('tvec_neighbourhood_overview', tvec, 'neighborhood_overview'),
    ('tvec_host_about', tvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

model = Lasso(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds, n_jobs=-2) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=-2)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Attempt 2

In [None]:
# Tf-idf + Lasso pipeline (1- and 2-grams, at most 4000 terms per text column).

tvec = TfidfVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.95,
    max_features=4000,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('tvec_name', tvec, 'name'),
    ('tvec_description', tvec, 'description'),
    ('tvec_neighbourhood_overview', tvec, 'neighborhood_overview'),
    ('tvec_host_about', tvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# This attempt uses a lower iteration cap (5000).
model = Lasso(max_iter=5000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds, n_jobs=-2) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=-2)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

## CountVectorizer Attempt 1 - higher max_features

In [None]:
# Count-vectorizer + Lasso pipeline (unigrams, at most 8000 terms per text column).

cvec = CountVectorizer(
    stop_words=custom_stop_words,
    ngram_range=(1, 1),
    min_df=10,
    max_df=0.95,
    max_features=8000,
)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
transformers = [
    ('cvec_name', cvec, 'name'),
    ('cvec_description', cvec, 'description'),
    ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
    ('cvec_host_about', cvec, 'host_about'),
    ('dummy', one_hot, variables_dummify),
]
col_trans = ColumnTransformer(transformers, remainder='passthrough')

# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

model = Lasso(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[
    ('col_trans', col_trans),
    ('scaler', scaler),
    ('model', model),
])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Attempt 1 - Decision Tree Regressor

In [None]:
# Count-vectorizer + decision-tree pipeline (unigrams, at most 4000 terms per text column).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree and report the same scores.
decision_tree = DecisionTreeRegressor(max_depth=10, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('decision_tree', decision_tree)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Count-vectorizer + decision-tree pipeline (unigrams, at most 8000 terms per text column).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=8000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree and report the same scores.
decision_tree = DecisionTreeRegressor(max_depth=10, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('decision_tree', decision_tree)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Attempt 1 - Random Forest Regressor

In [None]:
# Count-vectorizer + random-forest pipeline
# (unigrams, min_df=10, at most 4000 terms per text column; 100 trees, max depth 30).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Count-vectorizer + random-forest pipeline
# (unigrams, min_df=10, at most 4000 terms per text column; 100 trees, max depth 30).
# NOTE(review): these settings are identical to the preceding random-forest
# cell — confirm whether a different configuration was intended here.

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

### Tf-idf - Random Forest Regressor

In [None]:
# Tf-idf + random-forest pipeline
# (unigrams, at most 4000 terms per text column; 100 trees, max depth 60).

tvec = TfidfVectorizer(stop_words=custom_stop_words, ngram_range=(1, 1),
                       min_df=10, max_df=0.95, max_features=4000)
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('tvec_name', tvec, 'name'),
     ('tvec_description', tvec, 'description'),
     ('tvec_neighbourhood_overview', tvec, 'neighborhood_overview'),
     ('tvec_host_about', tvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=60, n_jobs=-2, random_state=1)

# The final step is the random forest built in this cell, not the `model`
# variable left over from an earlier cell.
pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds, n_jobs=-2) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=-2)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Count-vectorizer + random-forest pipeline
# (unigrams, min_df=10, at most 3000 terms per text column; 100 trees, max depth 30).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=3000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Count-vectorizer + random-forest pipeline
# (unigrams, min_df=10, at most 4000 terms per text column; 100 trees, max depth 50).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=50, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# Count-vectorizer + random-forest pipeline
# (unigrams, min_df=10, at most 2000 terms per text column; 100 trees, max depth 30).

cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=2000, ngram_range=(1, 1))
one_hot = OneHotEncoder(sparse=True, handle_unknown='ignore')
# Scale by standard deviation only (with_mean=False: no centring).
scaler = StandardScaler(with_mean=False, with_std=True)

# One vectorizer entry per text column plus one-hot encoding of the
# categoricals; every remaining column is passed through.
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are repeatable.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit on the training split and cross-validate (5 folds) on that same split.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Each entry pairs an output template with the value it reports.
score_report = [
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
]
for line_fmt, shown in score_report:
    print(line_fmt.format(shown))

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns (min_df raised to 50)
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=50, max_df=0.95,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# seeded so that score changes between experiments reflect the settings rather
# than run-to-run randomness of the forest
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns (min_df raised to 100)
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=100, max_df=0.95,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# seeded so that score changes between experiments reflect the settings rather
# than run-to-run randomness of the forest
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns (max_df lowered to 0.90)
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# seeded so that score changes between experiments reflect the settings rather
# than run-to-run randomness of the forest
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

### Best Score so far!

Grid search the model to get a better score (higher max_depth)

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns (unigrams and bigrams)
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 2))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# seeded so that score changes between experiments reflect the settings rather
# than run-to-run randomness of the forest
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns (bigrams only)
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(2, 2))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# seeded so that score changes between experiments reflect the settings rather
# than run-to-run randomness of the forest
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=30, n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)])

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds.
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

# Grid Search

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# hyper-parameters are supplied by the grid search; only the seed is fixed here
# so that candidates are compared on equal footing
random_forest = RandomForestRegressor(random_state=1)

# verbose=True makes the pipeline report progress for each step while fitting
pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('random_forest', random_forest)], verbose=True)

In [None]:
# Search space for the 'random_forest' pipeline step; the `<step>__<param>`
# keys are spelled as keyword arguments here.
params = dict(
    random_forest__n_estimators=[120],
    random_forest__max_depth=[30, 60],
    random_forest__min_samples_leaf=[1, 10],
    random_forest__max_samples=[None, 0.8],
    random_forest__max_features=[None, 0.8],
)

In [None]:
# 5-fold grid search over `params`, wrapping the whole pipeline.
gs_random_forest = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    n_jobs=-2,
    verbose=2,
)

In [None]:
# Run the grid search on the training split.
gs_random_forest.fit(X_train, y_train)

# Gather every metric first, then report them in a single pass.
grid_report = (
    ("Training Score: {}", gs_random_forest.score(X_train, y_train)),
    ("Test Score: {}", gs_random_forest.score(X_test, y_test)),
    ("CV Mean Score: {}", gs_random_forest.best_score_),
    ("Best Model Parameters: {}", gs_random_forest.best_params_),
)
for template, value in grid_report:
    print(template.format(value))

In [None]:
# Display the best estimator found by the grid search.
gs_random_forest.best_estimator_

In [None]:
import joblib

In [None]:
# joblib.dump(pipe, 'pipe_capstone_random_forest.jlib')

In [None]:
# joblib.dump(gs_random_forest.best_estimator_, 'grid_search_capstone_random_forest.jlib')

In [None]:
# Reload a previously saved model from disk.
# NOTE(review): this rebinds `gs_random_forest`, which was the GridSearchCV
# object above; the loaded object is accessed via `.named_steps` below, so it
# is presumably the saved best-estimator pipeline -- confirm. The file must
# already exist because the matching joblib.dump call is commented out.
gs_random_forest = joblib.load('grid_search_capstone_random_forest.jlib')

In [None]:
# Display the 'random_forest' step of the reloaded pipeline.
gs_random_forest.named_steps['random_forest']

## Warm Start Estimator Search

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# reuse the forest (and its tuned hyper-parameters) from the reloaded
# grid-search pipeline as the final step, under the name 'gs_random_forest'
pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('gs_random_forest', gs_random_forest.named_steps['random_forest'])],
                verbose=True)

In [None]:
# Search space for the 'gs_random_forest' pipeline step: number of trees only,
# with warm_start switched on.
params = dict(
    gs_random_forest__n_estimators=[100, 150, 200],
    gs_random_forest__warm_start=[True],
)

In [None]:
# 5-fold grid search over the number of estimators.
gs_random_forest_estimators = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    n_jobs=-2,
    verbose=2,
)

In [None]:
# Run the estimator-count search on the training split.
gs_random_forest_estimators.fit(X_train, y_train)

# Gather every metric first, then report them in a single pass.
estimator_report = (
    ("Training Score: {}", gs_random_forest_estimators.score(X_train, y_train)),
    ("Test Score: {}", gs_random_forest_estimators.score(X_test, y_test)),
    ("CV Mean Score: {}", gs_random_forest_estimators.best_score_),
    ("Best Model Parameters: {}", gs_random_forest_estimators.best_params_),
)
for template, value in estimator_report:
    print(template.format(value))

In [None]:
# joblib.dump(pipe, 'pipe_capstone_random_forest_estimator.jlib')

In [None]:
# joblib.dump(gs_random_forest_estimators.best_estimator_, 'grid_search_capstone_random_forest_estimator.jlib')

## Grid Search Linear Regression

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# Lasso with built-in cross-validated choice of alpha
model = LassoCV(max_iter=10000, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)])

In [None]:
# Fit the Lasso pipeline and report its training score and the selected alpha.
pipe.fit(X_train, y_train)

fitted_lasso = pipe.named_steps['model']

# The label previously said "Elastic CV", a leftover from the ElasticNetCV cell.
print("Lasso CV Training Score: {}".format(pipe.score(X_train, y_train)))
print("Best Alpha: {}".format(fitted_lasso.alpha_))
# No l1_ratio is reported: LassoCV has no `l1_ratio_` attribute (that belongs
# to ElasticNetCV), so accessing it raised AttributeError.

### Ridge

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.95,
                       max_features=4000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# RidgeCV accepts neither `max_iter` nor `random_state` (passing them raised a
# TypeError). An explicit log-spaced alpha grid replaces them, because the
# default grid only tries 0.1, 1.0 and 10.0.
model = RidgeCV(alphas=np.logspace(-2, 4, 13))

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)])

In [None]:
# Fit the Ridge pipeline and report its training score and the selected alpha.
pipe.fit(X_train, y_train)

fitted_ridge = pipe.named_steps['model']

# The label previously said "Elastic CV", a leftover from the ElasticNetCV cell.
print("Ridge CV Training Score: {}".format(pipe.score(X_train, y_train)))
print("Best Alpha: {}".format(fitted_ridge.alpha_))
# No l1_ratio is reported: RidgeCV has no `l1_ratio_` attribute (that belongs
# to ElasticNetCV), so accessing it raised AttributeError.

## Classification

## Re-running best-performing random forest model for analysis

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# take the forest out of the saved estimator-search pipeline; warm_start was
# switched on for that search, so it is turned off again before refitting here
model = joblib.load('grid_search_capstone_random_forest_estimator.jlib').named_steps['gs_random_forest']
model.set_params(warm_start=False, n_jobs=-2)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)], verbose=True)

In [None]:
# Display the hyper-parameters of the reloaded forest.
model.get_params()

In [None]:
# Fit the pipeline on the training split and cross-validate it with 5 folds
# (folds run in parallel with n_jobs=-3).
pipe.fit(X_train, y_train)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, n_jobs=-3)

# Gather every metric first, then report them in a single pass.
score_report = (
    ("Training Score: {}", pipe.score(X_train, y_train)),
    ("Test Score: {}", pipe.score(X_test, y_test)),
    ("CV Scores: {}", cv_scores),
    ("CV Mean Score: {}", cv_scores.mean()),
)
for template, value in score_report:
    print(template.format(value))

In [None]:
# need to get feature names out of the pipe

In [None]:
# Number of features produced by the fitted column transformer.
fitted_col_trans = pipe.named_steps['col_trans']
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older installations.
if hasattr(fitted_col_trans, 'get_feature_names_out'):
    feature_names = fitted_col_trans.get_feature_names_out()
else:
    feature_names = fitted_col_trans.get_feature_names()
len(feature_names)

In [None]:
# Display the columns handed to the one-hot encoder in the column transformer.
variables_dummify

In [None]:
# Bar chart of the 50 largest feature importances of the fitted forest.
fitted_col_trans = pipe.named_steps['col_trans']
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older installations.
if hasattr(fitted_col_trans, 'get_feature_names_out'):
    feature_names = fitted_col_trans.get_feature_names_out()
else:
    feature_names = fitted_col_trans.get_feature_names()

feature_importances = pd.DataFrame(pipe.named_steps['model'].feature_importances_,
                                   columns=['importance'],
                                   index=feature_names)
top_importances = feature_importances.sort_values(by='importance', ascending=False).iloc[:50]

importance_ax = top_importances.plot(kind='barh', figsize=(8, 14))
importance_ax.set(xlabel='Importance', ylabel='Feature',
                  title='Top 50 feature importances')
plt.show()

In [None]:
# Listings whose name contains the text '-55%'.
df.loc[df['name'].str.contains('-55%')]

In [None]:
# Predicted vs. true price on the test split, coloured by room type (left)
# and by TfL zone (right).

# Predict once and reuse it for both panels instead of re-running the whole
# pipeline for each scatter plot.
test_predictions = pipe.predict(X_test)
# End points of the red "perfect prediction" diagonal.
price_range = [df.price.min(), df.price.max()]

fig, ax = plt.subplots(ncols=2, figsize=(20, 8), sharey=True)

sns.scatterplot(y=test_predictions, x=y_test, hue=X_test.room_type, ax=ax[0])
ax[0].plot(price_range, price_range, lw=2, c='r')
ax[0].set(xlabel="True Price", ylabel="Predicted Price")

sns.scatterplot(y=test_predictions, x=y_test, hue=X_test.tfl_zone, color='g', palette='dark', ax=ax[1])
ax[1].plot(price_range, price_range, lw=2, c='r')
ax[1].set(xlabel="True Price", ylabel="Predicted Price")

fig.suptitle('Comparison of predicted results in the test set with true values', fontsize=15)
fig.tight_layout()

plt.show()

In [None]:
# Test features augmented with true price, predicted price and residuals.

# Predict once; previously the pipeline was re-run for each derived column.
test_predictions = pipe.predict(X_test)

X_test_predictions = X_test.copy()

X_test_predictions['true_price'] = y_test
X_test_predictions['predicted_price'] = test_predictions
# residual = predicted - true, so positive values are over-predictions
X_test_predictions['residual_values'] = test_predictions - y_test
X_test_predictions['abs_residual_values'] = X_test_predictions['residual_values'].abs()

In [None]:
# List the column names of the test feature set.
list(X_test.columns)

In [None]:
# Inspect the single row with index label 68934.
# NOTE(review): hard-coded label, presumably one of the high-residual listings -- confirm.
df.loc[68934]

In [None]:
# The ten test listings with the largest absolute residual.
columns_of_interest = [
    'accommodates', 'neighbourhood', 'tfl_zone',
    'name', 'description', 'host_about',
    'true_price', 'predicted_price',
    'residual_values', 'abs_residual_values',
]
ranked_by_error = X_test_predictions[columns_of_interest].sort_values(
    'abs_residual_values', ascending=False)
ranked_by_error.head(10)

In [None]:
# Mean price over the whole modelling dataframe.
df.price.mean()

In [None]:
# Display `amenities_values`.
# NOTE(review): defined outside this part of the notebook -- confirm it exists on a fresh run.
amenities_values

In [None]:
# Look up the listing(s) with this exact name.
df.loc[df['name'].eq('MAYFAIR HOUSE - DELUXE & MODERN')]

In [None]:
# Look up the listing(s) with this exact name.
df.loc[df['name'].eq('Cosy home in seven sisters')]

In [None]:
# Look up the listing(s) with this exact name.
df.loc[df['name'].eq('-55% Vibrant Studio Near Holborn Tube Station')]

In [None]:
# Plotting the number of listings in each borough
fig1, ax1 = plt.subplots(1, figsize=(15, 6))
neighbourhood_map_df.plot(column='number_of_listings', cmap='Reds', ax=ax1)
ax1.axis('off')
ax1.set_title('Number of Airbnb listings in each London neighbourhood', fontsize=14)
sm = plt.cm.ScalarMappable(cmap='Reds', norm=plt.Normalize(vmin=0, vmax=9000))
sm._A = [] # Creates an empty array for the data range
cbar = fig1.colorbar(sm)
plt.show()

# Plotting the mean price of listings in each borough
fig2, ax2 = plt.subplots(1, figsize=(15, 6))
neighbourhood_map_df.plot(column='mean_price', cmap='Greens', ax=ax2)
ax2.axis('off')
ax2.set_title('Mean price of Airbnb listings in each London neighbourhood', fontsize=14)
sm = plt.cm.ScalarMappable(cmap='Greens', norm=plt.Normalize(vmin=min(neighbourhood_map_df.mean_price), vmax=max(neighbourhood_map_df.mean_price)))
sm._A = [] # Creates an empty array for the data range
cbar = fig2.colorbar(sm)
plt.show()

In [None]:
# Attach each test listing's coordinates (matched on the index) to its predictions.
listing_coordinates = df.loc[:, ['longitude', 'latitude']]
test_long_lat = X_test_predictions.join(listing_coordinates, how='left')

In [None]:
# Display the test predictions joined with their coordinates.
test_long_lat

In [None]:
# `plt.scatter()` with no arguments always raises a TypeError (x and y are
# required). Plot the test listings by location, coloured by absolute residual.
plt.scatter(test_long_lat['longitude'], test_long_lat['latitude'],
            c=test_long_lat['abs_residual_values'], s=5)
plt.colorbar(label='Absolute residual')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Test-set listings coloured by absolute residual')
plt.show()

In [None]:
# Absolute residuals of the test set in ascending order.
abs_residuals = X_test_predictions['abs_residual_values']
abs_residuals.sort_values()

In [None]:
# Map of the test listings coloured by absolute residual.
plt.figure(figsize=(10,10))

cmap = sns.cubehelix_palette(as_cmap=True)

# Longitude goes on the x-axis and latitude on the y-axis so the scatter has
# the orientation of a map (the axes were previously swapped).
sns.scatterplot(x='longitude', y='latitude', data=test_long_lat, hue='abs_residual_values', palette=cmap)

plt.title('Absolute residuals of test-set predictions by location', size=15)
plt.show()

In [None]:
# Predicted vs. true price on the training split, coloured by room type
# (left) and by TfL zone (right).

# Predict once and reuse it for both panels instead of re-running the whole
# pipeline for each scatter plot.
train_predictions = pipe.predict(X_train)
# End points of the red "perfect prediction" diagonal.
price_range = [df.price.min(), df.price.max()]

fig, ax = plt.subplots(ncols=2, figsize=(20, 8), sharey=True)

sns.scatterplot(y=train_predictions, x=y_train, hue=X_train.room_type, ax=ax[0])
ax[0].plot(price_range, price_range, lw=2, c='r')
ax[0].set(xlabel="True Price", ylabel="Predicted Price")

sns.scatterplot(y=train_predictions, x=y_train, hue=X_train.tfl_zone, color='g', palette='dark', ax=ax[1])
ax[1].plot(price_range, price_range, lw=2, c='r')
ax[1].set(xlabel="True Price", ylabel="Predicted Price")

fig.suptitle('Comparison of predicted results in the training set with true values', fontsize=15)
fig.tight_layout()

plt.show()

In [None]:
# Signed prediction error on the test split (predicted minus true price).
test_set_predictions = pipe.predict(X_test)
residuals = test_set_predictions - y_test

In [None]:
# Histogram of the test-set residuals.
residual_fig, residual_ax = plt.subplots(figsize=(10, 8))

residual_ax.hist(residuals, bins=50)
residual_ax.tick_params(axis='x', labelrotation=45)
residual_ax.set_title('Residuals Plot', size=15)
plt.show()

In [None]:
# setting up the pipeline to transform the data

# instantiating my transformers

# bag-of-words counts for the free-text columns
cvec = CountVectorizer(stop_words=custom_stop_words, min_df=10, max_df=0.90,
                       max_features=3000, ngram_range=(1, 1))
# `sparse=True` dropped: sparse output is the encoder default, and the keyword
# was renamed to `sparse_output` in scikit-learn 1.2 (old spelling later removed)
one_hot = OneHotEncoder(handle_unknown='ignore')
# scale by standard deviation only (no centering)
scaler = StandardScaler(with_mean=False, with_std=True)

# one count-vectoriser entry per text column, `variables_dummify` is one-hot
# encoded and every remaining column is passed through unchanged
col_trans = ColumnTransformer(
    [('cvec_name', cvec, 'name'),
     ('cvec_description', cvec, 'description'),
     ('cvec_neighbourhood_overview', cvec, 'neighborhood_overview'),
     ('cvec_host_about', cvec, 'host_about'),
     ('dummy', one_hot, variables_dummify)],
    remainder='passthrough')

# Fixes: `n_estimators: 150` was a syntax error (keyword arguments use `=`),
# and `n_jobs=0.2` is not a valid worker count; -2 matches the other forests
# in this notebook. The seed makes the fit reproducible.
model = RandomForestRegressor(n_estimators=150, max_depth=60, max_features=0.8,
                              n_jobs=-2, random_state=1)

pipe = Pipeline(steps=[('col_trans', col_trans),
                       ('scaler', scaler),
                       ('model', model)], verbose=True)

## Only looking at features (no review information)

This model will only include very basic predictor variables, to get an idea of how well this dataset performs at predicting property prices.

In [None]:
# # instantiating a new dataframe to only look at features

# df_features = df.copy()

In [None]:
# # columns to be removed

# columns_to_drop = null_values.index
# columns_to_drop

In [None]:
# # removing the review features from my dataframe

# df_features.drop(columns_to_drop,axis=1, inplace=True)

In [None]:
# X = df_features.copy()
# X.drop(['longitude', 'latitude'],axis=1,inplace=True)

# y = X.pop('price')

In [None]:
# X.head().T.iloc[:40]

In [None]:
# # columns to one-hot encode

# one_hot_columns = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',
#                    'property_type', 'room_type', 'has_availability', 'instant_bookable']

# # columns to countvectorize for NLP

# nlp_columns = ['name', 'description', 'neighborhood_overview', 'host_about']

### Structured Plan

Perform modelling on features without reviews first? Then model including reviews.

Capture metadata about the reviews? How many reviews are there, and over which timeframe?

- Create data dictionary - DONE
- Data Cleaning - DONE
- EDA - partial
- Feature Engineering + Further Data Cleaning - partial
- Linear Regression or Classification? - DONE
- Fit Model on Listings Dataset to Predict Prices - DONE
- Fit Model on Reviews Dataset to Predict Prices - DONE
- Combine Both to Predict Prices - DONE
- Visualise findings - use the Tableau location function
- Perform Clustering on the Reviews - what insights can we gather? Create word clouds
- Predict reviews based on NLP of reviews
- What are people looking for when they stay at an Airbnb?
- Which neighborhoods are the most popular? Which are the most expensive?
- Can we see any trends on where people like to stay?
- Are there other features that we can use from different datasets?

When transforming data, do the train/test split before transforming. This means that your model isn't already aware of words that appear in your test set. You need to turn off drop-first, though, and set the parameter to ignore any unknown words.

Notes:

- can we apply the model to other cities?

To-Do List

Data Cleaning:

- use median values rather than mean values (mean values will be swayed more by outliers)
- simplify the categorisation of the property type variable
- apply lower and higher limits to the price variable to deal with outliers
- simplify the amenities + host binarised variables
- create a new column to show the average property price for each host_id
- bring in geographical proximity of attractions as target variables

Variable Transformation:

- look at distributions of continuous/discrete variables - do they need transforming?
- look into log-transforming the continuous variables (naive-Bayes lessons)

Modelling:

- review the use of NLP - could we instead look at key words within the variables? This might be a better option for the title of the listing
- can we use neural networks?

good visualisations: https://towardsdatascience.com/predicting-airbnb-prices-with-deep-learning-part-2-how-to-improve-your-nightly-price-50ea8bc2bd29