In [None]:
! pip install -r requirements.txt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import tree

# Data Prep for Modeling

In [None]:
# Read the gzip-compressed Hawaii and San Diego CSV extracts (paths are relative to
# the notebook's working directory), then preview the first five rows of each.
HI_mod, SD_mod = [
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in ("./Hawaii_nkm_v1.csv.gz", "./SanDiego_nkm_v1.csv.gz")
]

display(HI_mod.head(), SD_mod.head())

### Some Extra Feature Engineering and Data Re-Cleaning

In [None]:
# The eight season x day-type columns; 'combo' is their row-wise mean.
SEASON_DAYTYPE_COLS = ['Fall_weekday', 'Fall_weekend', 'Spring_weekday', 'Spring_weekend',
                       'Summer_weekday', 'Summer_weekend', 'Winter_weekday', 'Winter_weekend']

###############################################################################################################################
# Hawaii First
###############################################################################################################################

HI_mod['combo'] = HI_mod[SEASON_DAYTYPE_COLS].mean(axis=1)
# Strip '$' and ',' before casting to float. Raw string: '\$' is not a valid escape in a plain literal.
HI_mod['price'] = HI_mod['price'].replace(r'[\$,]', '', regex=True).astype(float)
# Split on the first space only. `n` must be passed by keyword (positional use fails on pandas >= 2.0).
# Cells equal to 'Half-bath', 'Private' or 'Shared' after the split are replaced with '.5'.
# NOTE(review): 'baths' stays a string column here — confirm downstream code copes with that.
HI_mod[['baths','scrap']] = HI_mod['bathrooms_text'].str.split(" ", n=1, expand=True).replace(['Half-bath', 'Private', 'Shared'], '.5')
HI_mod['rev_perunit_perDay'] = HI_mod['combo']*HI_mod['price']
# NOTE(review): if 'amenities' is a raw string as read from the CSV, .str.len() is its character
# count, not the number of amenities — confirm this is the intended feature.
HI_mod['num_amenities'] = HI_mod['amenities'].str.len()

# Attach the mean price of each (neighbourhood_group, accommodates) group to every row.
HI_mod = HI_mod.merge(
        HI_mod.groupby(['neighbourhood_group','accommodates']).agg({'price':'mean'})\
        .reset_index().rename(columns={'price':'neighbourhood_avg_price'})
        ,on=['neighbourhood_group','accommodates'] ,how='inner')

###############################################################################################################################
# San Diego Second
###############################################################################################################################


SD_mod['combo'] = SD_mod[SEASON_DAYTYPE_COLS].mean(axis=1)
# Same '$'/',' cleanup as for Hawaii.
SD_mod['price'] = SD_mod['price'].replace(r'[\$,]', '', regex=True).astype(float)
SD_mod['rev_perunit_perDay'] = SD_mod['combo']*SD_mod['price']

# Attach the mean price of each neighbourhood to every row.
SD_mod = SD_mod.merge(
        SD_mod.groupby(['neighbourhood']).agg({'price':'mean'})\
        .reset_index().rename(columns={'price':'neighbourhood_avg_price'})
        ,on=['neighbourhood'] ,how='inner')

SD_mod

In [None]:
###############################################################################################################################
# Hawaii First
###############################################################################################################################
# One row per distinct combination of the listed columns, with the number of distinct
# POI_name values in that group stored as 'Num_POI_within_1mile'; sorted by that count, descending.
# NOTE(review): groupby drops rows with NaN in any key column by default — confirm that is acceptable.
HI_mod_gb = (
    HI_mod.groupby(['listing_id','neighbourhood_group','room_type','Occupancy','NoVacancy', 'combo','price','rev_perunit_perDay',
                    'Fall_weekday', 'Fall_weekend', 'Spring_weekday', 'Spring_weekend', 'Summer_weekday', 'Summer_weekend',
                    'Winter_weekday', 'Winter_weekend', 'Fall', 'Spring', 'Summer', 'Winter', 'Weekday', 'Weekend',
                    'accommodates', 'baths', 'bedrooms', 'beds', 'num_amenities', 'neighbourhood_avg_price'
                   ])
    .agg({'POI_name': 'nunique'})
    .reset_index()
    .sort_values(by=['POI_name'], ascending=False)
    .rename(columns={'POI_name':'Num_POI_within_1mile'})
)
# A bare expression in the middle of a cell shows nothing, so display explicitly.
display(HI_mod_gb)


###############################################################################################################################
# San Diego Second
###############################################################################################################################

# Same aggregation for San Diego (no unit-specific columns among the keys).
SD_mod_gb = (
    SD_mod.groupby(['listing_id','neighbourhood','room_type','Occupancy','NoVacancy', 'combo','price','rev_perunit_perDay',
                    'Fall_weekday', 'Fall_weekend', 'Spring_weekday', 'Spring_weekend', 'Summer_weekday', 'Summer_weekend',
                    'Winter_weekday', 'Winter_weekend', 'Fall', 'Spring', 'Summer', 'Winter', 'Weekday', 'Weekend','neighbourhood_avg_price'])
    .agg({'POI_name': 'nunique'})
    .reset_index()
    .sort_values(by=['POI_name'], ascending=False)
    .rename(columns={'POI_name':'Num_POI_within_1mile'})
)
# Show the San Diego result (this previously re-displayed the Hawaii frame).
SD_mod_gb

In [None]:
###############################################################################################################################
# Hawaii First
###############################################################################################################################

# One-hot encode the two categorical columns; every other column is carried over unchanged.
HI_mod_gb_ = pd.get_dummies(
    HI_mod_gb,
    columns=['neighbourhood_group', 'room_type'],
    prefix=['neighbourhood_group', 'room_type'],
)


###############################################################################################################################
# San Diego Second
###############################################################################################################################

# Same encoding for San Diego, keyed on 'neighbourhood' instead of 'neighbourhood_group'.
SD_mod_gb_ = pd.get_dummies(
    SD_mod_gb,
    columns=['neighbourhood', 'room_type'],
    prefix=['neighbourhood', 'room_type'],
)



# Helper Functions

In [31]:
###############################################################################################################################
# Some code brought in from outside, sourced by nkmartin
###############################################################################################################################

def my_feature_importance(model=None, feature_names=None):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Importances are rescaled so the largest one equals 100, sorted ascending,
    drawn with matplotlib, saved to 'VariableImportance.png' in the working
    directory (overwriting any previous file) and shown.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the bars, in the column order the model was trained on.
        When omitted, ``model.feature_names_in_`` is used if the model has it;
        otherwise the column names of the global ``X_train`` are used, which is
        the original behaviour and is only correct if ``X_train`` is still the
        frame this model was fitted on.

    Returns
    -------
    The ``matplotlib.pyplot`` module (kept for backward compatibility).

    Raises
    ------
    ValueError
        If ``model`` is None or the number of labels does not match the number
        of importances.
    """
    if model is None:
        raise ValueError("my_feature_importance requires a fitted model")

    importances = np.asarray(model.feature_importances_, dtype=float)

    # Prefer names that belong to the model itself over notebook-global state.
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_train.keys()  # legacy fallback: relies on the global X_train
    labels = np.asarray(feature_names)
    if labels.shape[0] != importances.shape[0]:
        raise ValueError(
            "Got %d feature names for %d importances" % (labels.shape[0], importances.shape[0])
        )

    # make importances relative to max importance (skip the division if all are zero)
    peak = importances.max()
    relative = 100.0 * (importances / peak) if peak > 0 else importances
    sorted_idx = np.argsort(relative)
    pos = np.arange(sorted_idx.shape[0]) + .5

    plt.figure(figsize=(8, 18))
    plt.barh(pos, relative[sorted_idx], align='center')
    plt.yticks(pos, labels[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.savefig('VariableImportance.png')
    plt.show()
    return plt

def my_quick_profile(df:pd.DataFrame=None, col:str=None):
    """Print a short statistical profile of one column and return that column.

    Prints the column name followed by, in this order: max, 85th and 75th
    percentiles, mean, median, 25th and 15th percentiles, and min.

    Parameters
    ----------
    df : DataFrame holding the column.
    col : name of the column to profile.

    Returns
    -------
    ``df[col]``
    """
    print(col)

    # (label, statistic) pairs in print order; each statistic is computed only
    # when its line is printed.
    report = (
        ('max:',    lambda s: s.max()),
        ('q85th:',  lambda s: s.quantile(.85)),
        ('q75th:',  lambda s: s.quantile(.75)),
        ('mean:',   lambda s: s.mean()),
        ('median:', lambda s: s.median()),
        ('q25th:',  lambda s: s.quantile(.25)),
        ('q15th:',  lambda s: s.quantile(.15)),
        ('min:',    lambda s: s.min()),
    )
    for label, statistic in report:
        print(label, statistic(df[col]))

    return df[col]



# Modeling

### Basic Model - Neighbourhood only 

In [None]:
# Columns that are not predictors: the listing id, the two target columns, and the
# combo / revenue / season / day-type columns.
non_feature_cols = {'listing_id','Occupancy', 'NoVacancy', 'combo', 'rev_perunit_perDay',
                    'Fall', 'Spring', 'Summer', 'Winter','Weekday','Weekend',
                    'Fall_weekday', 'Fall_weekend', 'Spring_weekday', 'Spring_weekend',
                    'Summer_weekday', 'Summer_weekend', 'Winter_weekday', 'Winter_weekend'}

# Filter in the frame's own column order. list(set(...)) ordering depends on per-process
# string hashing, so the feature order (and potentially the fitted forest) could differ between runs.
features = [c for c in SD_mod_gb_.columns if c not in non_feature_cols]

X = SD_mod_gb_[features]

y = SD_mod_gb_['NoVacancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)


SD_reg = RandomForestRegressor(random_state=0, n_estimators=100, max_depth= 12, min_samples_split=5).fit(X_train, y_train)
# score() on a regressor is R^2 on the held-out 25%.
SD_reg.score(X_test, y_test)

In [None]:
# Trailing ';' stops the cell from echoing the pyplot module that the helper returns.
my_feature_importance(SD_reg);

### Better Model - Neighbourhood and Unit Specific Variables

In [None]:
# Columns that are not predictors: the listing id, the two target columns, and the
# combo / revenue / season / day-type columns.
non_feature_cols = {'listing_id','Occupancy', 'NoVacancy', 'combo', 'rev_perunit_perDay',
                    'Fall', 'Spring', 'Summer', 'Winter','Weekday','Weekend',
                    'Fall_weekday', 'Fall_weekend', 'Spring_weekday', 'Spring_weekend',
                    'Summer_weekday', 'Summer_weekend','Winter_weekday', 'Winter_weekend'}

# Filter in the frame's own column order. list(set(...)) ordering depends on per-process
# string hashing, so the feature order (and potentially the fitted forest) could differ between runs.
features = [c for c in HI_mod_gb_.columns if c not in non_feature_cols]

X = HI_mod_gb_[features]

y = HI_mod_gb_['NoVacancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)


HI_reg = RandomForestRegressor(random_state=0, n_estimators=100, max_depth= 12, min_samples_split=5).fit(X_train, y_train)
# score() on a regressor is R^2 on the held-out 25%.
HI_reg.score(X_test, y_test)

In [None]:
# Trailing ';' stops the cell from echoing the pyplot module that the helper returns.
my_feature_importance(HI_reg);

### Better Model - Neighbourhood, Unit Specific, and Seasonality

In [None]:
# Columns that are not predictors: the listing id, the two target columns, and the
# combo / revenue / season / day-type aggregate columns.
# The eight season x day-type columns ('Fall_weekday' ... 'Winter_weekend') are deliberately
# NOT excluded in this cell, so they are used as features.
non_feature_cols = {'listing_id','Occupancy', 'NoVacancy', 'combo', 'rev_perunit_perDay',
                    'Fall', 'Spring', 'Summer', 'Winter','Weekday','Weekend'}

# Filter in the frame's own column order. list(set(...)) ordering depends on per-process
# string hashing, so the feature order (and potentially the fitted forest) could differ between runs.
features = [c for c in HI_mod_gb_.columns if c not in non_feature_cols]

X = HI_mod_gb_[features]

# Note: this model targets 'Occupancy', unlike the earlier cells which target 'NoVacancy'.
y = HI_mod_gb_['Occupancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)


HI_reg_ = RandomForestRegressor(random_state=0, n_estimators=100, max_depth= 12, min_samples_split=5).fit(X_train, y_train)
# score() on a regressor is R^2 on the held-out 25%.
HI_reg_.score(X_test, y_test)

In [None]:
# Trailing ';' stops the cell from echoing the pyplot module that the helper returns.
my_feature_importance(HI_reg_);

# Grid Search to Tune Parameters

In [None]:
###############################################################################################################################
# Some code brought in from outside, sourced by nkmartin
###############################################################################################################################

class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-split test scores.

    Parameters
    ----------
    models : dict mapping a display name to an estimator instance.
    params : dict mapping the same names to that estimator's parameter grid.

    Raises
    ------
    ValueError
        If any name in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        print("self.params ", self.params )
        self.keys = models.keys()
        # name -> fitted GridSearchCV, filled in by fit()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered estimator.

        All keyword arguments are forwarded to GridSearchCV unchanged;
        train scores are always recorded (``return_train_score=True``).
        Each fitted search is stored in ``self.grid_searches`` under its name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with score statistics.

        For each combination the min / max / mean / std of its per-split test
        scores are computed. The result is sorted by ``sort_by`` in descending
        order, with the columns 'estimator', 'min_score', 'mean_score',
        'max_score', 'std_score' first and the parameter columns after them.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        KeyError
            If a search's ``cv_results_`` has no 'split<i>_test_score' entries.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores)
            }
            return pd.Series({**params,**d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            cv_results = gs.cv_results_
            params = cv_results['params']

            # Discover the splits from the result keys instead of range(gs.cv):
            # `cv` is not necessarily an int (it may be None or a splitter object).
            scores = []
            split = 0
            while "split{}_test_score".format(split) in cv_results:
                r = np.asarray(cv_results["split{}_test_score".format(split)])
                scores.append(r.reshape(len(params), 1))
                split += 1
            if not scores:
                raise KeyError("No 'split<i>_test_score' entries in cv_results_ for %s" % k)

            # Shape (n_param_combinations, n_splits): one row of split scores per combination.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        if not rows:
            raise ValueError("No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]
    

### Choose Models and the Parameter Values to Test

In [None]:
###############################################################################################################################
# Please note: The more combinations you add, the longer this cell will run.  Running multiple hours is not uncommon  
###############################################################################################################################
###############################################################################################################################
# Parallelizable version available for cluster use (not needed for this project size)
###############################################################################################################################


# Estimators to tune, keyed by the name that links each one to its grid in params1.
# Commented-out entries are alternatives that are currently switched off.
models1 = {
    # 'LogisticRegression': LogisticRegression(),
    # 'ExtraTreesRegressor': ExtraTreesRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    # 'AdaBoostRegressor': AdaBoostRegressor(),
    # 'GradientBoostingRegressor': GradientBoostingRegressor(),
}

# Parameter grid per estimator name; every combination of the listed values is tried.
params1 = {
    # 'LogisticRegression': {'solver':['lbfgs','liblinear'],'random_state':[0]},
    # 'ExtraTreesRegressor': { 'n_estimators': [(i+1)*150  for i in range(10)],'max_depth': [(i+1)*5 for i in range(6)] },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150],
        'max_depth': [6, 8, 10, 12, 14, 16],
        'random_state': [0],
        'min_samples_split': [5, 15, 30],
    },
    # 'AdaBoostRegressor':  { 'n_estimators': [(i+1)*50  for i in range(3)], 'learning_rate': [ 0.1, 0.05, .01] },
    # 'GradientBoostingRegressor': { 'n_estimators': [(i+1)*50  for i in range(3)],'min_samples_split':[35, 10, 25], 'max_depth': [(i)*3+7 for i in range(3)], 'learning_rate': [ 0.015, .05],  'random_state':[0] },
}

### Execute Grid Search - this cell may run very slowly, depending on the combinations above

In [None]:
# NOTE: X_train / y_train are whatever the most recently executed modeling cell produced;
# in a top-to-bottom run that is the Hawaii split with 'Occupancy' as the target.
helper = EstimatorSelectionHelper(models=models1, params=params1)
helper.fit(X_train, y_train, n_jobs=-1, scoring='r2', refit='r2')

# Ten best parameter combinations by mean cross-validated score.
grid_summary = helper.score_summary(sort_by='mean_score')
grid_summary.head(10)