In [1]:
import time
from pathlib import Path
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, GridSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
def clean_data(df: pd.DataFrame):
    df = df.drop(columns=['id', 'name', 'host_id', 'host_name'])
    df['last_review'] = pd.to_datetime(df['last_review'], infer_datetime_format=True)

    earliest_dt = min(df['last_review'])
    df = df.fillna({'reviews_per_month': 0, 'last_review': earliest_dt})

    df['last_review'] = df['last_review'].apply(lambda dt: dt.toordinal() - earliest_dt.toordinal())
    
    # one-hot encode categorical data
    df = pd.get_dummies(df)
    
    return df

In [7]:
#Load data
random_state = 91

data_fname = Path("__file__").parent / "data" / "AB_NYC_2019.csv"
df = pd.read_csv(data_fname)
df = clean_data(df)

results = pd.DataFrame(columns=['classifier', 'training_mse', 'test_mse', 'training_r2_score', 'test_r2_score'])

In [8]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

def get_errors_row(classifier, classifier_name):
    y_train_pred = classifier.predict(X_train)
    y_test_pred = classifier.predict(X_test)

    training_mse = mean_squared_error(y_train_pred, y_train)
    test_mse = mean_squared_error(y_test_pred, y_test)
          
    training_r2_score = r2_score(y_train_pred, y_train)
    test_r2_score = r2_score(y_test_pred, y_test)
    
    return pd.DataFrame([{'classifier': classifier_name,
                         'training_mse': training_mse,
                         'test_mse': test_mse,
                         'training_r2_score': training_r2_score,
                         'test_r2_score': test_r2_score}])

In [9]:
# Single decision tree

tree_classifier = tree.DecisionTreeClassifier(random_state=random_state)
tree_classifier.fit(X_train, y_train)

results = results.append(get_errors_row(tree_classifier, 'Decision tree'), ignore_index=True)

In [10]:
# Random forest

rfr_baseline = RandomForestRegressor(random_state=random_state)
rfr_baseline.fit(X_train, y_train)

results = results.append(get_errors_row(rfr_baseline, 'Random forest'), ignore_index=True)

In [11]:
## Random Forest w/ Grid Search
num_folds = 5

param_grid = {'n_estimators': list(range(2, 100, 25)),
              'max_depth': list(range(2, 20, 2))}

# runs for ~ 11 mins
rf = RandomForestRegressor(random_state=random_state)
rf_grid = GridSearchCV(estimator=rf,
                       param_grid=param_grid,
                       n_jobs=-1)

rf_grid.fit(X_train, y_train)

results = results.append(get_errors_row(rf_grid, 'Random forest Grid Search'), ignore_index=True)

In [13]:
## Random forest w/ Grid Search & CV

num_folds = 5

# param_grid = {'n_estimators': num_estimators,
#               'max_depth': num_depths}

kf = KFold(n_splits=num_folds, shuffle=True)
# rfr_ = RandomForestRegressor(n_estimators=27, max_depth=10, random_state=91)
rfr_ = rf_grid.best_estimator_

for train_index, test_index in kf.split(X):
    X_train_, X_test_ = X.loc[train_index], X.loc[test_index]
    y_train_, y_test_ = y.loc[train_index], y.loc[test_index]
    
    rfr_.fit(X_train_, y_train_)
    
results = results.append(get_errors_row(rfr_, 'Random forest (Grid Search and CV)'), ignore_index=True)

In [14]:
results

Unnamed: 0,classifier,training_mse,test_mse,training_r2_score,test_r2_score
0,Decision tree,0.056812,82596.299591,0.999999,-0.919234
1,Random forest,9722.671912,55064.873784,0.718828,-2.234168
2,Random forest Grid Search,22846.64922,51186.10173,-0.113996,-3.892626
3,Random forest (Grid Search and CV),28845.671397,25803.340619,-0.557187,-0.59541


In [None]:
## Feature importance
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=random_state)

forest.fit(X, y)

In [None]:
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
    
top_10 = sorted([(X.columns[indices[i]], importances[indices[i]]) for i in range(X.shape[1])],
                key=lambda x: x[1],
                reverse=True)[:10]

for i, val in top_10:
    print("%-30s %f" % (i, val))