In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeRegressor

# Read the Airbnb sample CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='airbnb_sample_landmarks_zipcodes.csv')

In [9]:
# Dimensions of `df` as a (rows, columns) tuple
df.shape

(25000, 92)

In [85]:
# Check for missing values: count the missing (NaN/None) entries in each column
df.isna().sum()

id                              0
last_scraped                    0
latitude                        0
longitude                       0
accommodates                    0
                             ... 
num_landmarks_dist_500m         0
closest_landmark                0
mean_log_dist_landmarks         0
median_log_dist_landmarks       0
zip_code                     2212
Length: 92, dtype: int64

In [135]:
# Drop every row that has a missing value in any column.
# `df` is rebound to the result instead of using inplace=True, so the step
# stays chainable and does not mutate the frame object that was loaded.
df = df.dropna()

In [136]:
# Discard the transformed variables: every column whose name contains 'log'.
# NOTE(review): this is a substring match, so any other column with 'log' in its
# name would be removed as well -- confirm that only transformed columns match.
df = df.drop(columns=df.columns[df.columns.str.contains('log')])

In [137]:
# Remove the columns that are not needed for modelling
df = df.drop(
    labels=['id', 'last_scraped', 'latitude', 'longitude', 'borough'],
    axis='columns',
)

In [138]:
# Cast both label-like columns to the pandas 'category' dtype in a single call
df = df.astype({
    'closest_landmark': 'category',
    'zip_code': 'category',
})

In [139]:
# One-hot encode the categorical columns (closest_landmark and zip_code).
# With no `columns` argument, get_dummies converts every object/string/category
# column in the frame, not just those two.
df = pd.get_dummies(data=df)

In [140]:
# Hold out 20% of the rows as a test set; `price` is the target and every
# remaining column is a feature. The seed is fixed so the split is reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(
    df.drop(columns=['price']),
    df['price'],
    test_size=0.2,
    random_state=42,
)

#### Random Forest Model

In [141]:
# Random forest with default hyperparameters and a fixed seed, scored with
# 5-fold cross-validation on the training split (negated RMSE and R-squared).
rForest = RandomForestRegressor(random_state=42)
rForest_cv = cross_validate(
    estimator=rForest,
    X=xTrain,
    y=yTrain,
    scoring=['neg_root_mean_squared_error', 'r2'],
    cv=5,
)

In [142]:
# Report the cross-validation averages for the random forest.
# The f-strings are double-quoted so the single-quoted dict keys inside the
# braces also parse on Python versions older than 3.12.
print('------Random Forest------')
print(f"Mean R-Squared: {rForest_cv['test_r2'].mean()}")
# The scorer returns the negated RMSE (so that higher is better); flip the sign back for display.
print(f"Mean RMSE: {-rForest_cv['test_neg_root_mean_squared_error'].mean()}")

------Random Forest------
Mean R-Squared: 0.26158165122022925
Mean RSME: 340.9826379449663


#### Decision Tree Model

In [143]:
# Single decision tree with default hyperparameters and a fixed seed, scored with
# the same 5-fold cross-validation metrics (negated RMSE and R-squared).
decisionTree = DecisionTreeRegressor(random_state=5280)
decisionTree_cv = cross_validate(
    estimator=decisionTree,
    X=xTrain,
    y=yTrain,
    scoring=['neg_root_mean_squared_error', 'r2'],
    cv=5,
)

In [144]:
# Report the cross-validation averages for the decision tree.
# The f-strings are double-quoted so the single-quoted dict keys inside the
# braces also parse on Python versions older than 3.12.
print('------Decision Tree------')
# R-squared can be negative: that means the held-out folds were predicted worse
# than a constant prediction of the mean target would have been.
print(f"Mean R-Squared: {decisionTree_cv['test_r2'].mean()}")
# The scorer returns the negated RMSE (so that higher is better); flip the sign back for display.
print(f"Mean RMSE: {-decisionTree_cv['test_neg_root_mean_squared_error'].mean()}")

------Decision Tree------
Mean R-Squared: -2.0304076661673434
Mean RSME: 569.5276251842519
