In [2]:
import pandas as pd
import os
import seaborn as sns
from time import time
import pickle
path = 'data/'

df = pd.read_csv(os.path.join(path,'airbnb_listings_usa_cycle_1.csv'))
# df=df[:10000]

In [3]:
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 100)

In [4]:
# 3. Arrange X features matrix & y target vector

target = ['price']

features = ['host_response_time','host_response_rate','host_acceptance_rate',
'street','neighbourhood','neighbourhood_cleansed','neighbourhood_group_cleansed','city','state','zipcode','market','smart_location','latitude','longitude','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','amenities','square_feet','minimum_nights','maximum_nights','instant_bookable','is_business_travel_ready','cancellation_policy','require_guest_profile_picture','require_guest_phone_verification','notes_len','transit_len','access_len','interaction_len','house_rules_len','host_about_len','metro_area','bedrooms_str','beds_str']

# Wrangle and pre-process

# Removing sparse features from features (sparse features are <90% populated)
sparse_features = ['square_feet','neighbourhood_group_cleansed','host_response_rate','host_response_time','neighbourhood','host_acceptance_rate']

unusable_features = ['amenities']

duplicative_location_features = ['street','neighbourhood','neighbourhood_cleansed','neighbourhood_group_cleansed','city','state','zipcode','market','smart_location','metro_area']

numeric_columns = df.dtypes[df.dtypes==int].index.tolist()
nonnumeric_columns = df.dtypes[df.dtypes==object].index.tolist()

df[nonnumeric_columns] = df[nonnumeric_columns].astype(str)

selected_features = list(set(features) - set(sparse_features))
selected_features = list(set(selected_features) - set(unusable_features))
selected_features = list(set(selected_features) - set(duplicative_location_features))

df = df[target + selected_features]

df.dropna(inplace=True)

In [5]:
y = df[target]
X = df[selected_features]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [7]:
# We need to encode features!

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import category_encoders as ce

In [8]:
X_train.shape

(184760, 24)

In [9]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

encoder = ce.OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
# scaler = StandardScaler()
model = LinearRegression()

X_train_encoded = encoder.fit_transform(X_train)
X_train_imputed = imputer.fit_transform(X_train_encoded)
model.fit(X_train_imputed, y_train)

X_test_encoded = encoder.transform(X_test)
X_test_imputed = imputer.transform(X_test_encoded)
y_pred = model.predict(X_test_imputed)

mae = mean_absolute_error(y_test, y_pred)

print(f'mae ${mae:,.0f}')
# print(y_pred)



mae $179


Decision trees


In [20]:
# TODO
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

# These are the only two changes from the previous cell:
# Remove StandardScaler (it's not needed or helpful for trees)
# Change the model from LogisticRegression to DecisionTreeClassifier

pipeline_decision_tree = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='mean'), 
    DecisionTreeRegressor(random_state=42)
)

# Fit on train
pipeline_decision_tree.fit(X_train, y_train)

# Score on train, val
print('Train Accuracy', pipeline_decision_tree.score(X_train, y_train))
print('Test Accuracy', pipeline_decision_tree.score(X_test, y_pred),"does not seem right")

# Predict on test
y_pred = pipeline_decision_tree.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')

Train Accuracy 0.9999730020564809
Test Accuracy 1.0 does not seem right
mae $134


In [11]:
# TODO
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

# These are the only two changes from the previous cell:
# Remove StandardScaler (it's not needed or helpful for trees)
# Change the model from LogisticRegression to DecisionTreeClassifier

pipeline_random_forest_regressor = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='mean'), 
    RandomForestRegressor(random_state=42)
)

# Fit on train
pipeline_random_forest_regressor.fit(X_train, y_train)

# Score on train, val
print('Train Accuracy', pipeline_random_forest_regressor.score(X_train, y_train))
print('Test Accuracy', pipeline_random_forest_regressor.score(X_test, y_pred),"does not seem right")

# Predict on test
y_pred = pipeline_random_forest_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')

Train Accuracy 0.9504158997545168
Test Accuracy 0.645565322224587 does not seem right
mae $105


Save and load decision tree

In [26]:
# Save decision tree 
with open('decision_tree_model.pkl','wb') as model_file:
    pickle.dump(pipeline_decision_tree, model_file)

# 20.1MB file size    

In [27]:
# load model
with open('decision_tree_model.pkl','rb') as model_file:
    loaded_model = pickle.load(model_file)

In [28]:
loaded_model.predict(X_train.iloc[:2,:])


array([100.  ,  67.25])

Save and load random forest

In [22]:
# Save random forest 
with open('random_forest_regressor.pkl','wb') as model_file:
    pickle.dump(pipeline_random_forest_regressor, model_file)

# 20.1MB file size    

In [23]:
# load model
with open('random_forest_regressor.pkl','rb') as model_file:
    loaded_model = pickle.load(model_file)

In [24]:
loaded_model.predict(X_train.iloc[:2,:])

array([100.  ,  67.25])

In [16]:
X_test.head()

Unnamed: 0,house_rules_len,bathrooms,maximum_nights,is_business_travel_ready,bed_type,longitude,property_type,minimum_nights,beds_str,host_about_len,require_guest_profile_picture,accommodates,require_guest_phone_verification,instant_bookable,latitude,bedrooms,notes_len,interaction_len,room_type,transit_len,access_len,bedrooms_str,beds,cancellation_policy
169519,3,1.0,1125,f,Real Bed,-73.95477,Apartment,5,2,3,f,4,f,t,40.76775,2.0,3,125,Entire home/apt,3,257,2,2.0,flexible
102793,1000,2.5,180,f,Real Bed,-118.64547,House,30,3,16,f,8,f,f,34.22218,5.0,3,3,Entire home/apt,219,3,5,3.0,strict_14_with_grace_period
69294,3,1.0,1125,f,Real Bed,-74.05285,Apartment,1,4,21,f,8,f,f,40.74043,3.0,109,51,Entire home/apt,3,22,3,4.0,flexible
212757,36,1.0,1125,f,Real Bed,-122.30712,Apartment,2,2,227,f,4,f,t,47.61876,1.0,3,212,Entire home/apt,3,3,1,2.0,flexible
194147,3,3.0,14,f,Real Bed,-117.16584,House,1,1,3,f,2,f,t,32.723,1.0,3,3,Private room,3,3,1,1.0,flexible


In [17]:
X_train[:2,:]

TypeError: '(slice(None, 2, None), slice(None, None, None))' is an invalid key

Possible next steps:

Cross validation
RandomizedSearchCV
GridSearchCV



In [None]:
References

** Dec 24 ** MVP #2.
Decision tree on own-selected few features.



In [32]:
selected_features

['house_rules_len',
 'bathrooms',
 'maximum_nights',
 'is_business_travel_ready',
 'bed_type',
 'longitude',
 'property_type',
 'minimum_nights',
 'beds_str',
 'host_about_len',
 'require_guest_profile_picture',
 'accommodates',
 'require_guest_phone_verification',
 'instant_bookable',
 'latitude',
 'bedrooms',
 'notes_len',
 'interaction_len',
 'room_type',
 'transit_len',
 'access_len',
 'bedrooms_str',
 'beds',
 'cancellation_policy']

In [34]:
df[selected_features].head()

Unnamed: 0,house_rules_len,bathrooms,maximum_nights,is_business_travel_ready,bed_type,longitude,property_type,minimum_nights,beds_str,host_about_len,require_guest_profile_picture,accommodates,require_guest_phone_verification,instant_bookable,latitude,bedrooms,notes_len,interaction_len,room_type,transit_len,access_len,bedrooms_str,beds,cancellation_policy
0,3,1.0,7,f,Real Bed,-97.69477,House,1,1,3,f,2,f,f,30.22302,1.0,3,110,Private room,378,186,1,1.0,moderate
1,211,2.0,14,f,Real Bed,-97.78217,House,1,2,3,f,8,f,f,30.452,2.0,44,3,Entire home/apt,3,3,2,2.0,moderate
2,3,1.0,7,f,Real Bed,-97.74691,Apartment,1,2,104,f,4,f,f,30.25483,2.0,3,18,Entire home/apt,31,122,2,2.0,moderate
3,3,1.0,1125,f,Real Bed,-97.72036,Guesthouse,1,1,3,f,2,f,t,30.34519,0.0,3,3,Entire home/apt,3,3,0,1.0,flexible
4,506,2.0,1125,f,Real Bed,-97.73246,Bungalow,2,6,59,f,10,f,t,30.26133,4.0,40,130,Entire home/apt,587,49,4,6.0,flexible


In [36]:
path = 'temp/'

df[selected_features].to_csv(os.path.join(path,'test.csv'))
# df=df[:10000]

In [37]:
df.bed_type.value_counts()

Real Bed         229073
Futon               730
Pull-out Sofa       552
Airbed              398
Couch               197
Name: bed_type, dtype: int64

In [39]:
df.require_guest_profile_picture.value_counts()

f    226930
t      4020
Name: require_guest_profile_picture, dtype: int64

In [40]:
df.require_guest_phone_verification.value_counts()

f    225601
t      5349
Name: require_guest_phone_verification, dtype: int64

**drop the following features**

* is_business_travel_ready
* require_guest_profile_picture
* require_guest_phone_verification

* host_about_len
* notes_len
* interaction_len
* access_len # what is this?

let's call this 'elminations_dec24' for that lack of a better label


In [90]:
# 3. Arrange X features matrix & y target vector

target = ['price']

# features: original 39 (Q: how was this 39 determined?)



# sparse_features 6

# unusable_features 1

# duplicative location feaatures

# duplicative_location_features  10

eliminations_dec24 = ['is_business_travel_ready','require_guest_profile_picture','require_guest_phone_verification','host_about_len','notes_len','interaction_len',
'access_len','house_rules_len','beds_str','bedrooms_str']

#numeric_columns
# nonnumeric_columns

# df[nonnumeric_columns] = df[nonnumeric_columns].astype(str)

# selected_features = list(set(features) - set(sparse_features))
# selected_features = list(set(selected_features) - set(unusable_features))
# selected_features = list(set(selected_features) - set(duplicative_location_features))
selected_features = list(set(selected_features) - set(eliminations_dec24))

df = df[target + selected_features]

df.dropna(inplace=True)

In [93]:
df.columns

Index(['price', 'minimum_nights', 'transit_len', 'room_type', 'accommodates',
       'longitude', 'bedrooms', 'instant_bookable', 'beds', 'bathrooms',
       'maximum_nights', 'cancellation_policy', 'bed_type', 'latitude',
       'property_type'],
      dtype='object')

In [101]:
len(df.columns)

15

In [94]:
from time import time
st = time()
y = df[target]
X = df[selected_features]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

pipeline_decision_tree = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='mean'), 
    DecisionTreeRegressor(random_state=42)
)

# Fit on train
pipeline_decision_tree.fit(X_train, y_train)

# Score on train, val
print('Train Accuracy', pipeline_decision_tree.score(X_train, y_train))
print('Test Accuracy', pipeline_decision_tree.score(X_test, y_pred),"does not seem right")

# Predict on test
y_pred = pipeline_decision_tree.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')
et = time()
print('ran in {:.2f} seconds'.format(et-st))

Train Accuracy 0.9999404436975107
Test Accuracy 0.48448725173098806 does not seem right
mae $131
ran in 8.83 seconds


In [95]:
from time import time
st = time()
y = df[target]
X = df[selected_features]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

pipeline_random_forest_regressor = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='mean'), 
    RandomForestRegressor(random_state=42)
)

# Fit on train
pipeline_random_forest_regressor.fit(X_train, y_train)

# Score on train, val
print('Train Accuracy', pipeline_random_forest_regressor.score(X_train, y_train))
print('Test Accuracy', pipeline_random_forest_regressor.score(X_test, y_pred),"does not seem right")

# Predict on test
y_pred = pipeline_random_forest_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')
et = time()
print('ran in {:.2f} seconds'.format(et-st))

Train Accuracy 0.942140989166887
Test Accuracy 0.7201920248469412 does not seem right
mae $107
ran in 290.92 seconds


In [96]:
pipeline_random_forest_regressor

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['room_type', 'instant_bookable',
                                     'cancellation_policy', 'bed_type',
                                     'property_type'],
                               use_cat_names=True)),
                ('simpleimputer', SimpleImputer()),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [97]:
# Save random forest 
with open('random_forest_regressor_fewer_features.pkl','wb') as model_file:
    pickle.dump(pipeline_random_forest_regressor, model_file)

# # 1.31GB file size    

# Save random forest 
with open('decision_tree_model_fewer_features.pkl','wb') as model_file:
    pickle.dump(pipeline_decision_tree, model_file)

# 20.3MB file size    

In [98]:
# load model
st = time()
with open('random_forest_regressor_fewer_features.pkl','rb') as model_file:
    loaded_model_2 = pickle.load(model_file)

y_pred = loaded_model_2.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')
et = time()
print('ran in {:.2f} seconds'.format(et-st))


mae $107
ran in 10.50 seconds


mae $107
ran in 2971.03 seconds


In [99]:
# for testing
X_train.shape
X_train.head()

Unnamed: 0,minimum_nights,transit_len,room_type,accommodates,longitude,bedrooms,instant_bookable,beds,bathrooms,maximum_nights,cancellation_policy,bed_type,latitude,property_type
39931,1,110,Entire home/apt,5,-82.98802,2.0,f,2.0,1.0,1125,strict_14_with_grace_period,Real Bed,39.95069,Condominium
217369,30,287,Entire home/apt,3,-122.34084,1.0,t,1.0,1.0,1125,strict_14_with_grace_period,Real Bed,47.66101,Serviced apartment
153008,1,3,Entire home/apt,4,-73.94023,2.0,t,2.0,1.5,30,flexible,Real Bed,40.82495,Apartment
180362,3,63,Entire home/apt,10,-71.22206,5.0,f,5.0,2.5,1125,moderate,Real Bed,41.64919,House
86132,31,345,Entire home/apt,3,-118.26353,1.0,t,2.0,1.0,365,strict_14_with_grace_period,Real Bed,34.0309,Apartment


In [100]:

X_train.shape

(184760, 14)

In [103]:
path = 'temp/'
X_train[:100].to_csv(os.path.join(path,'X_train_100rows.csv'))


In [104]:
test = pd.read_csv(
        '/Users/jasimrashid/Projects/DS-Unit-4-Build-Week-4-Airbnb/temp/x_train_100rows.csv', index_col=False)
    

In [105]:
del test['Unnamed: 0']

In [115]:
X_train.columns

Index(['minimum_nights', 'transit_len', 'room_type', 'accommodates',
       'longitude', 'bedrooms', 'instant_bookable', 'beds', 'bathrooms',
       'maximum_nights', 'cancellation_policy', 'bed_type', 'latitude',
       'property_type'],
      dtype='object')

In [107]:
test

Unnamed: 0,minimum_nights,transit_len,room_type,accommodates,longitude,bedrooms,instant_bookable,beds,bathrooms,maximum_nights,cancellation_policy,bed_type,latitude,property_type
0,1,110,Entire home/apt,5,-82.98802,2.0,f,2.0,1.0,1125,strict_14_with_grace_period,Real Bed,39.95069,Condominium
1,30,287,Entire home/apt,3,-122.34084,1.0,t,1.0,1.0,1125,strict_14_with_grace_period,Real Bed,47.66101,Serviced apartment
2,1,3,Entire home/apt,4,-73.94023,2.0,t,2.0,1.5,30,flexible,Real Bed,40.82495,Apartment
3,3,63,Entire home/apt,10,-71.22206,5.0,f,5.0,2.5,1125,moderate,Real Bed,41.64919,House
4,31,345,Entire home/apt,3,-118.26353,1.0,t,2.0,1.0,365,strict_14_with_grace_period,Real Bed,34.03090,Apartment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3,754,Entire home/apt,6,-117.90609,2.0,t,3.0,1.0,28,strict_14_with_grace_period,Real Bed,33.98681,Condominium
96,1,430,Private room,2,-73.96032,1.0,t,1.0,1.0,30,moderate,Real Bed,40.70910,Apartment
97,1,3,Private room,1,-117.07125,1.0,t,0.0,1.5,1125,moderate,Real Bed,32.79082,House
98,3,182,Entire home/apt,10,-86.67337,3.0,t,7.0,2.0,7,strict_14_with_grace_period,Real Bed,36.16374,House


In [108]:
X_train.columns

Index(['minimum_nights', 'transit_len', 'room_type', 'accommodates',
       'longitude', 'bedrooms', 'instant_bookable', 'beds', 'bathrooms',
       'maximum_nights', 'cancellation_policy', 'bed_type', 'latitude',
       'property_type'],
      dtype='object')

In [None]:
# input form variables

    # longitude = request.form['longitude']
    # latitude = request.form['latitude']

    # minimum_nights = request.form['minimum_nights']
    # maximum_nights = int(request.form['maximum_nights'])
    # property_type = request.form['property_type']
    # room_type = request.form['room_type']
    # bathrooms = int(request.form['bathrooms'])
    # accommodates = request.form['accommodates']
    # bedrooms = int(request.form['bedrooms'])
    # beds = int(request.form['beds'])
    # bed_type = request.form['bed_type']
    # transit_len = int(request.form['transit_len'])
    # instant_bookable = request.form['instant_bookable']
    # cancellation_policy = request.form['cancellation_policy']

In [110]:
X_train[['latitude','longitude']]

Unnamed: 0,latitude,longitude
39931,39.95069,-82.98802
217369,47.66101,-122.34084
153008,40.82495,-73.94023
180362,41.64919,-71.22206
86132,34.03090,-118.26353
...,...,...
120860,40.79323,-73.97267
104426,34.06525,-118.39569
132975,40.71780,-73.95595
147923,40.76457,-73.97518


In [None]:
# Austin
latitude = 30.2672
longitude = 97.7431

In [117]:
df.columns

Index(['price', 'minimum_nights', 'transit_len', 'room_type', 'accommodates',
       'longitude', 'bedrooms', 'instant_bookable', 'beds', 'bathrooms',
       'maximum_nights', 'cancellation_policy', 'bed_type', 'latitude',
       'property_type'],
      dtype='object')

In [119]:
path = 'data/'
temp = pd.read_csv(os.path.join(path,'airbnb_listings_usa_cycle_1.csv'))

In [121]:
temp.metro_area.unique()

array(['Austin', 'Boston', 'Broward', 'Cambridge', 'Chicago',
       'Twin Cities', 'Clark CO', 'Columbus', 'Denver', 'Hawaii',
       'Jersey City', 'New York City', 'Los Angeles', 'Oakland',
       'Nashville', 'New Orleans', 'Santa Clara', 'Portland',
       'Rhode Island', 'Salem', 'San Diego', 'San Francisco', 'Seattle',
       'Washington DC'], dtype=object)