### airbnb_predictions_1_linear (drop lat/long geographical features) ###

In [151]:

import pandas as pd
import os
import seaborn as sns
import pickle
from time import time
from sklearn.linear_model import LinearRegression
path = 'data/'
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 100)

df = pd.read_csv(os.path.join(path,'airbnb_listings_usa_cycle_1.csv'))

# df=df[:10000]

In [152]:
df.metro_area.unique()

array(['Austin', 'Boston', 'Broward', 'Cambridge', 'Chicago',
       'Twin Cities', 'Clark CO', 'Columbus', 'Denver', 'Hawaii',
       'Jersey City', 'New York City', 'Los Angeles', 'Oakland',
       'Nashville', 'New Orleans', 'Santa Clara', 'Portland',
       'Rhode Island', 'Salem', 'San Diego', 'San Francisco', 'Seattle',
       'Washington DC'], dtype=object)

In [153]:
# 3. Arrange X features matrix & y target vector

target = ['price']

features = ['host_response_time','host_response_rate','host_acceptance_rate',
'street','neighbourhood','neighbourhood_cleansed','neighbourhood_group_cleansed','city','state','zipcode','market','smart_location','latitude','longitude','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','amenities','square_feet','minimum_nights','maximum_nights','instant_bookable','is_business_travel_ready','cancellation_policy','require_guest_profile_picture','require_guest_phone_verification','notes_len','transit_len','access_len','interaction_len','house_rules_len','host_about_len','metro_area','bedrooms_str','beds_str']

# Wrangle and pre-process

# Removing sparse features from features (sparse features are <90% populated)
sparse_features = ['square_feet','neighbourhood_group_cleansed','host_response_rate','host_response_time','neighbourhood','host_acceptance_rate']

unusable_features = ['amenities']

# v2 Exlcude zipcode, latitude and longitude, bedrooms_str and beds_str
duplicative_location_features = ['street','neighbourhood','neighbourhood_cleansed','neighbourhood_group_cleansed','city','state','latitude','longitude','market','smart_location','zipcode','bedrooms_str','beds_str']

numeric_columns = df.dtypes[df.dtypes==int].index.tolist()
nonnumeric_columns = df.dtypes[df.dtypes==object].index.tolist()

df[nonnumeric_columns] = df[nonnumeric_columns].astype(str)

selected_features = list(set(features) - set(sparse_features))
selected_features = list(set(selected_features) - set(unusable_features))
selected_features = list(set(selected_features) - set(duplicative_location_features))

df = df[target + selected_features]

df.dropna(inplace=True)

In [154]:
y = df[target]
X = df[selected_features]
y.shape, X.shape


((230950, 1), (230950, 21))

In [155]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)



#### Using pipelines for linear regression

In [156]:
y = df[target]
X = df[selected_features]
y.shape, X.shape

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42)

In [157]:
# User pipelines for linear regression
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='mean'),
    LinearRegression()
)

In [158]:
pipeline.fit(X_train, y_train)

# print('Train Accuracy', pipeline.score(X_train, y_train))
# print('Test Accuracy', pipeline.score(X_test, y_pred),"does not seem right")

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'mae ${mae:,.0f}')

r2 = r2_score(y_test, y_pred)
print(f'r2 ${r2}')

mae $186
r2 $0.11699969254468001


In [161]:
# Save pipeline 
with open('linear_model_pipeline_v2.pkl','wb') as model_file:
    pickle.dump(pipeline, model_file)

In [162]:
# load model
with open('linear_model_pipeline_v2.pkl','rb') as model_file:
    loaded_model = pickle.load(model_file)

In [163]:
# Run prediction
loaded_model.predict(X_train.iloc[:2,:])

# array([[134.01877682],[194.9625043 ]])

array([[269.78101657],
       [182.17696372]])

In [174]:
# Export X_train
import numpy as np
X_train[:10].to_csv("X_train.csv",index=False)

In [173]:
X_train

Unnamed: 0,require_guest_profile_picture,minimum_nights,bathrooms,transit_len,maximum_nights,host_about_len,accommodates,property_type,room_type,interaction_len,notes_len,instant_bookable,bed_type,access_len,require_guest_phone_verification,is_business_travel_ready,cancellation_policy,bedrooms,metro_area,beds,house_rules_len
39931,f,1,1.0,110,1125,3,5,Condominium,Entire home/apt,120,3,f,Real Bed,171,f,f,strict_14_with_grace_period,2.0,Columbus,2.0,67
217369,f,30,1.0,287,1125,3,3,Serviced apartment,Entire home/apt,270,455,t,Real Bed,317,f,f,strict_14_with_grace_period,1.0,Seattle,1.0,376
153008,f,1,1.5,3,30,3,4,Apartment,Entire home/apt,3,3,t,Real Bed,3,f,f,flexible,2.0,New York City,2.0,3
180362,f,3,2.5,63,1125,316,10,House,Entire home/apt,383,127,f,Real Bed,132,f,f,moderate,5.0,Rhode Island,5.0,114
86132,f,31,1.0,345,365,3,3,Apartment,Entire home/apt,212,3,t,Real Bed,67,f,f,strict_14_with_grace_period,1.0,Los Angeles,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120860,f,5,1.0,3,60,25,2,Apartment,Entire home/apt,3,3,f,Real Bed,3,f,f,moderate,1.0,New York City,1.0,3
104426,f,31,1.0,204,1125,467,4,Apartment,Entire home/apt,122,3,f,Real Bed,3,f,f,strict_14_with_grace_period,1.0,Los Angeles,1.0,58
132975,f,4,1.0,567,10,376,2,Apartment,Entire home/apt,177,3,f,Real Bed,168,f,f,strict_14_with_grace_period,1.0,New York City,1.0,137
147923,f,30,1.0,172,1125,3,1,Apartment,Entire home/apt,3,40,f,Real Bed,12,f,f,flexible,1.0,New York City,0.0,3


#### Linear model + Kbest in a pipeline

In [121]:
# from sklearn.feature_selection import SelectKBest
# selector = SelectKBest(k=15) #going down from 107 features to 15???
# X_train_selected = selector.fit_transform(X_train, y_train)
# X_test_selected = selector.transform(X_test)

In [122]:
# # User pipelines for linear regression
# import category_encoders as ce
# from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LinearRegression
# from sklearn.impute import SimpleImputer
# from sklearn.feature_selection import SelectKBest
# selector = SelectKBest(k=15) #going down from 107 features to 15???

# pipeline = make_pipeline(
#     ce.OneHotEncoder(use_cat_names=True),
#     SimpleImputer(strategy='mean'),
#     LinearRegression()
# )

# X_train_selected = selector.fit_transform(X_train, y_train)
# pipeline.fit(X_train_selected,)

# # pipeline.fit(X_train, y_train)



In [123]:
# X_train_selected

In [165]:
X_train.shape

(184760, 21)

In [177]:
X_test = pd.DataFrame({'metro_area': ['New York City'], 'minimum_nights': ['1'], 'maximum_nights': ['1'], 'property_type': ['House'], 'room_type': ['Private room'], 'bathrooms': ['1'], 'accommodates': ['1'], 'bedrooms': ['1'], 'beds': ['1'], 'bed_type': ['Real Bed'], 'require_guest_profile_picture': ['t'], 'transit_len': ['500'], 'host_about_len': ['1000'], 'notes_len': ['500'], 'access_len': ['500'], 'house_rules_len': ['500'], 'interaction_len': ['350'], 'instant_bookable': ['t'], 'require_guest_phone_verification': ['t'], 'is_business_travel_ready': ['f'], 'cancellation_policy': ['moderate']})

In [178]:
loaded_model.predict(X_test)

array([[76257213.62279014]])

**OOPS**!

We're run into a problem. The latitude and longitude are the only  location attribute for our prediction model. However, these coordinates would not be meaningful to the user as they are not familiar with the locations they correspond to. We could solve this by 
>(a) translating the coordinates to their corresponding neighborhoods in the app. 
>(b) Or we could pick a different location variable. 

We'll pick (b) which we'll implement in a separate notebook.

#### References ####

Transforming zipcode: https://towardsdatascience.com/transforming-categorical-information-into-usable-measures-in-a-machine-learning-model-e2910bbb3fc7



Tangents

* R2 
* 

Drop down values and ranges for prediction form

In [None]:
['host_response_time',
'host_response_rate',
'host_acceptance_rate',
'street',
'neighbourhood',
'neighbourhood_cleansed',
'neighbourhood_group_cleansed','city','state','zipcode','market','smart_location','latitude','longitude','property_type','room_type','accommodates','bathrooms','bedrooms','beds','bed_type','amenities','square_feet','minimum_nights','maximum_nights','instant_bookable','is_business_travel_ready','cancellation_policy','require_guest_profile_picture','require_guest_phone_verification','notes_len','transit_len','access_len','interaction_len','house_rules_len','host_about_len','metro_area','bedrooms_str','beds_str']

In [140]:
print("X.shape ",X.shape)

print("X.metro_area.unique() ",X.metro_area.unique()) #['Austin' 'Boston' 'Broward' 'Cambridge' 'Chicago' 'Twin Cities','Clark CO' 'Columbus' 'Denver' 'Hawaii' 'Jersey City' 'New York City','Los Angeles','Oakland' 'Nashville' 'New Orleans' 'Santa Clara', 'Portland' 'Rhode Island' 'Salem' 'San Diego' 'San Francisco' 'Seattle','Washington DC']
print("X.require_guest_profile_picture.unique() ",X.require_guest_profile_picture.unique())
# print(X.minimum_nights.value_counts()) #range -> 0 to 90
print("X.minimum_nights>=60",(X.minimum_nights>=60).sum())
print("X.minimum_nights>=90",(X.minimum_nights>=90).sum())
print("X.maximum_nights.quantile(.99) ",X.maximum_nights.quantile(.99))

print("X.property_type.unique() ", X.property_type.unique()) #['House' 'Apartment' 'Guesthouse' 'Bungalow' 'Condominium' 'Townhouse', 'Resort' 'Tiny house' 'Cottage' 'Guest suite' 'Camper/RV' 'Loft' 'Hostel','Farm stay' 'Chalet' 'Boutique hotel' 'Serviced apartment' 'Villa', 'Cabin' 'Houseboat' 'Bed and breakfast' 'Dome house' 'Aparthotel','Campsite' 'Yurt' 'Tent' 'Boat' 'Hotel','Treehouse' 'Earth house' 'Other', 'Castle' 'Barn' 'Casa particular (Cuba)' 'Island' 'Hut' 'Timeshare','Lighthouse' 'Cave' 'Nature lodge' 'Bus' 'Igloo' 'Vacation home' 'Tipi','Train' 'Minsu (Taiwan)' 'Dorm' 'Pension (South Korea)' 'In-law']
print("X.room_type.unique() ", X.room_type.unique()) #['Private room' 'Entire home/apt' 'Shared room' 'Hotel room']

print("(X.bathrooms>=5) ",(X.bathrooms>=5).sum()) # 0 to 8
print("X.accommodates.quantile(.99) ",X.accommodates.quantile(.99)) #0 to 15
print("X.bedrooms.quantile(.99) ", X.bedrooms.quantile(.99)) # 0 to 5
print("X.beds.quantile(.99) ",X.beds.quantile(.99)) # 0 to 9
print("X.bed_type.unique() ",X.bed_type.unique()) #['Real Bed' 'Airbed' 'Futon' 'Pull-out Sofa' 'Couch']
print("X.beds_str.unique() ",X.beds_str.unique()) #['1' '2' '6' '3' '4' '7' '5' '8' '0' '10+' '9']
print("X.bedrooms_str.unique() ",X.bedrooms_str.unique()) #['1' '2' '0' '4' '3' '5' '6' '7+']

print("X.transit_len.quantile(.99)) ", X.transit_len.quantile(.99)) #0 to 1000
print("X.host_about_len.quantile(.995) ",X.host_about_len.quantile(.995)) #0 to 2000
print("X.notes_len.quantile(.99) ",X.notes_len.quantile(.99)) #0 to 1000
print("X.access_len.quantile(.99) ", X.access_len.quantile(.99)) #0 to 1000
print("X.house_rules_len.quantile(.99) ",X.house_rules_len.quantile(.99)) # 0 to 1000
print("X.interaction_len.quantile(.99) ",X.interaction_len.quantile(.99)) #0 to 700

print("X.instant_bookable.unique() ",X.instant_bookable.unique()) #['f','t']
print("X.require_guest_phone_verification.unique() ",X.require_guest_phone_verification.unique()) #['f' 't']
print("X.is_business_travel_ready.unique() ",X.is_business_travel_ready.unique()) #['f']
print("X.cancellation_policy.unique() ",X.cancellation_policy.unique()) #'moderate' 'flexible' 'strict_14_with_grace_period' 'super_strict_30','super_strict_60' 'luxury_moderate' 'strict' 'luxury_super_strict_95','luxury_no_refund' 'luxury_super_strict_125']




X.shape  (230950, 23)
X.metro_area.unique()  ['Austin' 'Boston' 'Broward' 'Cambridge' 'Chicago' 'Twin Cities'
 'Clark CO' 'Columbus' 'Denver' 'Hawaii' 'Jersey City' 'New York City'
 'Los Angeles' 'Oakland' 'Nashville' 'New Orleans' 'Santa Clara'
 'Portland' 'Rhode Island' 'Salem' 'San Diego' 'San Francisco' 'Seattle'
 'Washington DC']
X.require_guest_profile_picture.unique()  ['f' 't']
X.minimum_nights>=60 3711
X.minimum_nights>=90 2952
X.maximum_nights.quantile(.99)  1125.0
X.property_type.unique()  ['House' 'Apartment' 'Guesthouse' 'Bungalow' 'Condominium' 'Townhouse'
 'Resort' 'Tiny house' 'Cottage' 'Guest suite' 'Camper/RV' 'Loft' 'Hostel'
 'Farm stay' 'Chalet' 'Boutique hotel' 'Serviced apartment' 'Villa'
 'Cabin' 'Houseboat' 'Bed and breakfast' 'Dome house' 'Aparthotel'
 'Campsite' 'Yurt' 'Tent' 'Boat' 'Hotel' 'Treehouse' 'Earth house' 'Other'
 'Castle' 'Barn' 'Casa particular (Cuba)' 'Island' 'Hut' 'Timeshare'
 'Lighthouse' 'Cave' 'Nature lodge' 'Bus' 'Igloo' 'Vacation home' 'Ti

In [64]:
X.head()

Unnamed: 0,require_guest_profile_picture,minimum_nights,bathrooms,transit_len,maximum_nights,host_about_len,accommodates,property_type,room_type,interaction_len,notes_len,instant_bookable,bed_type,access_len,beds_str,require_guest_phone_verification,is_business_travel_ready,bedrooms_str,cancellation_policy,bedrooms,metro_area,beds,house_rules_len
0,f,1,1.0,378,7,3,2,House,Private room,110,3,f,Real Bed,186,1,f,f,1,moderate,1.0,Austin,1.0,3
1,f,1,2.0,3,14,3,8,House,Entire home/apt,3,44,f,Real Bed,3,2,f,f,2,moderate,2.0,Austin,2.0,211
2,f,1,1.0,31,7,104,4,Apartment,Entire home/apt,18,3,f,Real Bed,122,2,f,f,2,moderate,2.0,Austin,2.0,3
3,f,1,1.0,3,1125,3,2,Guesthouse,Entire home/apt,3,3,t,Real Bed,3,1,f,f,0,flexible,0.0,Austin,1.0,3
4,f,2,2.0,587,1125,59,10,Bungalow,Entire home/apt,130,40,t,Real Bed,49,6,f,f,4,flexible,4.0,Austin,6.0,506


In [166]:
X_train.columns

Index(['require_guest_profile_picture', 'minimum_nights', 'bathrooms',
       'transit_len', 'maximum_nights', 'host_about_len', 'accommodates',
       'property_type', 'room_type', 'interaction_len', 'notes_len',
       'instant_bookable', 'bed_type', 'access_len',
       'require_guest_phone_verification', 'is_business_travel_ready',
       'cancellation_policy', 'bedrooms', 'metro_area', 'beds',
       'house_rules_len'],
      dtype='object')

In [148]:
X_train.bedrooms_str.value_counts()

1     101546
2      37642
3      17224
0      17143
4       7180
5       2514
6        834
7+       677
Name: bedrooms_str, dtype: int64

In [149]:
X_train.beds_str.value_counts()

1      83298
2      45899
3      22754
4      13646
5       5881
0       5093
6       3344
7       1599
10+     1467
8       1191
9        588
Name: beds_str, dtype: int64