In [1]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math
import pylab
import scipy.stats as stats
import statsmodels.api as sm
#from textblob import TextBlob
%matplotlib inline

# import some additional libraries from sklearn
from sklearn.model_selection import train_test_split 
from sklearn import datasets, linear_model, metrics 
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error

# Documentation for SKLearn Linear Regression: 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [9]:
# SAN FRANCISCO and NYC DATA
data_sf = pd.read_csv("x+income+commute+sentiment+poverty_sf.csv", low_memory=False)
data_nyc = pd.read_csv("x+income+commute+sentiment+poverty.csv", low_memory=False)

# data_sf = pd.read_csv("ny_listings_full_clean.csv", low_memory=False)
# data_nyc = pd.read_csv("sf_listings_full_clean.csv", low_memory=False)

In [10]:
y_sf = np.log( data_sf['price'].copy() )
y_nyc = np.log( data_nyc['price'].copy())

In [7]:

# encode SF categorical variables
property_dummies = pd.get_dummies(data_sf['property_type'])
room_dummies = pd.get_dummies(data_sf['room_type'])
bed_dummies = pd.get_dummies(data_sf['bed_type'])
ratings_scores_dummies = pd.get_dummies(data_sf['review_scores_rating'])

# replace the old columns with our new one-hot encoded ones
data_sf = pd.concat((data_sf.drop([\
    'property_type', 'room_type', 'bed_type', 'review_scores_rating'], axis=1), \
    property_dummies.astype(int), \
    room_dummies.astype(int), bed_dummies.astype(int), ratings_scores_dummies.astype(int)), \
    axis=1)

# do same for NYC
zipcode_dummies = pd.get_dummies(data_nyc['zipcode'])
property_dummies = pd.get_dummies(data_nyc['property_type'])
room_dummies = pd.get_dummies(data_nyc['room_type'])
bed_dummies = pd.get_dummies(data_nyc['bed_type'])
ratings_scores_dummies = pd.get_dummies(data_nyc['review_scores_rating'])
# replace the old columns with our new one-hot encoded ones
data_nyc = pd.concat((data_nyc.drop(['zipcode',\
    'property_type', 'room_type', 'bed_type', 'review_scores_rating'], axis=1), \
    zipcode_dummies.astype(int),property_dummies.astype(int), \
    room_dummies.astype(int), bed_dummies.astype(int), ratings_scores_dummies.astype(int)), \
    axis=1)

# change "t" and "f" so it reads 1 and 0
data_sf[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']] = (data_sf[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']] == 't').astype(int)
data_nyc[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']] = (data_nyc[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']] == 't').astype(int)


KeyError: 'property_type'

In [3]:
# commute_time_mins, sentiment_score, poverty_percent

# INCLUDING the extra three columns
# data_sf = data_sf[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', \
#        'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit', 
#        'cleaning_fee', 'guests_included', 'number_of_reviews', 'Aparthotel',\
#        'Apartment', 'Bed and breakfast', 'Boutique hotel', 'Bungalow', 'Cabin',\
#        'Castle', 'Condominium', 'Cottage', 'Dome house', 'Earth house',\
#        'Guest suite', 'Guesthouse', 'Hostel', 'Hotel', 'House', 'Loft',\
#        'Other', 'Resort', 'Serviced apartment', 'Tiny house', 'Townhouse',\
#        'Villa', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',\
#        'Airbed', 'Couch', 'Futon', 'Real Bed', 'commute_time_mins','sentiment_score','poverty_percent']]

data_sf.drop(['host_since'], axis=1)
# Drop last three cols
data_nyc = data_nyc.drop(['host_since', 'avg_income', 'commute_time_mins', 'sentiment_score','poverty_percent'], axis=1)
# Retain last three cols
#data_nyc = data_nyc.drop(['host_since', 'avg_income'], axis=1)

# Uncomment this if testing on SF
# data_nyc = data_nyc[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', \
#        'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit', 
#        'cleaning_fee', 'guests_included', 'number_of_reviews', 'Aparthotel',\
#        'Apartment', 'Bed and breakfast', 'Boutique hotel', 'Bungalow', 'Cabin',\
#        'Castle', 'Condominium', 'Cottage', 'Dome house', 'Earth house',\
#        'Guest suite', 'Guesthouse', 'Hostel', 'Hotel', 'House', 'Loft',\
#        'Other', 'Resort', 'Serviced apartment', 'Tiny house', 'Townhouse',\
#        'Villa', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',\
#        'Airbed', 'Couch', 'Futon', 'Real Bed', 'commute_time_mins','sentiment_score','poverty_percent']]

# EXCLUDING those three columns (comment out if you want to include them)
# data_sf = data_sf[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', \
#        'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit', 
#        'cleaning_fee', 'guests_included', 'number_of_reviews', 'Aparthotel',\
#        'Apartment', 'Bed and breakfast', 'Boutique hotel', 'Bungalow', 'Cabin',\
#        'Castle', 'Condominium', 'Cottage', 'Dome house', 'Earth house',\
#        'Guest suite', 'Guesthouse', 'Hostel', 'Hotel', 'House', 'Loft',\
#        'Other', 'Resort', 'Serviced apartment', 'Tiny house', 'Townhouse',\
#        'Villa', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',\
#        'Airbed', 'Couch', 'Futon', 'Real Bed']]

# data_nyc = data_nyc[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', \
#        'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit', 
#        'cleaning_fee', 'guests_included', 'number_of_reviews', 'Aparthotel',\
#        'Apartment', 'Bed and breakfast', 'Boutique hotel', 'Bungalow', 'Cabin',\
#        'Castle', 'Condominium', 'Cottage', 'Dome house', 'Earth house',\
#        'Guest suite', 'Guesthouse', 'Hostel', 'Hotel', 'House', 'Loft',\
#        'Other', 'Resort', 'Serviced apartment', 'Tiny house', 'Townhouse',\
#        'Villa', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',\
#        'Airbed', 'Couch', 'Futon', 'Real Bed']]

print("Len of NYC is: ", len(data_nyc))
print("Len of SF is: ", len(data_sf))


Len of NYC is:  36760
Len of SF is:  6293


In [5]:
# Split into train and test (split off 30% for test set)
percent_test = 0.3
X_train_sf, X_test_sf, y_train_sf, y_test_sf = train_test_split(data_sf, y_sf, test_size=percent_test, 
                                                    random_state=1) 

# Split into train and test (split off 30% for test set)
percent_test = 0.3
X_train, X_test, y_train, y_test = train_test_split(data_nyc, y_nyc, test_size=percent_test, 
                                                    random_state=1) 

# create linear regression object
reg = linear_model.LinearRegression() 

# Train on NYC data
print("X_test len ", len(X_test.columns))
reg.fit(X_train, y_train)

# Predict on SF data
predictions_sf = reg.predict(X_test_sf)

# Predict on NYC data
predictions = reg.predict(X_test)

# NYC
print("Mean Square Error: ", mean_squared_error(y_test, predictions))
print("Mean Absolute Error: ", mean_absolute_error(y_test, predictions))
print("R^2 Error: ", r2_score(y_test, predictions))

# SF
print("Mean Square Error: ", mean_squared_error(y_test_sf, predictions_sf))
print("Mean Absolute Error: ", mean_absolute_error(y_test_sf, predictions_sf))
print("R^2 Error: ", r2_score(y_test_sf, predictions_sf))


X_test len  245


ValueError: could not convert string to float: '6 years'

In [None]:
# plot the residual error

# setting plot style
plt.style.use('fivethirtyeight')

# plot residiual errors in training data
plt.scatter(reg.predict(X_train), reg.predict(X_train) - y_train, color='green', s=10, label='Train data')

# plot residual errors in test data
plt.scatter(reg.predict(X_test), reg.predict(X_test) - y_test, color='blue', s=10, label='Test data')

# plot line for zero residual error
plt.hlines(y=0, xmin=0, xmax=10, linewidth=2)

# plot legend
plt.legend(loc='upper right')
plt.xlabel('prediction')
plt.ylabel('error')

# plot title
plt.title("Residual errors")

# function to show plot
plt.show()