In [34]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pandas_profiling
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#Get the latest listings.csv.gz from http://insideairbnb.com/get-the-data.html
#Then convert it to pandas dataframe

df_london = pd.read_csv("London Airbnb Listing Detailed.csv")
df_london.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,11551.0,https://www.airbnb.com/rooms/11551,20190910000000.0,2019-09-15,Arty and Bright London Apartment in Zone 2,Unlike most rental apartments out there my fla...,"Amenities Bedding: 1 Double bed, 1 living room...",Unlike most rental apartments out there my fla...,family,Not even 10 minutes by metro from Victoria Sta...,...,t,f,strict_14_with_grace_period,f,t,2.0,2.0,0.0,0.0,1.59
1,13913.0,https://www.airbnb.com/rooms/13913,20190910000000.0,2019-09-15,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,"Hello Everyone, I'm offering my lovely double ...",My bright double bedroom with a large window h...,business,Finsbury Park is a friendly melting pot commun...,...,f,f,moderate,f,f,3.0,1.0,2.0,0.0,0.14
2,15400.0,https://www.airbnb.com/rooms/15400,20190910000000.0,2019-09-15,Bright Chelsea Apartment. Chelsea!,Lots of windows and light. St Luke's Gardens ...,Bright Chelsea Apartment This is a bright one...,Lots of windows and light. St Luke's Gardens ...,romantic,It is Chelsea.,...,t,f,strict_14_with_grace_period,t,t,1.0,1.0,0.0,0.0,0.73


In [3]:
df_london.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85274 entries, 0 to 85273
Columns: 106 entries, id to reviews_per_month
dtypes: float64(35), object(71)
memory usage: 69.0+ MB


In [None]:
#Run a profiling report for initial EDA

# london_report = df_london.profile_report(title="London Airbnb Listing Detailed Profiling Report")
# london_report.to_file(output_file="londondetailed.html")

In [None]:
#london_report

In [4]:
#Delete highly correlated or similar columns and those with only null values based on the Profiling Report 
#Delete columns which will not be used modeling

columns_to_delete = ["calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms","calculated_host_listings_count_shared_rooms", 
                     "host_acceptance_rate", "host_listings_count", "last_scraped", "maximum_maximum_nights", "medium_url", "minimum_maximum_nights", "host_neighbourhood",
                     "minimum_minimum_nights", "neighbourhood_group_cleansed", "scrape_id", "square_feet", "thumbnail_url", "xl_picture_url", "listing_url", "name",
                     "summary", "space", "description", "neighborhood_overview", "notes", "transit", "access", "interaction", "house_rules", "picture_url", 
                     "host_id", "host_url", "host_name", "host_location", "host_about", "host_response_time", "host_thumbnail_url", "host_picture_url", "street", "state",
                     "zipcode", "market", "smart_location", "country_code", "country", "calendar_updated", "has_availability", "license", 
                     "jurisdiction_names", "maximum_minimum_nights", "availability_60", "availability_90", "availability_30"]

df_londonairbnb = df_london.drop(columns_to_delete, axis = 1)
df_londonairbnb.dropna(subset = ["id"], axis = 0, inplace = True) #all disctint listings except for one null value
df_londonairbnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85273 entries, 0 to 85273
Data columns (total 55 columns):
id                                  85273 non-null float64
experiences_offered                 85273 non-null object
host_since                          85257 non-null object
host_response_rate                  60451 non-null object
host_is_superhost                   85256 non-null object
host_total_listings_count           85256 non-null object
host_verifications                  85272 non-null object
host_has_profile_pic                85256 non-null object
host_identity_verified              85256 non-null object
neighbourhood                       85272 non-null object
neighbourhood_cleansed              85272 non-null object
city                                85094 non-null object
latitude                            85272 non-null float64
longitude                           85272 non-null float64
is_location_exact                   85272 non-null object
property_type     

In [None]:
#Run another profiling report after deleting columns

# london_report2 = df_londonairbnb.profile_report(title="London Airbnb Listing Detailed Profiling Report (2)")
# london_report2.to_file(output_file="londondetailed2.html")

In [None]:
#london_report2

In [5]:
df_londonairbnb.select_dtypes("float64").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85273 entries, 0 to 85273
Data columns (total 22 columns):
id                                85273 non-null float64
latitude                          85272 non-null float64
longitude                         85272 non-null float64
accommodates                      85272 non-null float64
bathrooms                         85145 non-null float64
bedrooms                          85230 non-null float64
beds                              85119 non-null float64
guests_included                   85272 non-null float64
minimum_nights                    85272 non-null float64
maximum_nights                    85272 non-null float64
availability_365                  85272 non-null float64
number_of_reviews                 85272 non-null float64
number_of_reviews_ltm             85272 non-null float64
review_scores_rating              62590 non-null float64
review_scores_accuracy            62539 non-null float64
review_scores_cleanliness         62

In [6]:
#Delete rows where column has only one nan value

nan_to_delete = ["latitude", "longitude", "guests_included", "minimum_nights", "maximum_nights", "availability_365", "calculated_host_listings_count"]

df_londonairbnb.dropna(subset = nan_to_delete, axis = 0, inplace = True)
df_londonairbnb.shape

(85272, 55)

In [7]:
#Deal with null and zero values of column dtype = float

df_londonairbnb.select_dtypes("float64").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85272 entries, 0 to 85273
Data columns (total 22 columns):
id                                85272 non-null float64
latitude                          85272 non-null float64
longitude                         85272 non-null float64
accommodates                      85272 non-null float64
bathrooms                         85145 non-null float64
bedrooms                          85230 non-null float64
beds                              85119 non-null float64
guests_included                   85272 non-null float64
minimum_nights                    85272 non-null float64
maximum_nights                    85272 non-null float64
availability_365                  85272 non-null float64
number_of_reviews                 85272 non-null float64
number_of_reviews_ltm             85272 non-null float64
review_scores_rating              62590 non-null float64
review_scores_accuracy            62539 non-null float64
review_scores_cleanliness         62

In [8]:
def check_zero_nan(col, replace = "mean"):
    print("column name = ", col.name)
    print("null values = ", col.isna().sum())
    print("zeroes = ", col[col == 0].count())
    display(col.value_counts().sort_values(ascending = False).head(10))
    test = col.copy()
    pre = test.mean()
    print("mean (pre) = " , pre)
    if replace == "mean":
        test.fillna(value = pre, inplace = True)
        test.map(lambda x: pre if x == 0 else x)
        post = test.mean()
        print("after replacing nan and 0 using the average")
        print("mean (post) = ", test.mean())
        print("mean diff = ", round(pre-post, 3), "\n")
    elif replace == "mode": 
        test.fillna(value = test.mode(), inplace = True)
        test.map(lambda x: test.mode() if x == 0 else x)
        post = test.mean()
        print("after replacing nan and 0 using the mode")
        print("mean (post) = ", test.mean())
        print("mean diff = ", round(pre-post, 3), "\n")

In [None]:
#Check which proxy to be used: mean vs mode

zeroes_nan_columns = ["bathrooms", "bedrooms", "beds", "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_communication",
                      "review_scores_checkin", "review_scores_location", "review_scores_value", "reviews_per_month"]

for col in zeroes_nan_columns:
    check_zero_nan(df_londonairbnb[col], replace = "mean")
    check_zero_nan(df_londonairbnb[col], replace = "mode")

In [9]:
# zeroes_nan_columns = ["bathrooms", "bedrooms", "beds", "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_communication",
#                       "review_scores_checkin", "review_scores_location", "review_scores_value", "reviews_per_month"]

#bathrooms proxy 1 (mode) for null and zeroes, 
#bedrooms proxy 0 for null and zeroes, assume studio type
#bed proxy 1 (mode) for null and zeroes
#reviews proxy 0 for null

df_londonairbnb.bathrooms = df_londonairbnb.bathrooms.map(lambda x: 1 if (x == 0) else x)
df_londonairbnb.bathrooms.fillna(value = 1, inplace = True)
df_londonairbnb.bedrooms.fillna(value = 0, inplace = True)
df_londonairbnb.beds = df_londonairbnb.beds.map(lambda x: 1 if (x == 0) else x)
df_londonairbnb.beds.fillna(value = 1, inplace = True)
df_londonairbnb.review_scores_rating.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_accuracy.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_cleanliness.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_communication.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_checkin.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_location.fillna(value = 0, inplace = True)
df_londonairbnb.review_scores_value.fillna(value = 0, inplace = True)
df_londonairbnb.reviews_per_month.fillna(value = 0, inplace = True)

In [10]:
#Check if all float columns have been dealt with

df_londonairbnb.select_dtypes("float64").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85272 entries, 0 to 85273
Data columns (total 22 columns):
id                                85272 non-null float64
latitude                          85272 non-null float64
longitude                         85272 non-null float64
accommodates                      85272 non-null float64
bathrooms                         85272 non-null float64
bedrooms                          85272 non-null float64
beds                              85272 non-null float64
guests_included                   85272 non-null float64
minimum_nights                    85272 non-null float64
maximum_nights                    85272 non-null float64
availability_365                  85272 non-null float64
number_of_reviews                 85272 non-null float64
number_of_reviews_ltm             85272 non-null float64
review_scores_rating              85272 non-null float64
review_scores_accuracy            85272 non-null float64
review_scores_cleanliness         85

In [11]:
#Deal with null and zero values of column dtype = object

df_londonairbnb.select_dtypes("object").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85272 entries, 0 to 85273
Data columns (total 33 columns):
experiences_offered                 85272 non-null object
host_since                          85256 non-null object
host_response_rate                  60451 non-null object
host_is_superhost                   85256 non-null object
host_total_listings_count           85256 non-null object
host_verifications                  85272 non-null object
host_has_profile_pic                85256 non-null object
host_identity_verified              85256 non-null object
neighbourhood                       85272 non-null object
neighbourhood_cleansed              85272 non-null object
city                                85094 non-null object
is_location_exact                   85272 non-null object
property_type                       85272 non-null object
room_type                           85272 non-null object
bed_type                            85272 non-null object
amenities            

In [12]:
#Delete rows where column has "few" nan values

nan_to_delete = ["host_since", "host_is_superhost", "host_total_listings_count", "host_has_profile_pic", "host_identity_verified"]

df_londonairbnb.dropna(subset = nan_to_delete, axis = 0, inplace = True)
df_londonairbnb.select_dtypes("object").isna().sum()

experiences_offered                     0
host_since                              0
host_response_rate                  24805
host_is_superhost                       0
host_total_listings_count               0
host_verifications                      0
host_has_profile_pic                    0
host_identity_verified                  0
neighbourhood                           0
neighbourhood_cleansed                  0
city                                  178
is_location_exact                       0
property_type                           0
room_type                               0
bed_type                                0
amenities                               0
price                                   0
weekly_price                        78070
monthly_price                       80131
security_deposit                    29640
cleaning_fee                        21520
extra_people                            0
minimum_nights_avg_ntm                  0
maximum_nights_avg_ntm            

In [None]:
for col in list(df_londonairbnb.select_dtypes("object").columns):
    print(col)
    display(df_londonairbnb[col].value_counts().sort_values(ascending = False).head(10))
    print("null values = ", df_londonairbnb[col].isna().sum())
    print("\n")

In [13]:
df_londonairbnb.host_is_superhost = df_londonairbnb.host_is_superhost.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.host_has_profile_pic = df_londonairbnb.host_has_profile_pic.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.host_identity_verified = df_londonairbnb.host_identity_verified.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.is_location_exact = df_londonairbnb.is_location_exact.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.require_guest_phone_verification = df_londonairbnb.require_guest_phone_verification.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.require_guest_profile_picture = df_londonairbnb.require_guest_profile_picture.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.drop(columns = ["is_business_travel_ready"], axis = 1, inplace = True)
df_londonairbnb.instant_bookable = df_londonairbnb.instant_bookable.map(lambda x: 1 if x == "t" else 0).astype("int64")
df_londonairbnb.requires_license = df_londonairbnb.requires_license.map(lambda x: 1 if x == "t" else 0).astype("int64")

In [14]:
df_londonairbnb.host_response_rate.fillna(value = "0%", inplace = True)
df_londonairbnb.host_response_rate = df_londonairbnb.host_response_rate.map(lambda x: str(x).replace("%", "")).astype("int64")

df_londonairbnb.price = df_londonairbnb.price.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.price = df_londonairbnb.price.astype("float64")

df_londonairbnb.security_deposit.fillna(value = "$0", inplace = True)
df_londonairbnb.security_deposit = df_londonairbnb.security_deposit.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.security_deposit = df_londonairbnb.security_deposit.astype("float64")

df_londonairbnb.cleaning_fee.fillna(value = "$0", inplace = True)
df_londonairbnb.cleaning_fee = df_londonairbnb.cleaning_fee.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.cleaning_fee = df_londonairbnb.cleaning_fee.astype("float64")

df_londonairbnb.extra_people.fillna(value = "$0", inplace = True)
df_londonairbnb.extra_people = df_londonairbnb.extra_people.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.extra_people = df_londonairbnb.extra_people.astype("float64")

df_londonairbnb.weekly_price.fillna(value = "$0", inplace = True)
df_londonairbnb.weekly_price = df_londonairbnb.weekly_price.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.weekly_price = df_londonairbnb.weekly_price.astype("float64")

df_londonairbnb.monthly_price.fillna(value = "$0", inplace = True)
df_londonairbnb.monthly_price = df_londonairbnb.monthly_price.map(lambda x: str(x).replace("$", "").replace(".00", "").replace(",", ""))
df_londonairbnb.monthly_price = df_londonairbnb.monthly_price.astype("float64")

df_londonairbnb.host_total_listings_count = df_londonairbnb.host_total_listings_count.map(lambda x: str(x).replace(".0", "").replace(",", ""))
df_londonairbnb.host_total_listings_count = df_londonairbnb.host_total_listings_count.astype("int64")

In [37]:
df_londonairbnb['host_days'] = pd.to_datetime(df_londonairbnb['calendar_last_scraped']) - pd.to_datetime(df_londonairbnb['host_since'])
df_londonairbnb["days_from_last_review"] = pd.to_datetime(df_londonairbnb['calendar_last_scraped']) - pd.to_datetime(df_london['last_review'])

df_londonairbnb['host_days'] = df_londonairbnb['host_days']/ np.timedelta64(1, 'D')
df_londonairbnb["days_from_last_review"] = df_londonairbnb["days_from_last_review"] / np.timedelta64(1, 'D')
df_londonairbnb["days_from_last_review"].fillna(value = 0, inplace = True)

In [38]:
df_londonairbnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85256 entries, 0 to 85273
Data columns (total 56 columns):
id                                  85256 non-null float64
experiences_offered                 85256 non-null object
host_since                          85256 non-null object
host_response_rate                  85256 non-null int64
host_is_superhost                   85256 non-null int64
host_total_listings_count           85256 non-null int64
host_verifications                  85256 non-null object
host_has_profile_pic                85256 non-null int64
host_identity_verified              85256 non-null int64
neighbourhood                       85256 non-null object
neighbourhood_cleansed              85256 non-null object
city                                85078 non-null object
latitude                            85256 non-null float64
longitude                           85256 non-null float64
is_location_exact                   85256 non-null int64
property_type           

In [16]:
def preprocess(X, y):
    '''Takes in features and target and implements all preprocessing steps for categorical and continuous features returning 
    train and test dataframes with targets'''
    
    #train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, train_size = 0.7)
    
    # remove "object"-type features and SalesPrice from `X`
    X_train_cf = X_train[[column for column in X.columns if X[column].dtype != "object"]]
    X_test_cf = X_test[[column for column in X.columns if X[column].dtype != "object"]]
    
    # Scale the train and test data
    stdscaler = StandardScaler()
    stdscaler.fit(X_train_cf)
    
    X_train_scaled = pd.DataFrame(data = stdscaler.transform(X_train_cf), columns = X_train_cf.columns)
    X_test_scaled = pd.DataFrame(data = stdscaler.transform(X_test_cf), columns = X_test_cf.columns)
    
    # Create X_cat which contains only the categorical variables
    X_train_cat = X_train[[column for column in X.columns if X[column].dtype == "object"]]
    X_test_cat = X_test[[column for column in X.columns if X[column].dtype == "object"]]
    

    #Fill nans with a value indicating that that it is missing
#     X_train_cat.fillna(value = "NULL", inplace = True)
#     X_test_cat.fillna(value = "NULL", inplace = True)

    # OneHotEncode Categorical variables
    enc = OneHotEncoder(handle_unknown='ignore', dtype = "int64")
    enc.fit(X_train_cat)
    X_train_enc = enc.transform(X_train_cat)
    X_test_enc = enc.transform(X_test_cat)
    columns = enc.get_feature_names(input_features=X_train_cat.columns)
    X_train_enc = pd.DataFrame(X_train_enc.todense(), columns=columns)
    X_test_enc = pd.DataFrame(X_test_enc.todense(), columns=columns)
    
    # combine categorical and continuous features into the final dataframe
    X_train_all = pd.concat([X_train_scaled, X_train_enc], axis = 1)
    X_test_all = pd.concat([X_test_scaled, X_test_enc], axis = 1)
    
    return X_train_all, X_test_all, y_train, y_test

In [None]:
df_londonairbnb.columns

In [78]:
X = df_londonairbnb[['experiences_offered', 'host_response_rate',
       'host_is_superhost', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'number_of_reviews', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'requires_license', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'calculated_host_listings_count',
       'reviews_per_month', 'host_days', 'days_from_last_review']]

y = df_londonairbnb[["price"]]

df_all = df_londonairbnb[['price','experiences_offered', 'host_response_rate',
       'host_is_superhost', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'number_of_reviews', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'requires_license', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'calculated_host_listings_count',
       'reviews_per_month', 'host_days', 'days_from_last_review']]

In [None]:
# categorical=['experiences_offered','neighbourhood_cleansed','property_type','room_type', 'bed_type', 'cancellation_policy']
# cat_df = df_londonairbnb[categorical]
# cat_df = pd.get_dummies(cat_df)
# cat_df

In [82]:
X_train, X_test, y_train, y_test = preprocess(X, y)
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print('Training r^2:', linreg.score(X_train, y_train))
print('Testing r^2:', linreg.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, linreg.predict(X_test)))

Training r^2: 0.06277101449466715
Testing r^2: 0.07056649983296293
Training MSE: 188942.18440772835
Testing MSE: 174557.6513772832


In [75]:
df_londonairbnb.corr().unstack().sort_values(ascending = False).drop_duplicates().head(40)

days_from_last_review             days_from_last_review             1.000000
review_scores_checkin             review_scores_communication       0.989415
review_scores_rating              review_scores_accuracy            0.989381
review_scores_value               review_scores_rating              0.988025
review_scores_rating              review_scores_communication       0.986667
review_scores_communication       review_scores_accuracy            0.986377
review_scores_accuracy            review_scores_value               0.986191
review_scores_cleanliness         review_scores_rating              0.985922
review_scores_accuracy            review_scores_checkin             0.984809
review_scores_value               review_scores_communication       0.983659
review_scores_checkin             review_scores_rating              0.983085
                                  review_scores_value               0.982960
review_scores_cleanliness         review_scores_accuracy            0.982585

In [70]:
df_londonairbnb.corr().unstack().sort_values(ascending = False).drop_duplicates().tail(2)

host_response_rate  days_from_last_review   -0.409722
id                  host_days               -0.509170
dtype: float64

In [76]:
df_londonairbnb.columns

Index(['id', 'experiences_offered', 'host_since', 'host_response_rate',
       'host_is_superhost', 'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'city', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_365',
       'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm',
       'first_review', 'last_review', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'requir

In [81]:
df_all.corr()["price"].sort_values(ascending = False)

price                               1.000000
accommodates                        0.175490
bedrooms                            0.168826
cleaning_fee                        0.162196
guests_included                     0.149319
bathrooms                           0.145006
beds                                0.140552
security_deposit                    0.098222
host_total_listings_count           0.057752
calculated_host_listings_count      0.055434
availability_365                    0.044718
host_response_rate                  0.034877
instant_bookable                    0.033189
minimum_nights                      0.018846
extra_people                        0.014482
requires_license                    0.003047
host_has_profile_pic                0.000360
require_guest_phone_verification   -0.000785
maximum_nights                     -0.000785
require_guest_profile_picture      -0.007592
host_is_superhost                  -0.012426
is_location_exact                  -0.013477
host_ident

In [88]:
X_train, X_test, y_train, y_test = preprocess(X, y)

In [93]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 8)
lasso.fit(X_train, y_train)

print('Training r^2:', lasso.score(X_train, y_train))
print('Testing r^2:', lasso.score(X_test, y_test))
print('Training MSE:', mean_squared_error(y_train, lasso.predict(X_train)))
print('Testing MSE:', mean_squared_error(y_test, lasso.predict(X_test)))

Training r^2: 0.04598783245844529
Testing r^2: 0.057399034483212374
Training MSE: 192325.61697786654
Testing MSE: 177030.64360925154


In [95]:
train_mse = []
test_mse = []
alphas = []
train_R2 = []
test_R2 = []

for alpha in np.linspace(0, 40, num=50):
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    
    train_preds = lasso.predict(X_train)
    train_R2.append(lasso.score(X_train, y_train))
    train_mse.append(mean_squared_error(y_train, train_preds))
    
    test_preds = lasso.predict(X_test)
    test_R2.append(lasso.score(X_test, y_test))
    test_mse.append(mean_squared_error(y_test, test_preds))
    
    alphas.append(alpha)

In [99]:
df_alpha = pd.DataFrame({"alpha":alphas, "Training_r^2": train_R2, "MSE_train": train_mse, "Testing_r^2": test_R2, "MSE_test": test_mse})
df_alpha.sort_values(by="Training_r^2", ascending = False)

Unnamed: 0,alpha,Training_r^2,MSE_train,Testing_r^2,MSE_test
0,0.0,0.062771,188942.184409,0.070567,174557.592902
1,0.816327,0.056509,190204.605039,0.067255,175179.661949
2,1.632653,0.055293,190449.657961,0.066155,175386.1709
3,2.44898,0.054071,190696.000182,0.064883,175625.113629
4,3.265306,0.052777,190956.88473,0.063519,175881.205798
5,4.081633,0.051398,191234.922929,0.062231,176123.205482
6,4.897959,0.049919,191533.076625,0.060865,176379.642879
7,5.714286,0.048912,191736.01498,0.060023,176537.839346
8,6.530612,0.047958,191928.468728,0.05919,176694.326399
9,7.346939,0.046876,192146.511816,0.058222,176876.166814


In [102]:
df_londonairbnb.to_csv("londonairbnb_cleaned.csv", index = False)