# Seattle Airbnb Machine Learning

Data exploration of Airbnb listings for Seattle

In [33]:
# imports
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np

### Data Loading

In [34]:
# Read the raw listings CSV into a DataFrame and preview its first two rows.
listings_csv = "../data/listings.csv"
listings_df = pd.read_csv(listings_csv)
listings_df.head(n=2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48


In [35]:
# Print the (rows, columns) tuple, then display every column name.
n_rows, n_cols = listings_df.shape
print((n_rows, n_cols))
listings_df.columns

(3818, 92)


Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

### Feature Engineering

In [36]:
# Keep only the listings that have every review sub-score populated.
review_score_cols = [
    'review_scores_location',
    'review_scores_value',
    'review_scores_cleanliness',
    'review_scores_communication',
    'review_scores_checkin',
    'review_scores_accuracy',
]
clean_na_df = listings_df.dropna(subset=review_score_cols)

# Shape before and after the filter, in that order.
for frame in (listings_df, clean_na_df):
    print(frame.shape)

(3818, 92)
(3158, 92)


In [37]:
# Get reviews data
# The plotted series and their legend labels are both derived from this single
# list, so a label can never be attached to the wrong series.
distplot_cols = [
    'review_scores_location', 'review_scores_value', 'review_scores_cleanliness',
    'review_scores_communication', 'review_scores_checkin', 'review_scores_accuracy',
]

hist_data = [clean_na_df[col].astype(float) for col in distplot_cols]
group_labels = list(distplot_cols)  # labels, one per series in hist_data

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=1)

fig.update_layout(
    template='plotly_dark',
    autosize=False,
    width=800,
    height=500,
    xaxis=dict(
        # Nested title.text / title.font replaces the deprecated `titlefont`
        # axis attribute.
        title=dict(text="Review Scores", font=dict(size=14)),
        tickmode="array"))

fig.show()

In [38]:
# Columns carried into the modelling frame: categorical descriptors, size
# counts, review scores, and the price column.
model_cols = [
    'neighbourhood_group_cleansed', 'host_is_superhost', 'room_type',
    'bathrooms', 'bedrooms', 'beds',
    'review_scores_rating', 'review_scores_cleanliness', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
    'price',
]

# .copy() makes ml_df an independent frame. Later cells assign into its columns
# in place; without the copy pandas emits SettingWithCopyWarning because the
# frame is a selection taken from listings_df.
ml_df = listings_df[model_cols].copy()

ml_df.head(2)

Unnamed: 0,neighbourhood_group_cleansed,host_is_superhost,room_type,bathrooms,bedrooms,beds,review_scores_rating,review_scores_cleanliness,review_scores_location,review_scores_value,reviews_per_month,price
0,Queen Anne,f,Entire home/apt,1.0,1.0,1.0,95.0,10.0,9.0,10.0,4.07,$85.00
1,Queen Anne,t,Entire home/apt,1.0,1.0,1.0,96.0,10.0,10.0,10.0,1.48,$150.00


### Data Cleaning

In [40]:
def clean_string(df, col_name, character_list):
    """ Clean string of defined characters.

    Casts ``df[col_name]`` to string and removes every occurrence of each entry
    in ``character_list`` (matched literally, not as a regular expression).
    Missing values stay missing instead of becoming the text ``'nan'``.

    Args:
        df: DataFrame holding the column. It is modified in place.
        col_name: Name of the column to clean.
        character_list: Iterable of substrings to strip from the column.

    Returns: df with column cleaned (the same object that was passed in) """
    # Remember which entries are missing before the string cast hides them.
    missing = df[col_name].isna()
    cleaned = df[col_name].astype(str)

    for char in character_list:
        # regex=False: characters such as '$' must be treated as plain text.
        cleaned = cleaned.str.replace(char, '', regex=False)

    # Put the missing values back where the cast produced placeholder text.
    df[col_name] = cleaned.mask(missing)

    return df

def col_to_dtype(df, col_name, dtype):
    """Cast a single column of df to the requested dtype.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is handed back.

    Returns: df with column changed"""
    converted = df[col_name].astype(dtype)
    df[col_name] = converted
    return df

# Remove the '$' and ',' characters from price, then cast the column to float.
ml_df = col_to_dtype(
    clean_string(ml_df, 'price', ['$', ',']),
    'price',
    'float',
)


In [41]:
# convert columns to numeric
numeric_cols = ['bathrooms', 'bedrooms', 'beds', 'review_scores_rating', 'review_scores_cleanliness',
                'review_scores_location', 'review_scores_value', 'price']

# A single astype call returns a new frame, so cleaned_ml_df is a distinct
# object rather than an alias of ml_df that is rebound on every loop iteration.
cleaned_ml_df = ml_df.astype({col: float for col in numeric_cols})

In [None]:
# Draw a seaborn pairplot of cleaned_ml_df to eyeball pairwise relationships.
# NOTE(review): this cell has no execution count, so no output is saved with
# the notebook — confirm it still runs on a fresh kernel.
sns.pairplot(cleaned_ml_df)

### One-Hot Encoding Categorical Variables

In [42]:
# One-hot encode each categorical column in turn and append the indicator
# columns to the frame; the original categorical columns are left in place.
# NOTE(review): get_dummies is called without a prefix, so indicator columns are
# named after the raw category values — confirm no two columns share a value.
for categorical_col in ('room_type', 'host_is_superhost', 'neighbourhood_group_cleansed'):
    onehot_encoding = pd.get_dummies(cleaned_ml_df[categorical_col])
    cleaned_ml_df = cleaned_ml_df.join(onehot_encoding)

cleaned_ml_df.head(3)

Unnamed: 0,neighbourhood_group_cleansed,host_is_superhost,room_type,bathrooms,bedrooms,beds,review_scores_rating,review_scores_cleanliness,review_scores_location,review_scores_value,...,Interbay,Lake City,Magnolia,Northgate,Other neighborhoods,Queen Anne,Rainier Valley,Seward Park,University District,West Seattle
0,Queen Anne,f,Entire home/apt,1.0,1.0,1.0,95.0,10.0,9.0,10.0,...,0,0,0,0,0,1,0,0,0,0
1,Queen Anne,t,Entire home/apt,1.0,1.0,1.0,96.0,10.0,10.0,10.0,...,0,0,0,0,0,1,0,0,0,0
2,Queen Anne,f,Entire home/apt,4.5,5.0,7.0,97.0,10.0,10.0,10.0,...,0,0,0,0,0,1,0,0,0,0


In [43]:
# Discard every row that still contains at least one missing value, then show
# the resulting (rows, columns).
cleaned_ml_df = cleaned_ml_df.dropna(axis=0, how='any')
cleaned_ml_df.shape

(3142, 34)

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Build features and target from cleaned_ml_df. The previous `test_df` is not
# defined anywhere in this notebook and raised NameError on a fresh kernel.
# The raw categorical columns are dropped; indicator columns joined onto the
# frame earlier stay in as features.
X = cleaned_ml_df.drop(['neighbourhood_group_cleansed', 'host_is_superhost', 'room_type', 'price'], axis=1)
y = cleaned_ml_df['price']

# Split before scaling so the held-out rows cannot influence the scaler.
# NOTE(review): test_size=0.6 holds out 60% of the rows and trains on 40% —
# confirm this ratio is intentional.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fit ordinary least squares and report the score on the held-out rows.
reg_model = LinearRegression().fit(X_train, y_train)
reg_model.score(X_test, y_test)

0.5393201892444912