## What effect does a Airbnb host’s bio metadata have on it’s frequency of their rentals being booked?


In [17]:
import csv
import pandas as pd
import numpy as np

import seaborn as sns


In [18]:
# import data
calendar_all = pd.read_csv('./data/calendar.csv')
listings_all = pd.read_csv('./data/listings.csv')
sentiment_scores = pd.read_csv('./data/scores.csv')
# reviews = pd.read_csv('./data/reviews.csv')


In [19]:
# do a bit of cleaning

# remove cols that only contain (1) unique elem
listings = listings_all
for col in listings_all.columns:
    if len(listings_all[col].unique()) == 1:
        listings = listings.drop(col,axis=1)

In [20]:
listings.head()

list(listings.columns.values)


['id',
 'listing_url',
 'name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'smart_location',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 '

In [21]:
# Trimmed Frame
trimmed_listings = listings[['id','host_id', 'host_response_rate', 'host_total_listings_count',
                             'host_is_superhost', 'neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates',
                             'guests_included', 'bathrooms', 'bedrooms', 'beds', 'price',
                               'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
                               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
                               'review_scores_location', 'review_scores_value']]

In [22]:
# populate review NA's with average
pd.options.mode.chained_assignment = None 
trimmed_listings['review_scores_rating'] = trimmed_listings.review_scores_rating.fillna(trimmed_listings.review_scores_rating.median())
trimmed_listings['review_scores_accuracy'] = trimmed_listings.review_scores_accuracy.fillna(trimmed_listings.review_scores_accuracy.median())
trimmed_listings['review_scores_cleanliness'] = trimmed_listings.review_scores_cleanliness.fillna(trimmed_listings.review_scores_cleanliness.median())
trimmed_listings['review_scores_checkin'] = trimmed_listings.review_scores_checkin.fillna(trimmed_listings.review_scores_checkin.median())
trimmed_listings['review_scores_communication'] = trimmed_listings.review_scores_communication.fillna(trimmed_listings.review_scores_communication.median())
trimmed_listings['review_scores_location'] = trimmed_listings.review_scores_location.fillna(trimmed_listings.review_scores_location.median())
trimmed_listings['review_scores_value'] = trimmed_listings.review_scores_value.fillna(trimmed_listings.review_scores_value.median())


In [23]:
# Add sentiment scores column
# Used Microsoft Azure Sentiment Text Analysis API through Postman requests to identify the sentiment score 
#  of listing descriptions from 1 - 100 
trimmed_listings = pd.concat([trimmed_listings, sentiment_scores], axis=1)

In [24]:
trimmed_listings['price'] = trimmed_listings['price'].replace('[\$\,\.]', '', regex=True).astype(int) / 100

In [25]:
trimmed_listings['host_response_rate'] = trimmed_listings['host_response_rate'].str.replace('%', '')
trimmed_listings['host_response_rate'] = trimmed_listings['host_response_rate'].fillna('-1')
trimmed_listings['host_response_rate'] = trimmed_listings['host_response_rate'].astype(int)
trimmed_listings['host_response_rate'] = trimmed_listings['host_response_rate'].replace(-1, np.nan)
trimmed_listings['host_response_rate'] = trimmed_listings.host_response_rate.fillna(trimmed_listings.host_response_rate.mean())

In [26]:
trimmed_listings

Unnamed: 0,id,host_id,host_response_rate,host_total_listings_count,host_is_superhost,neighbourhood_cleansed,property_type,room_type,accommodates,guests_included,...,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,id.1,score
0,241032,956883,96.000000,3.0,f,West Queen Anne,Apartment,Entire home/apt,4,2,...,207,95.0,10.0,10.0,10.0,10.0,9.0,10.0,241032,0.978213
1,953595,5177328,98.000000,6.0,t,West Queen Anne,Apartment,Entire home/apt,4,1,...,43,96.0,10.0,10.0,10.0,10.0,10.0,10.0,953595,0.990736
2,3308979,16708587,67.000000,2.0,f,West Queen Anne,House,Entire home/apt,11,10,...,20,97.0,10.0,10.0,10.0,10.0,10.0,10.0,3308979,0.958904
3,7421966,9851441,94.886798,1.0,f,West Queen Anne,Apartment,Entire home/apt,3,1,...,0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,7421966,0.840133
4,278830,1452570,100.000000,2.0,f,West Queen Anne,House,Entire home/apt,6,6,...,38,92.0,9.0,9.0,10.0,10.0,9.0,9.0,278830,0.989110
5,5956968,326758,94.886798,1.0,f,West Queen Anne,House,Private room,2,1,...,17,95.0,10.0,10.0,10.0,10.0,10.0,10.0,5956968,0.975429
6,1909058,2497928,100.000000,1.0,t,West Queen Anne,House,Private room,2,1,...,58,99.0,10.0,10.0,10.0,10.0,10.0,10.0,1909058,0.919985
7,856550,4016632,100.000000,5.0,t,West Queen Anne,Cabin,Private room,2,1,...,173,97.0,10.0,10.0,10.0,10.0,9.0,10.0,856550,0.500000
8,4948745,2166277,94.886798,1.0,f,West Queen Anne,Apartment,Private room,2,1,...,8,97.0,10.0,9.0,10.0,9.0,10.0,10.0,4948745,0.819616
9,2493658,5177328,98.000000,6.0,t,West Queen Anne,Apartment,Entire home/apt,4,1,...,32,97.0,10.0,10.0,10.0,10.0,10.0,9.0,2493658,0.996935
