## Data cleaning / exploration

In [266]:
import csv
import pandas as pd
import numpy as np

In [267]:
# Read the raw CSV inputs, one frame per file (names are bound in the same order as the paths).
# The reviews file (./data/reviews.csv) is deliberately not loaded: it is not needed at the moment.
calendar_all, listings_all, bio_sentiment_scores, desc_sentiment_scores = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/calendar.csv',
        './data/listings.csv',
        './data/bio-scores.csv',
        './data/desc-scores.csv',
    )
)

In [268]:
# Basic cleaning: drop every column that holds a single distinct value
# (NaN counts as a value), since such a column carries no information.
constant_columns = [
    column
    for column in listings_all.columns
    if listings_all[column].nunique(dropna=False) == 1
]

# A single drop() call always returns a new frame, so `listings` never aliases
# `listings_all` -- later in-place edits to `listings` leave the raw data intact,
# even when there is nothing to drop.
listings = listings_all.drop(columns=constant_columns)

In [269]:
# Encode host_is_superhost as 0/1 ('f' -> 0, 't' -> 1) so it can be used directly
# as a binary feature; any other value (including NaN) is left as it is.
listings['host_is_superhost'] = listings['host_is_superhost'].replace({'f': 0, 't': 1})

# View all features. This must be the last expression of the cell,
# otherwise the notebook silently discards the result.
list(listings.columns.values)

In [270]:
# Keep only the features that might be good predictors.
feature_columns = [
    'id', 'host_id', 'host_response_rate', 'host_is_superhost',
    'neighbourhood_group_cleansed', 'property_type', 'room_type',
    'accommodates', 'guests_included', 'bathrooms', 'bedrooms', 'beds', 'price',
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
]

# Trimmed frame. The explicit .copy() makes it an independent frame, so assigning
# to its columns later is unambiguous and never refers back to `listings`.
trimmed_listings = listings[feature_columns].copy()

In [271]:
# Fill missing review scores with the median of their own column.

# Turn off pandas' chained-assignment (SettingWithCopy) warning. This is a global
# option, so it also applies to every cell executed after this one.
pd.options.mode.chained_assignment = None

review_score_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
for score_column in review_score_columns:
    column_median = trimmed_listings[score_column].median()
    trimmed_listings[score_column] = trimmed_listings[score_column].fillna(column_median)

In [272]:
# Give the neighbourhood column a shorter name.
# index=str is kept from the original cell; it turns the row labels into strings.
trimmed_listings = trimmed_listings.rename(
    index=str,
    columns={"neighbourhood_group_cleansed": "neighbourhood"},
)

# Parse the price strings (presumably formatted like '$1,000.00' -- TODO confirm)
# into float dollar amounts. Only the dollar sign and the thousands separators are
# stripped; the decimal point is kept, so a price written without exactly two
# decimals (e.g. '$85') is still parsed at the right scale.
trimmed_listings['price'] = (
    trimmed_listings['price']
    .replace(r'[$,]', '', regex=True)
    .astype(float)
)

In [273]:
# Turn percentage strings such as '96%' into numbers, then fill the missing
# rates with the mean of the known ones.
# Missing values stay NaN throughout the parse, so no sentinel value (which could
# collide with real data) is needed, and fractional rates parse as well.
response_rate = pd.to_numeric(trimmed_listings['host_response_rate'].str.rstrip('%'))
trimmed_listings['host_response_rate'] = response_rate.fillna(response_rate.mean())

In [274]:
# Discard the rows that lack a value in any of the superhost flag,
# bed, bathroom or bedroom columns.
trimmed_listings = trimmed_listings.dropna(
    subset=['host_is_superhost', 'beds', 'bathrooms', 'bedrooms']
)

In [275]:
# Keep only the listings whose property type is one of the frequent ones below;
# every other (infrequent) property type is removed.
frequent_property_types = [
    'Apartment',
    'House',
    'Condominium',
    'Townhouse',
    'Loft',
    'Bed & Breakfast',
]
is_frequent_type = trimmed_listings['property_type'].isin(frequent_property_types)
trimmed_listings = trimmed_listings[is_frequent_type]

In [276]:
# Listing descriptions were scored with the Microsoft Azure Sentiment Text Analysis
# API (requests sent through Postman); per the author the score runs from 1 - 100.

# Index both frames by listing id, attach the description sentiment score
# (left join: every listing is kept), and give the score column a descriptive name.
# index=str also converts the id index labels to strings.
trimmed_listings = (
    trimmed_listings
    .set_index('id')
    .join(desc_sentiment_scores.set_index('id'))
    .rename(index=str, columns={"score": "description_score"})
)

# Keep a handle on the frame before dummy variables are added.
# Note: this binds a second name to the same object; no copy is made.
cleaned_base = trimmed_listings


In [277]:
# Cast the listing-id index to int (presumably so it matches the ids used by the
# bio scores in the join below -- TODO confirm). The assignment changes the frame
# object itself, so every name bound to that object sees the new index.
trimmed_listings.index = trimmed_listings.index.astype(int)

# Drop bio-score rows that contain missing values.
bio_sentiment_scores = bio_sentiment_scores.dropna()

# Join the bio scores on id and give the score column a descriptive name.
# how='right' keeps every remaining bio-score row; an id with no match in
# trimmed_listings ends up with NaN in all listing columns.
# NOTE(review): confirm that a right join (rather than an inner join) is intended.
trimmed_listings_with_bio_scores = (
    trimmed_listings
    .join(bio_sentiment_scores.set_index('id'), how='right')
    .rename(index=str, columns={"documents__score": "bio_score"})
)



In [278]:
# trimmed_listings_with_bio_scores.bio_score
# bio_sentiment_scores.dt

In [279]:
# One-hot encode the categorical columns of both frames.
# Each frame is encoded on its own, in the order the names appear on the left.
trimmed_listings, trimmed_listings_with_bio_scores = (
    pd.get_dummies(frame)
    for frame in (trimmed_listings, trimmed_listings_with_bio_scores)
)

In [280]:
# Rename the dummy columns whose names contain spaces (or '&' / '/').
# The mapping is defined once and applied to both frames so they cannot drift apart.
dummy_column_renames = {
    'neighbourhood_Beacon Hill': 'neighbourhood_Beacon_Hill',
    'neighbourhood_Capitol Hill': 'neighbourhood_Capitol_Hill',
    'neighbourhood_Central Area': 'neighbourhood_Central_Area',
    'neighbourhood_Lake City': 'neighbourhood_Lake_City',
    'neighbourhood_Other neighborhoods': 'neighbourhood_Other_neighborhoods',
    'neighbourhood_Queen Anne': 'neighbourhood_Queen_Anne',
    'neighbourhood_Rainier Valley': 'neighbourhood_Rainier_Valley',
    'neighbourhood_Seward Park': 'neighbourhood_Seward_Park',
    'neighbourhood_University District': 'neighbourhood_University_District',
    'neighbourhood_West Seattle': 'neighbourhood_West_Seattle',
    'property_type_Bed & Breakfast': 'property_type_Bed_Breakfast',
    'room_type_Entire home/apt': 'room_type_Entire_home_apt',
    'room_type_Private room': 'room_type_Private_room',
    'room_type_Shared room': 'room_type_Shared_room',
}

# Columns missing from a frame are simply skipped by rename().
# index=str is kept from the original cell; it converts the index labels to strings.
trimmed_listings = trimmed_listings.rename(index=str, columns=dummy_column_renames)
trimmed_listings_with_bio_scores = trimmed_listings_with_bio_scores.rename(
    index=str, columns=dummy_column_renames
)

In [281]:
# Write the three cleaned frames to disk: the dummy-encoded listings, the
# dummy-encoded listings joined with bio scores, and the frame kept before encoding.
for frame, csv_path in (
    (trimmed_listings, "./data/clean/cleaned-listings-dummies.csv"),
    (trimmed_listings_with_bio_scores, "./data/clean/cleaned-listings-dummies-bio.csv"),
    (cleaned_base, "./data/clean/cleaned-listings-base.csv"),
):
    frame.to_csv(csv_path)