In [21]:
import pandas as pd
import numpy as np

In [22]:
# Open boston data
boston_calendar = pd.read_csv("boston_airbnb_data/calendar.csv")
boston_listings = pd.read_csv("boston_airbnb_data/listings.csv")
boston_reviews = pd.read_csv("boston_airbnb_data/reviews.csv")

In [23]:
# Open seattle data
seattle_calendar = pd.read_csv("seattle_airbnb_data/calendar.csv")
seattle_listings = pd.read_csv("seattle_airbnb_data/listings.csv")
seattle_reviews = pd.read_csv("seattle_airbnb_data/reviews.csv")

### Understanding calendar data for both cities

In [63]:
# Quick explore of number of rows and columns of calendar data
print("Boston calendar data have {} columns and {} rows and Seattle calendar data have {} columns and {} rows".format(str(boston_calendar.shape[1]),str(boston_calendar.shape[0]),str(seattle_calendar.shape[1]),str(seattle_calendar.shape[0])))

Boston calendar data have 4 columns and 1308890 rows and Seattle calendar data have 4 columns and 1393570 rows


In [61]:
# Columns and type for boston calendar
boston_calendar.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [62]:
# Columns and type for seattle calendar
seattle_calendar.dtypes

listing_id     int64
date          object
available     object
price         object
dtype: object

In [59]:
# Example of boston calendar df
boston_calendar.head()

Unnamed: 0,listing_id,date,available,price
0,12147973,2017-09-05,f,
1,12147973,2017-09-04,f,
2,12147973,2017-09-03,f,
3,12147973,2017-09-02,f,
4,12147973,2017-09-01,f,


In [64]:
# Example of seattle calendar df
seattle_calendar.head()

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,


In [6]:
# Dates are read as string, it is convenient to convert them to datetime format
boston_calendar['date'] = pd.to_datetime(boston_calendar['date'], format='%Y-%m-%d')
seattle_calendar['date'] = pd.to_datetime(seattle_calendar['date'], format='%Y-%m-%d')

In [7]:
# Date range for boston and seattle calendar
min_date_boston_calendar = boston_calendar["date"].min()
max_date_boston_calendar = boston_calendar["date"].max()
min_date_seattle_calendar = seattle_calendar["date"].min()
max_date_seattle_calendar = seattle_calendar["date"].max()

print("Boston calendar goes from: {} to {} and Seattle calendar goes from: {} to {}".format(min_date_boston_calendar,max_date_boston_calendar,min_date_seattle_calendar,max_date_seattle_calendar))

Boston calendar goes from: 2016-09-06 00:00:00 to 2017-09-05 00:00:00 and Seattle calendar goes from: 2016-01-04 00:00:00 to 2017-01-02 00:00:00


Relevant observations:
1. Boston calendar dates goes from september 2016 to september 2017 and Seattle calendar dates goes from january 2016 to january 2017.
2.  Available column have t and f str values referencing true and false (boolean values), it will be left the same because later the categorical variables will be used as dummy variables
3. Price column is read as string type due to the dollar sign, later it should become type float
4. We can concat both dataframes if we are going to work with data from both cities

In [12]:
boston_reviews.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')

In [39]:
boston_reviews.iloc[1001,5]

'Perry was a wonderful host (familiar with the area and had great suggestions for things to do).  The accommodations were luxurious and pristine.  Highly recommend!'

In [34]:
print(set(boston_listings.columns))

{'instant_bookable', 'interaction', 'cancellation_policy', 'last_review', 'calculated_host_listings_count', 'scrape_id', 'review_scores_value', 'beds', 'weekly_price', 'host_neighbourhood', 'monthly_price', 'host_since', 'number_of_reviews', 'review_scores_accuracy', 'is_location_exact', 'host_verifications', 'id', 'calendar_updated', 'reviews_per_month', 'host_has_profile_pic', 'guests_included', 'price', 'host_name', 'transit', 'review_scores_rating', 'medium_url', 'property_type', 'require_guest_profile_picture', 'neighborhood_overview', 'description', 'security_deposit', 'availability_90', 'neighbourhood_group_cleansed', 'host_response_time', 'thumbnail_url', 'jurisdiction_names', 'bedrooms', 'maximum_nights', 'minimum_nights', 'review_scores_communication', 'require_guest_phone_verification', 'name', 'host_acceptance_rate', 'bathrooms', 'market', 'extra_people', 'state', 'country_code', 'summary', 'review_scores_location', 'square_feet', 'host_is_superhost', 'host_listings_count',

In [38]:
print(set(boston_listings.iloc[0,:]))

{nan, 'You will have access to 2 bedrooms, a living room, kitchen, bathrooms, and yard.', 1, nan, 31303940, 12147973, 4, -71.13306792912681, 1.5, 'We are country and city connecting in our deck and garden. Enjoy our music room, books and flat screen TV with the pastoral backyard of hens, bees, rabbits and an organic garden.  ', 2.0, 3.0, nan, nan, 0, nan, 't', 'https://a2.muscache.com/im/pictures/c0842db1-ee98-4fe8-870b-d1e2af33855d.jpg?aki_policy=x_large', 'The house has an open and cozy feel at the same time.  The living room has a flat screen TV.  The kitchen has all you need for cooking.  We prefer you buy your food but can use the organic oils, herbs, etc.   The yard can be seen from sitting room and when the weather allows, the yard is a place children can lose themselves in a safe way.  We have 2 bee hives, 6 hens fenced in (sometimes they get out of their coop area & into the yard), 2 rabbits in a hutch and play structure.', nan, 'Roslindale', nan, nan, 'Sunny Bungalow in the C