# Calendar integration, part A

This Jupyter Notebook allows a researcher to load in and organize the compressed “calendar.csv.gz” files available from Inside Airbnb. These calendar files contain information about a given listing’s availability and pricing for the next 365 days starting from the date of the scrape. The daily pricing and availability information provides greater detail on how hosts adjust prices to account for weekends, holidays and other special events throughout the year. The headline prices contained in the previous “1st_stage_panel_data_cleaning” notebook greatly understate the variation in prices that Airbnb consumers actually face.

The current Notebook also calculates the number of bookings for properties by leveraging changes in availability across scrape dates and attempts to find the price associated with these bookings by using the last seen price before the property became unavailable. This approach shares similarities to the approach in Williams (2017), *Dynamic Airline Pricing and Seat Availability*, in which the author identifies airline bookings by observing changes in the seat availability on flights. Since scrapes on Inside Airbnb occur on the monthly basis, the measures for bookings will be relatively crude. I leverage review data to further augment changes in availability in the second part of the calendar integration work (“Calendar_integration_partB”).

In [1]:
import os
import pandas as pd
import numpy as np
from functools import reduce
from datetime import datetime
import warnings
import time
from pandas.core.common import SettingWithCopyWarning

In [2]:
# Select city to work with

city_folder = 'united-states_portland/'
city_abbrev = 'POR'

In [3]:
# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Set paths
graphics_folder = cwd2 + '/3. Graphics/'
data_path = cwd2 + '/Saved data/'
csv_path = cwd2 + '/0. Raw data/' + city_folder

# Revert to preliminary directory
os.chdir(cwd1)

# 1. Preliminary processing

Select relevant calendar files, load in previously created dataframe on monthly listings, and set some criteria by which to clean the calendar files.

It is also necessary to adjust for the fact that some monthly scrapes took multiple days. The reason for this adjustment is that that there will be different amounts of dates in scrapes that took longer to complete.

In [4]:
# This loop aggregates all calendar file names
calFiles = []
fileNames = os.listdir(csv_path)
for fileNames in fileNames:
    if fileNames.endswith("calendar.csv.gz"):
        calFiles.append(fileNames)
        
file_dates = []

for i in range(len(calFiles)):
    file_dates.append(calFiles[i].split('_')[2])
    
init_dates = file_dates
init_dates = np.sort(init_dates)

In [5]:
# Read 1st stage cleaned dataframe
os.chdir(data_path)

dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
listings_df = pd.read_csv(city_abbrev + '_1stStageClean.csv.gz', compression = 'gzip', low_memory=False, parse_dates=dateCols)


In [6]:
# Use only the calendar dates that exist in the cleaned 1st stage file

# Find month-yr for calendar files and for the clean 
# listings to create a flag.

calfile_MOYR = np.sort(init_dates).astype('datetime64[M]')
cleanlistings_MOYR = np.sort(listings_df['batch_YRMO'].unique()).astype('datetime64[M]')

# Identify which dates are contained in the clean listings data
included_in_clean_flag = np.isin(calfile_MOYR,
        cleanlistings_MOYR)

init_dates = init_dates[included_in_clean_flag].astype('datetime64[D]').astype(str)

In [7]:
def complex_extract_cal_list(date, cal_start, cal_end, update_cutoff=90, avail_cutoff=0):

    """
    This function removes listings that appear dormant 
    on Airbnb from the calendar files that we will be using.
    
    Dormancy is determined by when the calendar was last updated, whether a property is instant bookable,
    and based on future availability of the property.
    """
    
    warnings.simplefilter("ignore", category=SettingWithCopyWarning)

    # Managing listings dataframe 
    dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
    
    list_file = pd.read_csv("united-states_portland_" + date + "_listings.csv.gz", 
                            low_memory = False, parse_dates = dateCols)
    
    small = list_file['calendar_updated'].str.split(' ', 3, expand=True)
    small.columns = ['count', 'measure', 'length']
    small = small[['count', 'measure']]
    small = small.replace(["days", 'week', 'weeks', 'months', 'today', 'never','yesterday'], 
                          [1, 7, 7, 30, 0, 999, 1])
    
    small['count'] = small['count'].replace('a', 1)
    small = small.fillna(1)

    list_file.loc[:, 'update_numeric'] = small['count'].astype(float)*small['measure'].astype(float)

    checks = list_file[['update_numeric', 'instant_bookable', 'last_scraped', 'last_review', 'availability_365']]
    checks.loc[:, 'DSR'] = (checks['last_scraped'] - checks['last_review']).dt.days
    checks = checks[['update_numeric', 'instant_bookable', 'DSR', 'availability_365']]
    

    list_file.loc[:, 'DSR'] = checks['DSR']
    list_file.loc[:, 'cal_trust'] = (((checks['update_numeric'] <= update_cutoff) | (checks['instant_bookable'] =='t') 
                                     | (checks['availability_365'] > avail_cutoff))*1) #      
    trusted_ids = list_file[list_file['cal_trust'] == 1]['id'].unique()
    
    list_file['price'] = list_file.price.replace('[\$,]', '', regex=True).astype(float)

     # Reduce the size of variables in the listings dataframe.
    list_file_small = list_file[['id', 'price', 'host_listings_count','bedrooms', 'bathrooms', 
                                 'neighbourhood', 'zipcode', 'room_type', 'instant_bookable', 
                                 'update_numeric', 'DSR', 'cal_trust', 'availability_365']]
    
    list_file_small.columns = ['id', 'headline_price', 'host_lists', 'bedrooms', 
                               'bathrooms', 'neigh', 'zip', 'type', 'instant', 
                               'DSupdate', 'DSReview', 'Active_flag', 'avail365']
    # Managing calendar dataframe
    cal_file = pd.read_csv("united-states_portland_" + date + "_calendar.csv.gz", compression = 'gzip')

    # This is needed because newer calendar files add cols
    cal_file = cal_file[['listing_id', 'date','available', 'price']] 

    # Deal with different scrape times
    cal_file = cal_file[(cal_file['date'] >= cal_start)& (cal_file['date'] <= cal_end)] 
    
    # Prices converted to floats
    cal_file['price'] = cal_file.price.replace('[\$,]', '', regex=True).astype(float)   
    
    print("----Cleaning calendar and listings data for " + date + "----")
    print("Listing-dates removed: " + str(float(len(cal_file) - len(cal_file[cal_file['listing_id'].isin(trusted_ids)]))))
    print("Unique listings removed: " + str(float((len(cal_file) - len(cal_file[cal_file['listing_id'].isin(trusted_ids)]))/365)))
    print("                                                           ")
    
    cal_file = cal_file[cal_file['listing_id'].isin(trusted_ids)] # Removes questionable calendar data
    # Rename columns
    cal_file.columns = ['id', 'date', 'avail', 'night_price']
    
    return cal_file, list_file_small

In [8]:
# This loop simply identifies when a scrape took more than one day, 
# and creates an adjustment layer based on this information.

os.chdir(csv_path)
i = 0
adjustment_layer = []
for date in init_dates:
    dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
    list_file = pd.read_csv("united-states_portland_" + date + "_listings.csv.gz", low_memory = False, parse_dates = dateCols)

    if len(list_file.last_scraped.unique()) > 1:
        
        print("More than one scrape date in file indexed " + str(i) + "!")
        print(list_file.last_scraped.unique())
        adjustment_layer.append(len(list_file.last_scraped.unique()))
    else:
#         print("*No* issue with file index " + str(i))
        adjustment_layer.append(0)
    i += 1

More than one scrape date in file indexed 15!
['2017-03-06T00:00:00.000000000' '2017-03-05T00:00:00.000000000']
More than one scrape date in file indexed 16!
['2017-04-07T00:00:00.000000000' '2017-04-08T00:00:00.000000000']
More than one scrape date in file indexed 25!
['2018-01-16T00:00:00.000000000' '2018-01-17T00:00:00.000000000']
More than one scrape date in file indexed 27!
['2018-04-11T00:00:00.000000000' '2018-04-12T00:00:00.000000000']
More than one scrape date in file indexed 28!
['2018-05-14T00:00:00.000000000' '2018-05-13T00:00:00.000000000']
More than one scrape date in file indexed 32!
['2018-10-09T00:00:00.000000000' '2018-10-12T00:00:00.000000000']
More than one scrape date in file indexed 33!
['2018-11-07T00:00:00.000000000' '2018-11-09T00:00:00.000000000']


In [9]:
# This section creates a list of calendar and listing frames for each date range.

dates_in_datetime = init_dates.astype('datetime64[D]')

my_cals = []
my_lists = []
for i in range (0, len(dates_in_datetime)-1):
    if (adjustment_layer[i]!=0):
        cal, listing = complex_extract_cal_list(str(dates_in_datetime[i]), 
                                                str(dates_in_datetime[i] + adjustment_layer[i]), 
                                                str(dates_in_datetime[i] + 365 - (adjustment_layer[i]-1)))    
        my_cals.append(cal)
        my_lists.append(listing)
    else:
        cal, listing = complex_extract_cal_list(str(dates_in_datetime[i]), 
                                                str(dates_in_datetime[i] + adjustment_layer[i]), 
                                                str(dates_in_datetime[i] + 364))
        my_cals.append(cal)
        my_lists.append(listing)

----Cleaning calendar and listings data for 2015-09-02----
Listing-dates removed: 730.0
Unique listings removed: 2.0
                                                           
----Cleaning calendar and listings data for 2015-11-02----
Listing-dates removed: 1825.0
Unique listings removed: 5.0
                                                           
----Cleaning calendar and listings data for 2015-12-02----
Listing-dates removed: 4015.0
Unique listings removed: 11.0
                                                           
----Cleaning calendar and listings data for 2016-01-01----
Listing-dates removed: 4015.0
Unique listings removed: 11.0
                                                           
----Cleaning calendar and listings data for 2016-02-03----
Listing-dates removed: 25116.0
Unique listings removed: 68.81095890410958
                                                           
----Cleaning calendar and listings data for 2016-04-05----
Listing-dates removed: 41610.0
Uniq

# 2. Select listings ids and dates of interest

Appropriate listings and dates will depend on a researcher’s specific focus, here ids are chosen based on listings are selected based on criteria set-up in a previous notebook and dates are set to avoid 2020.

In [10]:
listings_df = listings_df[(listings_df['scrape_batch'] >= str(dates_in_datetime[0])) 
                          & (listings_df['scrape_batch'] <= str(dates_in_datetime[-1]))]

In [11]:
# Focus on ids that have been kept after listings data cleaning process (listings_df)

ids = []
unq_dates = []

for cal in my_cals:
    ids.append(cal['id'].unique())
    unq_dates.append(cal['date'].unique())

all_ids = np.sort(np.unique(np.concatenate(ids)))    
all_dates = np.sort(np.unique(np.concatenate(unq_dates)))

listings_df = listings_df[listings_df['drop_indicator'] == 0]

# See if the data cleaning dropped any of the selected listings:
id_mask = np.isin(all_ids, listings_df['id'].unique())

# Could Avoid 2020 if worried about COVID-19
# date_mask = all_dates < '2020-01-01' 

all_ids = all_ids[id_mask]
all_dates = all_dates[date_mask]

print(len(all_ids), len(all_dates))

8410 1582


# 3. Function to identify bookings and their associated prices

Function is created to identify bookings and prices for all listings in the selected sample. This function leverages numpy arrays to improve efficiency. The larger these numpy arrays become, the greater the strain on a computer’s memory. It will usually be necessary to process data in separate chunks.

There are two ways in which a property is considered booked:

* **Available to unavailable:** 
     Firstly a property is booked if the sum of the avail_change column for a 
     given property across various scrapes is equal to -1. This represents the 
     fact that a property was seen as available and then it became unavailable. Going from avail = 1 to avail = 0 
     means that we subract 1 for this variable.
 
* **Allowing for unavailable to available to unavailable:** 
    I want to allow for a property to be 
    unavailable, become available and then switch 
    back to being unavailable. To achieve this, I 
    set three conditions:

    * **Cond 2.1:** Sum of the avail_change row is 0, 
    this allows for a property to go 
    from unavailable to available (1) 
    and then go from available to 
    unavailable (-1)

    * **Cond 2.2:** Second, the last availability seen 
    must be equal to zero

    * **Cond 2.3:** Thirdly, the property must have been 
    seen as available at some point so we
    must have a 1 in the avail_matrix

In [12]:
def cal_file(df, unq_dates, unq_ids):

    """
    Function ensures that the calendar file 
    only contains ids and dates chosen by the researcher.
    """
    
    return df[(df['date'].isin(unq_dates)) & 
              (df['id'].isin(unq_ids))].sort_values(by=['id','date']).reset_index(drop = True)

def highest_index(a):
    
    """
    Function finds the highest index for an array that is not null.
    If all values are null then return 99999.
    """
    
    try:
        return a[~np.isnan(a)][-1]
    except: 
        return 99999
    
def last_nonzero(arr, axis, invalid_val=-1):
    """
    Function finds the last non-zero value of an array.
    """

    mask = arr!= "nan"
    val = arr.shape[axis] - np.flip(mask,axis=axis).argmax(axis=axis) - 1

    return np.where(mask.any(axis=axis), val, invalid_val)

In [13]:
def process(init=0,stop=100):
    """
    This function produces and saves a final booked_df for ids within the range
    init and stop in the all_ids array.
    """
    warnings.simplefilter("ignore", category=RuntimeWarning)
    
    unique_ids = all_ids[init:stop]
    unique_dates = all_dates 

    # Save the number of ids and dates

    N_IDS = len(unique_ids) 
    N_DATES = len(unique_dates) 
    key_list = []

# =========================================================    

    for an_id in unique_ids:
        for a_date in unique_dates:
            key_list.append(an_id.astype(str) + ":" + a_date)
    key_df = pd.DataFrame(key_list, columns=['key'])

# =========================================================

    calendar_scrapes = []

    for my_cal in my_cals:
        calendar_scrapes.append(cal_file(my_cal, unique_dates, unique_ids))

    N_CALS = len(calendar_scrapes)
    i = 1
    
    for cal_files in calendar_scrapes:
        
        cal_files.loc[:, 'key'] = cal_files['id'].astype(str).str.strip() + ":" + cal_files['date'].str.strip()
        cal_files.columns = ['id' + str(i), 'date' + str(i), 'avail' + str(i), 'night_price' + str(i),
                        'key']   
        i+=1

# =========================================================

# Create a dataframe for all availabilities and prices for every listing-night
# across scrape files. 
#    * Note: This dataframe is broken up into separate availability and price matrices
        
    cal_dfs = [key_df]

    for i in range(N_CALS):
        count = i + 1 
        cal_dfs.append(calendar_scrapes[i][[('avail' + str(count)), 'key', ('night_price' + str(count))]])
    

    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['key'],
                                            how='outer'), cal_dfs).fillna(np.nan) 

    df_merged.loc[:, "id"], df_merged.loc[:, "date"] = (df_merged['key'].str.split(":", expand=True)[0].astype(int),
                                              df_merged['key'].str.split(":", expand=True)[1])
    df_merged = df_merged.drop(columns=['key'])

    df_merged = df_merged.drop_duplicates()

# =========================================================

    # Create a list of columns with only availabilities and the id and date
    avail_cols = df_merged.columns[df_merged.columns.str.contains('avail')]
    avail_cols = np.append(avail_cols, ('id','date'))

    avail_arr = df_merged[avail_cols].values

    avail_matrix = np.reshape(avail_arr[:,0:N_CALS], (int(N_DATES), N_IDS, N_CALS), order='F') 

    # Convert 't' and 'f' strings into binary floats
    avail_matrix = np.where(avail_matrix == 't', 1., avail_matrix) 
    avail_matrix = np.where(avail_matrix == 'f', 0., avail_matrix)
    
    # Take the difference between availability status across scrape files
    avail_change = np.diff(avail_matrix, axis=2)
    
# =========================================================
    
    case_1 = (np.nansum(avail_change, axis=2) == -1)*1 
# =========================================================    

    str_avail = avail_matrix.astype(str)
    inds = last_nonzero(str_avail, axis=2, invalid_val=np.nan)

    test_inds = inds
    test_inds = np.reshape(test_inds, (N_DATES*N_IDS), order='F')

# =========================================================
    # Get the last availability for all properties
    
    case2_df = pd.DataFrame(test_inds)
    case2_df.columns = ['cal_index']
    case2_df['cal_index'] = case2_df['cal_index'].astype('Int64')
    case2_df['cal_index'] = case2_df['cal_index'].fillna(0) 
    my_vals = case2_df['cal_index'].values 

    reshape_avail = np.reshape(avail_matrix, (N_DATES*N_IDS*N_CALS,), order='F')

    case2_df['id_index'] = (case2_df.index/N_DATES).astype(int)
    case2_df['date_index'] = case2_df.index - N_DATES*case2_df['id_index']


    indices = (case2_df['cal_index']*N_DATES*N_IDS + case2_df['id_index']*N_DATES + case2_df['date_index']).values

    case2_result = reshape_avail[indices.astype(int)]
    case2_result = np.reshape(case2_result, (N_DATES, N_IDS), order='F').astype(float)
    
# =========================================================

    cond_21 = (np.nansum(avail_change, axis=2) == 0)*1
    cond_22 = (case2_result == 0)*1
    cond_23 = (np.nanmax(avail_matrix,axis=2) == 1)*1

    case_2 = cond_21*cond_22*cond_23
    
# =========================================================
    # CREATE A MATRIX FOR BOOKINGS
    
    booked_mat = np.maximum(case_1, case_2)
    booked_mat = np.reshape(booked_mat, (N_DATES*N_IDS,1), order='F')

    booked_df = pd.DataFrame(booked_mat >= 1 , columns=['booked'])
    booked_df.loc[:, "id"] = df_merged['id'].values
    booked_df.loc[:, "res_date"] = df_merged['date'].values

    never_avail = np.nanmax(avail_matrix,axis=2) != 1.
    booked_df.loc[:, 'never_avail'] = np.reshape(never_avail, (N_DATES*N_IDS, 1), order='F')
    
# =========================================================
    # CREATE A MATRIX FOR PRICES
    
    price_cols = df_merged.columns[df_merged.columns.str.contains('night_price')]
    price_cols = np.append(price_cols, ('id','date'))

    price_arr = df_merged[price_cols].values

    price_matrix = np.reshape(price_arr[:,0:N_CALS], (int(N_DATES), N_IDS, N_CALS), order='F')
    
# =========================================================
    
    last_avail_flag = (np.roll(avail_change,0, axis=2) == -1)*1
    last_avail_flag = np.where(last_avail_flag == 0, np.nan, last_avail_flag)

    booked_price = np.nanmax(price_matrix[:,:,0:(N_CALS - 1)]*last_avail_flag,axis=2)
    booked_df.loc[:, 'price'] = np.reshape(booked_price, (N_DATES*N_IDS,1), order='F')
    last_date_avail = np.isfinite(last_avail_flag).argmax(2)
    index_for_last = np.reshape(last_date_avail, (N_DATES*N_IDS,), order='F')
    last_date = np.array(init_dates)[np.reshape(last_date_avail, (N_DATES*N_IDS,), order='F')]

    # This ensures that the last date is only reported whenever the listing is identified as being booked.
    booked_df.loc[:, "last_date"] = last_date
    mask = booked_df.booked == False
    column_name = 'last_date'
    booked_df.loc[mask, column_name] = np.nan
    
# =========================================================
    
    # Add prices for places that are available but not booked
    price_matrix_np = np.float64(price_matrix)
    
    last_observed_price = np.apply_along_axis(highest_index, 2, price_matrix_np)
    last_observed_price = np.where(last_observed_price == 99999, np.nan, last_observed_price)
    last_observed_price = np.reshape(last_observed_price, (N_IDS*N_DATES), order='F')
    
# =========================================================

    booked_df = booked_df.sort_values(by=['id', 'res_date'])
    booked_df.loc[:, "all_prices"] =  last_observed_price
    
    # If booked, get assigned "price" if un-booked get assigned "all_prices". Final prices is the variable we care about
    booked_ind = (booked_df['booked']*1).values

    final_prices = ((booked_ind)*booked_df['price'].fillna(0) + (1 - booked_ind)*booked_df['all_prices'].fillna(0)).values
    booked_df.loc[:, 'final_prices'] = final_prices
    booked_df['final_prices'].replace({0.:np.nan}, inplace=True)
    booked_df.loc[:, "seen_avail"] = (booked_df['never_avail'] == False).values*1
    booked_df = booked_df.sort_values(by='res_date').reset_index(drop=True)    
    booked_df.loc[:, 'week_yr'] =  booked_df['res_date'].astype('datetime64').dt.strftime('%Y-%U')
    booked_df.loc[:, 'mo_yr'] = booked_df['res_date'].astype('datetime64').dt.to_period('M')
    booked_df_save = booked_df.copy()
    booked_df_save.loc[:, "mo_yr"] = booked_df_save["mo_yr"].astype(str)
    
# =========================================================
    # APPEND TO LIST
    
    os.chdir(data_path) #MOVE THIS !
    df_list.append(booked_df_save)
    
#     print('chunk '+ str(j) + " completed") # Creates many prints, helpful for error checking

# Processing function Call and Save

In [14]:
chunk_size = 100 # Chunk size determined through trial and error
N_chunks = int(np.ceil(len(all_ids)/chunk_size))

In [15]:
# Iterative loop to prevent kernel crash

df_list = []

j = 0
for i in range(N_chunks):
    process(init=i*chunk_size, stop=(i + 1)*chunk_size)
    j+=1
    
booked_df_final = pd.concat(df_list)
booked_df_final.to_csv(city_abbrev + '_rejoined_booked_df.csv.gz', 
                       index=False, compression='gzip')

print("Process complete!")

Process complete!
