# Calendar_integration_partA

## Created June 19, 2020

This file is the first half of the file "CALENDAR DATA_05", and it produces chunked dataframes named "booked_df_"x".csv". 

# Import libraries and directory setup

In [15]:
import os
import pandas as pd
import numpy as np
from functools import reduce
from datetime import datetime
import warnings
from pandas.core.common import SettingWithCopyWarning

In [16]:
# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Set paths
graphics_folder = cwd2 + '/3. Graphics/'
data_path = cwd2 + '/Saved data/'
csv_path = cwd2 + '/0. Raw data/'
# Revert to preliminary directory
os.chdir(cwd1)

# Preliminary processing

In [17]:
# This loop aggregates all calendar file names
calFiles = []
fileNames = os.listdir(csv_path)
for fileNames in fileNames:
    if fileNames.endswith("calendar.csv.gz"):
        calFiles.append(fileNames)
        
file_dates = []

for i in range(len(calFiles)):
    file_dates.append(calFiles[i].split('_')[2])
    
init_dates = file_dates
del init_dates[9]
del init_dates[16]
print(init_dates)

['2018-09-14', '2016-09-04', '2017-06-05', '2017-03-05', '2018-01-16', '2017-11-13', '2018-04-11', '2016-04-05', '2017-09-12', '2017-10-04', '2016-11-06', '2018-12-10', '2017-07-06', '2018-05-13', '2016-08-04', '2018-02-08', '2017-01-04', '2017-08-06', '2016-01-01', '2018-11-07', '2019-01-13', '2015-12-02', '2016-12-08', '2015-11-02', '2017-05-07', '2019-02-06', '2016-05-03', '2017-12-09', '2016-02-03', '2018-10-09', '2018-08-14', '2018-07-10', '2016-06-03', '2017-04-07', '2016-07-04', '2017-02-09', '2015-09-02']


In [18]:
#Read 2nd stage cleaned dataframe
os.chdir(data_path)

dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
listings_df = pd.read_csv('2ndStageClean_Portland.csv.gz', compression = 'gzip', low_memory=False, parse_dates=dateCols)


In [25]:
    
def complex_extract_cal_list(date, cal_start, cal_end, update_cutoff=90, avail_cutoff=0):
    warnings.simplefilter("ignore", category=SettingWithCopyWarning)
    """
    The point of this function is to remove listings that seem dormant 
    on Airbnb from the calendar files that we will be using.
    """
    # Managing listings dataframe 
    dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
    list_file = pd.read_csv("united-states_portland_" + date + "_listings.csv", low_memory = False, parse_dates = dateCols)
    
    # Here I try to identify which listing calendars should be trusted, I try not to be too harsh.
    small = list_file['calendar_updated'].str.split(' ', 3, expand=True)
    small.columns = ['count', 'measure', 'length']
    small = small[['count', 'measure']]
    small = small.replace(["days", 'week', 'weeks', 'months', 'today', 'never','yesterday'], 
                          [1, 7, 7, 30, 0, 999, 1])
    small['count'] = small['count'].replace('a', 1)
    small = small.fillna(1)

    list_file.loc[:, 'update_numeric'] = small['count'].astype(float)*small['measure'].astype(float)

    checks = list_file[['update_numeric', 'instant_bookable', 'last_scraped', 'last_review', 'availability_365']]
    checks.loc[:, 'DSR'] = (checks['last_scraped'] - checks['last_review']).dt.days
    checks = checks[['update_numeric', 'instant_bookable', 'DSR', 'availability_365']]
    

    list_file.loc[:, 'DSR'] = checks['DSR']
    list_file.loc[:, 'cal_trust'] = (((checks['update_numeric'] <= update_cutoff) | (checks['instant_bookable'] =='t') 
                                     | (checks['availability_365'] > avail_cutoff))*1) #      
    trusted_ids = list_file[list_file['cal_trust'] == 1]['id'].unique()
    
    list_file['price'] = list_file.price.replace('[\$,]', '', regex=True).astype(float)

     # Reduce the size of variables in the listings dataframe.
    list_file_small = list_file[['id', 'price', 'host_listings_count','bedrooms', 'bathrooms', 'neighbourhood', 'zipcode', 'room_type', 
                                 'instant_bookable', 'update_numeric', 'DSR', 'cal_trust', 'availability_365']]
   
    list_file_small.columns = ['id', 'headline_price', 'host_lists', 'bedrooms', 
                               'bathrooms', 'neigh', 'zip', 'type', 'instant', 
                               'DSupdate', 'DSReview', 'Active_flag', 'avail365']

    # Managing calendar dataframe
    cal_file = pd.read_csv("united-states_portland_" + date + "_calendar.csv.gz", compression = 'gzip')
    cal_file = cal_file[['listing_id', 'date','available', 'price']] # This is needed because newer calendar files add cols
    cal_file = cal_file[(cal_file['date'] >= cal_start)& (cal_file['date'] <= cal_end)] # Deal with different scrape times
    cal_file['price'] = cal_file.price.replace('[\$,]', '', regex=True).astype(float)   # Prices converted to floats
    
    print("----Cleaning calendar and listings data for " + date + "----")
    print("Listing-dates removed: " + str(float(len(cal_file) - len(cal_file[cal_file['listing_id'].isin(trusted_ids)]))))
    print("Unique listings removed: " + str(float((len(cal_file) - len(cal_file[cal_file['listing_id'].isin(trusted_ids)]))/365)))
    print("                                                           ")
    
    cal_file = cal_file[cal_file['listing_id'].isin(trusted_ids)] # Removes questionable calendar data
   
    # Rename columns
    cal_file.columns = ['id', 'date', 'avail', 'night_price']
    
    return cal_file, list_file_small

In [26]:
#listings_df.loc[:, 'List_month'] = listings_df['List_month'] - listings_df['Occasional_LTR'] 

#listings_df = listings_df[listings_df['Occasional_LTR'] == 0] # This may mess up some of the dimensionsl

os.chdir(csv_path)

# Initial data cleaning and merge

In [27]:
#This section focuses on date formatting and array creation:
arr_dates_init = np.array(init_dates).astype('datetime64')
arr_dates_init.sort()
arr_dates = arr_dates_init
print(arr_dates)

['2015-09-02' '2015-11-02' '2015-12-02' '2016-01-01' '2016-02-03'
 '2016-04-05' '2016-05-03' '2016-06-03' '2016-07-04' '2016-08-04'
 '2016-09-04' '2016-11-06' '2016-12-08' '2017-01-04' '2017-02-09'
 '2017-03-05' '2017-04-07' '2017-05-07' '2017-06-05' '2017-07-06'
 '2017-08-06' '2017-09-12' '2017-10-04' '2017-11-13' '2017-12-09'
 '2018-01-16' '2018-02-08' '2018-04-11' '2018-05-13' '2018-07-10'
 '2018-08-14' '2018-09-14' '2018-10-09' '2018-11-07' '2018-12-10'
 '2019-01-13' '2019-02-06']


In [28]:
init_dates_2 = list((arr_dates).astype(str))

In [29]:
# This loop simply identifies when a scrape took more than one day, 
# and creates an adjustment layer based on this information
os.chdir(csv_path)
i = 0
adjustment_layer = []
for date in init_dates_2:
    dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']
    list_file = pd.read_csv("united-states_portland_" + date + "_listings.csv", low_memory = False, parse_dates = dateCols)
    print(i)
    if len(list_file.last_scraped.unique()) > 1:
        print("Issue!")
        print(list_file.last_scraped.unique())
        adjustment_layer.append(len(list_file.last_scraped.unique()))
    else:
        print("No issue")
        adjustment_layer.append(0)
    i += 1
adjustment_layer

0
No issue
1
No issue
2
No issue
3
No issue
4
No issue
5
No issue
6
No issue
7
No issue
8
No issue
9
No issue
10
No issue
11
No issue
12
No issue
13
No issue
14
No issue
15
Issue!
['2017-03-06T00:00:00.000000000' '2017-03-05T00:00:00.000000000']
16
Issue!
['2017-04-07T00:00:00.000000000' '2017-04-08T00:00:00.000000000']
17
No issue
18
No issue
19
No issue
20
No issue
21
No issue
22
No issue
23
No issue
24
No issue
25
Issue!
['2018-01-16T00:00:00.000000000' '2018-01-17T00:00:00.000000000']
26
No issue
27
Issue!
['2018-04-11T00:00:00.000000000' '2018-04-12T00:00:00.000000000']
28
Issue!
['2018-05-14T00:00:00.000000000' '2018-05-13T00:00:00.000000000']
29
No issue
30
No issue
31
No issue
32
Issue!
['2018-10-09T00:00:00.000000000' '2018-10-12T00:00:00.000000000']
33
Issue!
['2018-11-07T00:00:00.000000000' '2018-11-09T00:00:00.000000000']
34
No issue
35
No issue
36
No issue


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0]

In [30]:
#This section creates a list of calendar and listing frames for each date range.
my_cals =[]
my_lists=[]
for i in range (0,len(init_dates_2)-1):
    if (adjustment_layer[i]!=0):
        cal, listing = complex_extract_cal_list(str(arr_dates[i]), str(arr_dates[i] + adjustment_layer[i]), str(arr_dates[i] + 365 - (adjustment_layer[i]-1)))    
        my_cals.append(cal)
        my_lists.append(listing)
    else:
        cal, listing = complex_extract_cal_list(str(arr_dates[i]), str(arr_dates[i] + adjustment_layer[i]), str(arr_dates[i] + \
        (364)))
        my_cals.append(cal)
        my_lists.append(listing)
                                                
    

----Cleaning calendar and listings data for 2015-09-02----
Listing-dates removed: 730.0
Unique listings removed: 2.0
                                                           
----Cleaning calendar and listings data for 2015-11-02----
Listing-dates removed: 1825.0
Unique listings removed: 5.0
                                                           
----Cleaning calendar and listings data for 2015-12-02----
Listing-dates removed: 4015.0
Unique listings removed: 11.0
                                                           
----Cleaning calendar and listings data for 2016-01-01----
Listing-dates removed: 4015.0
Unique listings removed: 11.0
                                                           
----Cleaning calendar and listings data for 2016-02-03----
Listing-dates removed: 25116.0
Unique listings removed: 68.81095890410958
                                                           
----Cleaning calendar and listings data for 2016-04-05----
Listing-dates removed: 41610.0
Uniq

# Utilization of a more Pythonic method

In [31]:
listings_df = listings_df[(listings_df['scrape_batch'] >= str(arr_dates[0])) & (listings_df['scrape_batch'] <= str(arr_dates[-1]))]

In [32]:
# Focus on ids that have been kept after listings data cleaning process (listings_df)

ids = []
unq_dates = []

for cal in my_cals:
    ids.append(cal['id'].unique())
    unq_dates.append(cal['date'].unique())

all_ids = np.sort(np.unique(np.concatenate(ids)))    
all_dates = np.sort(np.unique(np.concatenate(unq_dates)))

listings_df = listings_df[listings_df['drop_indicator'] == 0]

late_appearance_df = listings_df.loc[((listings_df['first_appearance']== 1) & (listings_df['month'] >= 51))]
    
late_appearance_list = late_appearance_df['id'].tolist()

all_ids = [x for x in all_ids if x not in late_appearance_list]

all_ids = np.array(all_ids)
# See if the data cleaning dropped any of the selected listings:
mask = np.isin(all_ids, listings_df['id'].unique())
all_ids = all_ids[mask] # Makes sure the ID is in the listings dataframe

In [33]:
len(all_ids)

8300

# Primary chunked processing and CSV creation

In [35]:
def process(init=0,stop=100):
    """
    This function produces and saves a final booked_df for ids within the range
    init and stop in the all_ids array.
    """
    warnings.simplefilter("ignore", category=RuntimeWarning)
    
    unique_ids = all_ids[init:stop]# Final run should not have a restriction
    unique_dates = all_dates # Here I set the limit on my dates. (Original 0 :688)

    # Save the number of ids and dates

    N_IDS = len(unique_ids) # I am using all the listing ids that I can right now.
    N_DATES = len(unique_dates) # I am using only dates up to the end of FEB 2020
    key_list = []

    for an_id in unique_ids:
        for a_date in unique_dates:
            key_list.append(an_id.astype(str) + ":" + a_date)
    key_df = pd.DataFrame(key_list, columns=['key'])
    def cal_file(df):
        return df[(df['date'].isin(unique_dates)) & (df['id'].isin(unique_ids))].sort_values(by=['id','date']).reset_index(drop = True)
    calendar_scrapes = []

    for my_cal in my_cals:
        calendar_scrapes.append(cal_file(my_cal))

    N_CALS = len(calendar_scrapes)
    i = 1
    for cal_files in calendar_scrapes:
        cal_files.loc[:, 'key'] = cal_files['id'].astype(str).str.strip() + ":" + cal_files['date'].str.strip()
        cal_files.columns = ['id' + str(i), 'date' + str(i), 'avail' + str(i), 'night_price' + str(i),
                        'key']   
        i+=1
    cal_dfs = [key_df]

    for i in range(N_CALS):
        count = i + 1 
        cal_dfs.append(calendar_scrapes[i][[('avail' + str(count)), 'key', ('night_price' + str(count))]])
    

    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['key'],
                                            how='outer'), cal_dfs).fillna(np.nan) 

    df_merged.loc[:, "id"], df_merged.loc[:, "date"] = (df_merged['key'].str.split(":", expand=True)[0].astype(int),
                                              df_merged['key'].str.split(":", expand=True)[1])
    df_merged = df_merged.drop(columns=['key'])

    df_merged = df_merged.drop_duplicates()
    avail_cols = df_merged.columns[df_merged.columns.str.contains('avail')]
    avail_cols = np.append(avail_cols, ('id','date'))

    avail_arr = df_merged[avail_cols].values

    avail_matrix = np.reshape(avail_arr[:,0:N_CALS], (int(N_DATES), N_IDS, N_CALS), order='F') 

    avail_matrix = np.where(avail_matrix == 't', 1., avail_matrix) 
    avail_matrix = np.where(avail_matrix == 'f', 0., avail_matrix)
    avail_change = np.diff(avail_matrix, axis=2)
    case_1 = (np.nansum(avail_change, axis=2) == -1)*1 
    def last_nonzero(arr, axis, invalid_val=-1):
        """
        This function finds the last non-zero value of an array.
        """
        mask = arr!= "nan"
        val = arr.shape[axis] - np.flip(mask,axis=axis).argmax(axis=axis) - 1
        return np.where(mask.any(axis=axis), val, invalid_val)
    str_avail = avail_matrix.astype(str)
    inds = last_nonzero(str_avail, axis=2, invalid_val=np.nan)

    test_inds = inds
    test_inds = np.reshape(test_inds, (N_DATES*N_IDS), order='F')
    np.reshape(inds[0:10, 50:55],(10*5), order='F')
    case2_df = pd.DataFrame(test_inds)
    case2_df.columns = ['cal_index']
    case2_df['cal_index'] = case2_df['cal_index'].astype('Int64')
    case2_df['cal_index'] = case2_df['cal_index'].fillna(0) 
    my_vals = case2_df['cal_index'].values 

    reshape_avail = np.reshape(avail_matrix, (N_DATES*N_IDS*N_CALS,), order='F')

    case2_df['id_index'] = (case2_df.index/N_DATES).astype(int)
    case2_df['date_index'] = case2_df.index - N_DATES*case2_df['id_index']

    indices = (case2_df['cal_index']*N_DATES*N_IDS + case2_df['id_index']*N_DATES + case2_df['date_index']).values

    case2_result = reshape_avail[indices.astype(int)]
    case2_result = np.reshape(case2_result, (N_DATES, N_IDS), order='F').astype(float)
    cond_21 = (np.nansum(avail_change, axis=2) == 0)*1
    cond_22 = (case2_result == 0)*1
    cond_23 = (np.nanmax(avail_matrix,axis=2) == 1)*1
    case_2 = cond_21*cond_22*cond_23
    booked_mat = np.maximum(case_1, case_2)
    booked_mat = np.reshape(booked_mat, (N_DATES*N_IDS,1), order='F')

    booked_df = pd.DataFrame(booked_mat >= 1 , columns=['booked'])
    booked_df.loc[:, "id"] = df_merged['id'].values
    booked_df.loc[:, "res_date"] = df_merged['date'].values

    never_avail = np.nanmax(avail_matrix,axis=2) != 1.
    booked_df.loc[:, 'never_avail'] = np.reshape(never_avail, (N_DATES*N_IDS, 1), order='F')
    price_cols = df_merged.columns[df_merged.columns.str.contains('night_price')]
    price_cols = np.append(price_cols, ('id','date'))

    price_arr = df_merged[price_cols].values

    price_matrix = np.reshape(price_arr[:,0:N_CALS], (int(N_DATES), N_IDS, N_CALS), order='F')
    last_avail_flag = (np.roll(avail_change,0, axis=2) == -1)*1
    last_avail_flag = np.where(last_avail_flag == 0, np.nan, last_avail_flag)

    booked_price = np.nanmax(price_matrix[:,:,0:(N_CALS - 1)]*last_avail_flag,axis=2)
    booked_df.loc[:, 'price'] = np.reshape(booked_price, (N_DATES*N_IDS,1), order='F')
    last_date_avail = np.isfinite(last_avail_flag).argmax(2)
    index_for_last = np.reshape(last_date_avail, (N_DATES*N_IDS,), order='F')
    last_date = np.array(init_dates)[np.reshape(last_date_avail, (N_DATES*N_IDS,), order='F')]

    # This ensures that the last date is only reported whenever the listing is identified as being booked.
    booked_df.loc[:, "last_date"] = last_date
    mask = booked_df.booked == False
    column_name = 'last_date'
    booked_df.loc[mask, column_name] = np.nan
    price_matrix_np = np.float64(price_matrix)
    def highest_index(a):
        try:
            return a[~np.isnan(a)][-1]
        except: 
            return 99999
    last_observed_price = np.apply_along_axis(highest_index, 2, price_matrix_np)
    last_observed_price = np.where(last_observed_price == 99999, np.nan, last_observed_price)
    last_observed_price = np.reshape(last_observed_price, (N_IDS*N_DATES), order='F')
    booked_df = booked_df.sort_values(by=['id', 'res_date'])
    booked_df.loc[:, "all_prices"] =  last_observed_price
    booked_ind = (booked_df['booked']*1).values

    final_prices = ((booked_ind)*booked_df['price'].fillna(0) + (1 - booked_ind)*booked_df['all_prices'].fillna(0)).values
    booked_df.loc[:, 'final_prices'] = final_prices
    booked_df['final_prices'].replace({0.:np.nan}, inplace=True)
    booked_df.loc[:, "seen_avail"] = (booked_df['never_avail'] == False).values*1
    booked_df = booked_df.sort_values(by='res_date').reset_index(drop=True)    
    booked_df.loc[:, 'week_yr'] =  booked_df['res_date'].astype('datetime64').dt.strftime('%Y-%U')
    booked_df.loc[:, 'mo_yr'] = booked_df['res_date'].astype('datetime64').dt.to_period('M')
    booked_df_save = booked_df.copy()
    booked_df_save.loc[:, "mo_yr"] = booked_df_save["mo_yr"].astype(str)
    
    #Saving
    os.chdir(data_path)
    save_name = 'booked_df_v2' + "_" + str(j) + '.csv'
    
    booked_df_save.to_csv(save_name)
    
    #Comment out the above line and uncomment the line below to save chunks 
    #as compressed gzip files.This comes at the cost of process speed but saves disk space. 
    
    #booked_df_save.to_csv(save_name, compression = 'gzip')
    
    print('chunk '+ str(j) + " completed")

# Processing function calls and iteration

In [36]:
#Iteration loop to prevent kernel crash
j=0
for i in range(0,82):
    process(init=i*100, stop=(i*100)+100)
    j+=1

chunk 0 completed
chunk 1 completed
chunk 2 completed
chunk 3 completed
chunk 4 completed
chunk 5 completed
chunk 6 completed
chunk 7 completed
chunk 8 completed
chunk 9 completed
chunk 10 completed
chunk 11 completed
chunk 12 completed
chunk 13 completed
chunk 14 completed
chunk 15 completed
chunk 16 completed
chunk 17 completed
chunk 18 completed
chunk 19 completed
chunk 20 completed
chunk 21 completed
chunk 22 completed
chunk 23 completed
chunk 24 completed
chunk 25 completed
chunk 26 completed
chunk 27 completed
chunk 28 completed
chunk 29 completed
chunk 30 completed
chunk 31 completed
chunk 32 completed
chunk 33 completed
chunk 34 completed
chunk 35 completed
chunk 36 completed
chunk 37 completed
chunk 38 completed
chunk 39 completed
chunk 40 completed
chunk 41 completed
chunk 42 completed
chunk 43 completed
chunk 44 completed
chunk 45 completed
chunk 46 completed
chunk 47 completed
chunk 48 completed
chunk 49 completed
chunk 50 completed
chunk 51 completed
chunk 52 completed
chu