# Calendar Integration partB

This Jupyter notebook contains the second half of calendar integration and it produces a dataframe named "cal_rev_list_FULLMERGEv3_10_days_before.csv" which is then saved in the Saved Data folder. 

# Import and Universal Directory Setup

In [1]:
import os
import pandas as pd
import numpy as np
import datetime
from functools import reduce
import gc

In [2]:
# Enable garbage collection module for memory purposes
gc.enable()

In [3]:
# Select city to work with

city_folder = 'united-states_portland/'
city_abbrev = 'POR'

In [4]:
# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Set paths
graphics_folder = cwd2 + '/3. Graphics/'
data_path = cwd2 + '/Saved data/'
csv_path = cwd2 + '/0. Raw data/' + city_folder
# Revert to preliminary directory
os.chdir(cwd1)

# Date Setup

I make an assumption that a property must have been 'active' tend days before it recieved a review. This assumption can be tweaked.

In [5]:
# Set how many days before a review a property is assumed to be active. 
N_days_before = 10

In [6]:
csv_path

'/Users/Lauri/github/CleanAirbnb/0. Raw data/united-states_portland/'

In [7]:
calFiles = []
fileNames = os.listdir(csv_path)
for fileNames in fileNames:
    if fileNames.endswith("calendar.csv.gz"):
        calFiles.append(fileNames)      

file_dates = []

for i in range(len(calFiles)):
    file_dates.append(calFiles[i].split('_')[2])
    
file_dates = np.sort(np.array(file_dates).astype('datetime64[M]'))

print(file_dates)

['2015-03' '2015-05' '2015-09' '2015-11' '2015-12' '2016-01' '2016-02'
 '2016-04' '2016-05' '2016-06' '2016-07' '2016-08' '2016-09' '2016-11'
 '2016-12' '2017-01' '2017-02' '2017-03' '2017-04' '2017-05' '2017-06'
 '2017-07' '2017-08' '2017-09' '2017-10' '2017-11' '2017-12' '2018-01'
 '2018-02' '2018-04' '2018-05' '2018-07' '2018-08' '2018-09' '2018-10'
 '2018-11' '2018-12' '2019-01' '2019-02']


array(['2015-03', '2015-05', '2015-09', '2015-11', '2015-12', '2016-01',
       '2016-02', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08',
       '2016-09', '2016-11', '2016-12', '2017-01', '2017-02', '2017-03',
       '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09',
       '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-04',
       '2018-05', '2018-07', '2018-08', '2018-09', '2018-10', '2018-11',
       '2018-12', '2019-01', '2019-02'], dtype='datetime64[M]')

In [8]:
# init_dates = file_dates[3:26]

# print(init_dates)

# Upload listings dataframe/cross sectional file

In [9]:
os.chdir(data_path)
dateCols =['last_scraped', 'host_since', 'first_review', 'last_review']

listings_df = pd.read_csv(city_abbrev + '_1stStageClean.csv.gz', compression = 'gzip', 
                          low_memory=False, parse_dates=dateCols)

clean_months = listings_df['scrape_batch'].astype('datetime64[M]').unique().astype('datetime64[M]')

In [12]:
np.array(file_dates)[np.isin(file_dates, clean_months)]

array(['2015-09', '2015-11', '2015-12', '2016-01', '2016-02', '2016-04',
       '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-11',
       '2016-12', '2017-01', '2017-02', '2017-03', '2017-04', '2017-05',
       '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11',
       '2017-12', '2018-01', '2018-02', '2018-04', '2018-05', '2018-07',
       '2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01',
       '2019-02'], dtype='datetime64[M]')

In [13]:
# Add free parking and pool amenities
parking_search = ['Free parking on premises']
listings_df.loc[:, 'free_park'] = (listings_df['amenities'].str.contains('|'.join(parking_search), na=False)*1).values

pool_search = ['pool', 'Pool']
listings_df.loc[:, 'pool'] = (listings_df['amenities'].str.contains('|'.join(pool_search), na=False)*1).values

os.chdir(csv_path)

In [14]:
# Loading the booked_df that is created by previous Jupyter Notebook
os.chdir(data_path)
booked_df = pd.read_csv('rejoined_booked_df.csv', low_memory=False)
booked_df['res_date'] = booked_df['res_date'].astype('datetime64[D]')

In [15]:
booked_df.head()

Unnamed: 0,booked,id,res_date,never_avail,price,last_date,all_prices,final_prices,seen_avail,week_yr,mo_yr
0,False,9356,2015-09-02,False,,,70.0,70.0,1,2015-35,2015-09
1,False,45890,2015-09-02,True,,,,,0,2015-35,2015-09
2,False,47326,2015-09-02,True,,,,,0,2015-35,2015-09
3,False,222209,2015-09-02,False,,,118.0,118.0,1,2015-35,2015-09
4,False,47674,2015-09-02,True,,,,,0,2015-35,2015-09


----

# Incorporating review files

If a property receives a review then it is assumed that it was available for a certain number of days prior to that review.

The commented out code in this section creates the review dataframe as long as you have all of the review scrapes in the "csv_path" folder. After it has been created, all that needs to be done is to load it in the next section.

### a) Collect review csv names

In [16]:
# Collect reviews csvs names

os.chdir(csv_path)
revFiles = []
fileNames = os.listdir(csv_path)
for fileNames in fileNames:
    if fileNames.endswith("reviews.csv.gz"):
        revFiles.append(fileNames)
        
revFiles = np.sort(revFiles)    

### b) Create a function to concatenate review dataframes

In [17]:
def concat_spreadsheets(possible_files, eligible_files):
    sheets_df = []

    for filename in revFiles[np.isin(possible_files, possible_files)]:
        df = pd.read_csv(filename, index_col = None, header=0)
        sheets_df.append(df)

    sheets_df = pd.concat(sheets_df, axis=0, ignore_index=True)
    return sheets_df

### c) Run the function and save a compressed dataframe with all unique reviews

In [19]:
# DO NOT DELETE, THIS CREATES THE AGGREGATE REVIEW FILE

collected_revs = concat_spreadsheets(file_dates, clean_months)
uniq_revs = collected_revs.drop_duplicates()
print(len(collected_revs), len(uniq_revs))
uniq_revs.to_csv(city_abbrev + '_Revs.csv.gz', compression='gzip', index=False) 

# ============================================================

# This cell imports the compressed review datafame created above. 

#os.chdir(csv_path)

#uniq_revs = pd.read_csv(city_abbrev + '_Revs.csv.gz', compression='gzip')

#os.chdir(data_path)

7090339 497526


------

In [20]:
# Drop the listing IDs that are not in the calendar files
unique_ids = booked_df.id.unique()
uniq_revs = uniq_revs[uniq_revs.listing_id.isin(unique_ids)]

------

In [21]:
res_date_min, res_date_max = booked_df['res_date'].min(), booked_df['res_date'].max() 
revs_df = uniq_revs[['listing_id', 'date']]

revs_df.loc[:, 'date'] = revs_df['date'].astype('datetime64[D]')
mask = (revs_df['date'] >= res_date_min) & (revs_df['date'] <= res_date_max)
revs_df = revs_df[mask]
revs_short = revs_df 

# Dataframe of res_dates
reviews_file = revs_short
N_reviews = len(reviews_file)

reviews_file.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,listing_id,date
93710,6541045,2015-09-02
93937,6290791,2015-09-02
94091,2660141,2015-09-02
95435,6361965,2015-09-02
97143,6985337,2015-09-02


# Active property dataframe creation

Much like the review code, the commented out block below creates a pandas dataframe that tracks the dates during which a property is active.

Once the dataframe is created, it can just be loaded in, meaning you can comment out the code below.

In [22]:
# Here we identify the unique dates from the reviews_file and identify the 
# index for every row of the reviews_file that maps to the unique dates file. 
# This allows one to avoid duplicate calculations for identical dates.


my_dates = reviews_file['date'].astype('datetime64[D]')

unq_dates = pd.DataFrame([my_dates.unique()]).T
unq_dates.columns = ['unq_dates']
unq_dates = unq_dates.sort_values(by='unq_dates').reset_index(drop=True)

# # Identify the valid index in the unq_arr for the non-unique list of dates

dates_arr = my_dates.values
unq_arr = np.concatenate(unq_dates.values)

sorter = np.argsort(unq_arr)
my_indices = sorter[np.searchsorted(unq_arr, dates_arr, sorter=sorter)]

my_dates = pd.DataFrame(my_dates.values, columns = ['dates'])
my_dates.loc[:, "unq_ind"] = my_indices
unq_dates.loc[np.r_[my_dates['unq_ind']], :]

# # Here we create all of the date ranges that we want to work with.

unq_dates.loc[:,"start"] = unq_dates['unq_dates'] - datetime.timedelta(days=N_days_before)

days_before = []

for i in range(len(unq_dates)):
      days_before.append(pd.date_range(unq_dates['start'][i], unq_dates['unq_dates'][i], closed='left'))
    
# # Here we create a dataframe that has all of the listing data

active_dates = np.concatenate(np.array(days_before)[my_dates['unq_ind'].values].astype('datetime64[D]'))
my_date_ids = np.reshape(pd.concat([reviews_file['listing_id']]*N_days_before, axis=1).values, (N_days_before*N_reviews,1))
my_date_ids = np.concatenate(my_date_ids)

active_df = pd.DataFrame([my_date_ids, active_dates]).T
active_df.columns = ['id', 'date']

active_day_int = active_df['date'].view(np.int64)//(10**3)
active_df.loc[:, "dy_key"] = active_day_int + active_df['id']
del active_day_int

# active_df.loc[:, "key"] = active_df['id'].astype(str) + ":" + active_df['date'].astype(str)

print(N_reviews)

active_df.to_csv('active_dates'+ str(N_days_before) + ".csv.gz", compression = 'gzip', index=False)

395637


In [31]:
booked_mo_yr_int64 = booked_df['mo_yr'].astype('datetime64').view(np.int64)//(10**3)
booked_df.loc[:, "mo_key"] = booked_mo_yr_int64 + booked_df['id']
del booked_mo_yr_int64

In [33]:
# my_active_file is created by the code that is commented out below! 
# [Note: Need to be in the 0. Raw data directory for this to work.]

#my_active_file = "active_dates" + str(N_days_before) + ".csv.gz"

#active_df = pd.read_csv(my_active_file, compression='gzip', parse_dates=['date'])
#active_df.tail(5)

# Pre-merge processing

## Create merge key in the booked_df
* Need keys for date and month

In [51]:
# booked_df['id_str']= booked_df['id'].astype(str)
# booked_df.loc[:, "key"] = booked_df['id_str'] + ":" + booked_df['res_date'].astype(str)

In [52]:
booked_df_day_int = booked_df['res_date'].astype('datetime64[ns]').values.astype(np.int64)//(10**3)
booked_df.loc[:, 'dy_key'] = booked_df_day_int + booked_df['id']

del booked_df_day_int

In [53]:
# booked_df.loc[:, 'mo_yr'] = booked_df['res_date'].dt.to_period('M')
# booked_df.loc[:, "id_mo_key"] = booked_df['mo_yr'].astype(str) + ":" + booked_df['id_str']
# booked_df['id_mo_key'].nunique()

In [62]:
booked_mo_yr_int64 = booked_df['mo_yr'].dt.to_timestamp().values.astype(np.int64)//(10**3)
booked_df.loc[:, "mo_key"] = booked_mo_yr_int64 + booked_df['id']
del booked_mo_yr_int64

In [55]:
# # Compare keys quickly

# booked_df['id_mo_key'].nunique(), booked_df['mo_key'].nunique()

## Create merge key in the listings_df

In [56]:
listings_df['batch_YRMO'] = listings_df['batch_YRMO'].astype('datetime64').dt.to_period('M')

listings_mask = ((listings_df['batch_YRMO'] >= booked_df['mo_yr'].min()) & 
                 (listings_df['batch_YRMO'] <= booked_df['mo_yr'].max()))

listings_df = listings_df[listings_mask]

In [66]:
listings_df_mo_yr_int64 = listings_df['batch_YRMO'].dt.to_timestamp().values.astype(np.int64)//(10**3)
listings_df.loc[:, "mo_key"] = listings_df_mo_yr_int64 + listings_df['id']

del listings_df_mo_yr_int64

In [67]:
# listings_df.loc[:, "id_mo_key"] = listings_df['batch_YRMO'].astype(str) + ":" + listings_df['id'].astype(str)
# # Compare keys quickly
# listings_df['id_mo_key'].nunique(), listings_df['mo_key'].nunique()

# Merge

In [68]:
# Rename prices to make it clearer
booked_df = booked_df.rename(columns={'final_prices':"calendar_price"})
listings_df = listings_df.rename(columns={'price':"headline_price"})

In [69]:
# Merge time-varying variables from the panel to the booked_df dataframe. Prices here are "headline prices".
list_month_merge = booked_df.merge(listings_df[['mo_key', 'List_month', 'Listlead1', 
                                                'Listlag1', 'headline_price', 'cleaning_fee', 'host_listings_count', 'cum_sum',
                                                 'bedrooms', 'room_type', 'neighbourhood', 'zipcode', # I wanted to have this last row separte 
                                               'free_park', 'pool', 'host_since', 'review_scores_rating', 'number_of_reviews']], # These two are new here
                                   left_on='mo_key', right_on='mo_key', how='left')

In [70]:
# Clear some memory
del booked_df
del listings_df
del revs_df
del listings_mask
del revs_short

In [71]:
print(len(list_month_merge))
list_month_merge.head(5)

12972400


Unnamed: 0,booked,id,res_date,never_avail,price,last_date,all_prices,calendar_price,seen_avail,week_yr,...,cum_sum,bedrooms,room_type,neighbourhood,zipcode,free_park,pool,host_since,review_scores_rating,number_of_reviews
0,False,9356,2015-09-02,False,,,70.0,70.0,1,2015-35,...,1.0,2.0,Entire home/apt,Woodlawn,97211.0,0.0,0.0,2009-08-06,97.0,57.0
1,False,45890,2015-09-02,True,,,,,0,2015-35,...,2.0,1.0,Entire home/apt,Roseway,97213.0,0.0,0.0,2010-07-24,97.0,92.0
2,False,47326,2015-09-02,True,,,,,0,2015-35,...,2.0,1.0,Private room,Overlook,97217.0,0.0,0.0,2010-08-26,89.0,221.0
3,False,222209,2015-09-02,False,,,118.0,118.0,1,2015-35,...,3.0,1.0,Private room,Forest Park,97229.0,0.0,0.0,2011-07-29,92.0,20.0
4,False,47674,2015-09-02,True,,,,,0,2015-35,...,1.0,1.0,Entire home/apt,Buckman,97214.0,0.0,0.0,2010-05-07,,


In [72]:
print(active_df.head(10))

Unnamed: 0,id,date,dy_key
0,6541045,2015-08-23,1440288006541045
1,6541045,2015-08-24,1440374406541045
2,6541045,2015-08-25,1440460806541045
3,6541045,2015-08-26,1440547206541045
4,6541045,2015-08-27,1440633606541045
...,...,...,...
3956365,29099378,2019-01-08,1546905629099378
3956366,29099378,2019-01-09,1546992029099378
3956367,29099378,2019-01-10,1547078429099378
3956368,29099378,2019-01-11,1547164829099378


In [76]:
full_merge = pd.DataFrame()

for i in range (0,556):
    active_df_small = active_df[10000*i:10000*i+10000]
    list_month_merge_small = list_month_merge.loc[list_month_merge['dy_key'].isin(active_df_small['dy_key'])]
    partial_merge = list_month_merge_small.merge(active_df_small[['date', 'dy_key']], left_on='dy_key', right_on='dy_key', how='left')
    partial_merge = partial_merge.drop(columns=['dy_key', 'mo_key'])
    full_merge = full_merge.append(partial_merge)
#     print("chunk " + str(i) + " completed") # For updates on when chunk is complete uncomment this line

chunk 0 completed
chunk 1 completed
chunk 2 completed
chunk 3 completed
chunk 4 completed
chunk 5 completed
chunk 6 completed
chunk 7 completed
chunk 8 completed
chunk 9 completed
chunk 10 completed
chunk 11 completed
chunk 12 completed
chunk 13 completed
chunk 14 completed
chunk 15 completed
chunk 16 completed
chunk 17 completed
chunk 18 completed
chunk 19 completed
chunk 20 completed
chunk 21 completed
chunk 22 completed
chunk 23 completed
chunk 24 completed
chunk 25 completed
chunk 26 completed
chunk 27 completed
chunk 28 completed
chunk 29 completed
chunk 30 completed
chunk 31 completed
chunk 32 completed
chunk 33 completed
chunk 34 completed
chunk 35 completed
chunk 36 completed
chunk 37 completed
chunk 38 completed
chunk 39 completed
chunk 40 completed
chunk 41 completed
chunk 42 completed
chunk 43 completed
chunk 44 completed
chunk 45 completed
chunk 46 completed
chunk 47 completed
chunk 48 completed
chunk 49 completed
chunk 50 completed
chunk 51 completed
chunk 52 completed
chu

chunk 416 completed
chunk 417 completed
chunk 418 completed
chunk 419 completed
chunk 420 completed
chunk 421 completed
chunk 422 completed
chunk 423 completed
chunk 424 completed
chunk 425 completed
chunk 426 completed
chunk 427 completed
chunk 428 completed
chunk 429 completed
chunk 430 completed
chunk 431 completed
chunk 432 completed
chunk 433 completed
chunk 434 completed
chunk 435 completed
chunk 436 completed
chunk 437 completed
chunk 438 completed
chunk 439 completed
chunk 440 completed
chunk 441 completed
chunk 442 completed
chunk 443 completed
chunk 444 completed
chunk 445 completed
chunk 446 completed
chunk 447 completed
chunk 448 completed
chunk 449 completed
chunk 450 completed
chunk 451 completed
chunk 452 completed
chunk 453 completed
chunk 454 completed
chunk 455 completed
chunk 456 completed
chunk 457 completed
chunk 458 completed
chunk 459 completed
chunk 460 completed
chunk 461 completed
chunk 462 completed
chunk 463 completed
chunk 464 completed
chunk 465 completed


In [99]:
len(full_merge), (full_merge['res_date'] == full_merge['date']).sum()

(2095874, 2095874)

In [78]:
full_merge.shape

(3943187, 31)

In [79]:
full_merge = full_merge.drop_duplicates()
full_merge = full_merge.reset_index(drop=True)

In [80]:
full_merge.shape

(2095874, 31)

In [82]:
full_merge.columns

Index(['booked', 'id', 'res_date', 'never_avail', 'price', 'last_date',
       'all_prices', 'calendar_price', 'seen_avail', 'week_yr', 'mo_yr',
       'id_str', 'key', 'id_mo_key', 'List_month', 'Listlead1', 'Listlag1',
       'headline_price', 'cleaning_fee', 'host_listings_count', 'cum_sum',
       'bedrooms', 'room_type', 'neighbourhood', 'zipcode', 'free_park',
       'pool', 'host_since', 'review_scores_rating', 'number_of_reviews',
       'date'],
      dtype='object')

In [90]:
# Compare price measures that are null
(~full_merge['price'].isna()).sum(), (~full_merge['all_prices'].isna()).sum(), (~full_merge['calendar_price'].isna()).sum()

(947915, 1753403, 1745688)

In [100]:
full_merge = full_merge.drop(columns=['price', 'all_prices', 'date'])
print(full_merge.columns)

KeyError: "['price' 'all_prices'] not found in axis"

In [102]:
full_merge['rev_active'] = 1 - full_merge['rev_date'].isna()*1
full_merge['composite_active'] = full_merge[['seen_avail', 'rev_active']].values.max(1)

KeyError: 'rev_date'

In [33]:
full_merge_tosave = full_merge.copy()
full_merge_tosave = full_merge_tosave.drop_duplicates()

print(full_merge_tosave.head(10))
print(full_merge.shape)

       id    res_date  booked  seen_avail  calendar_price  late_date  id_str  \
0   53940  2015-09-02       0           1            77.0 2015-09-02   53940   
1   53940  2015-09-06       0           1            86.0 2015-09-06   53940   
2   53940  2015-09-07       0           1            80.0 2015-09-07   53940   
3   53940  2015-09-08       0           1            75.0 2015-09-08   53940   
4   53940  2015-09-09       0           1            75.0 2015-09-09   53940   
5  115681  2015-09-12       0           1            88.0 2015-09-12  115681   
6   53940  2015-09-13       0           1            75.0 2015-09-13   53940   
7   53940  2015-09-14       0           1            75.0 2015-09-14   53940   
8   53940  2015-09-15       0           1            75.0 2015-09-15   53940   
9   53940  2015-09-16       0           1            75.0 2015-09-16   53940   

     mo_yr  List_month  Listlead1  ...  neighbourhood  zipcode  free_park  \
0  2015-09         1.0        1.0  ...    

# Saving 

In [34]:
os.chdir(data_path)

In [36]:
full_merge_tosave.to_csv("Portland_cal_rev_list_FULLMERGEv4_10_days_before.csv.gz", compression='gzip', index=False)