# First version created: October 17, 2019

This notebook concatenates Airbnb listing files from http://insideairbnb.com/ and creates both wide-form and long-form aggregate datasets.

# 0. Set-up

In [1]:
# Importing Python libraries

import os
import csv
import numpy as np
import pandas as pd
import gzip

In [2]:
# Store the notebook's working directory; relying on os keeps this portable
# for any user with access to the repository
cwd1 = os.getcwd()

# The parent directory is one level up. It is derived from the path instead
# of calling os.chdir('..') and back, so the process working directory is
# never changed (and cannot be left changed if the cell is interrupted).
cwd2 = os.path.dirname(cwd1)

# Make sure repository has a 0. Raw data folder!
data_dir = os.path.join(cwd2, '0. Raw data')

In [3]:
# This function converts a value to an integer; if the conversion fails, it returns the value as a string.

def IntorStr(value):
    """
    Convert ``value`` to an int when possible, otherwise to a str.

    Parameters
    ----------
    value : object
        Anything accepted by ``int()`` or ``str()``.

    Returns
    -------
    int or str
        ``int(value)`` if that conversion succeeds, else ``str(value)``.
    """
    try:
        return int(value)
    # Only conversion failures fall back to str. A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit raised while converting.
    except (TypeError, ValueError, OverflowError):
        return str(value)

# 1. Collecting listings.csv.gz files

In [4]:
# Collect the listings CSVs

# The loop variable no longer reuses the name of the list it iterates over,
# so fileNames keeps holding the full directory listing afterwards.
fileNames = os.listdir(data_dir)
numFiles = [name for name in fileNames if name.endswith("listings.csv.gz")]

# Fail with a clear message instead of an IndexError on numFiles[0]
if len(numFiles) == 0:
    raise FileNotFoundError("No '*listings.csv.gz' files found in " + data_dir)

# Sort the file names; os.listdir returns them in arbitrary order
numFiles = np.sort(numFiles)

# The city is the second '_'-separated field of a file name
city = numFiles[0].split("_")[1]
print(city)

# Count the number of files
print(len(numFiles))

# Take a look at the first 10 listing files
print(numFiles[0:10])

san-francisco
59
['united-states_san-francisco_2015-05-04_listings.csv.gz'
 'united-states_san-francisco_2015-09-02_listings.csv.gz'
 'united-states_san-francisco_2015-11-01_listings.csv.gz'
 'united-states_san-francisco_2015-12-02_listings.csv.gz'
 'united-states_san-francisco_2016-02-02_listings.csv.gz'
 'united-states_san-francisco_2016-04-03_listings.csv.gz'
 'united-states_san-francisco_2016-05-02_listings.csv.gz'
 'united-states_san-francisco_2016-06-02_listings.csv.gz'
 'united-states_san-francisco_2016-07-02_listings.csv.gz'
 'united-states_san-francisco_2016-08-02_listings.csv.gz']


## 1 (a). Check if a file is missing specific columns
The function `check_data_cols` defined below accepts a list of data file names and a list of column names, and prints each column that is missing from a file.

In [5]:
# First take a look at columns in a file

for my_file in numFiles:
    # nrows=0 parses only the header row, which is all that is needed to
    # list the column names; the data rows are not loaded.
    data = pd.read_csv(data_dir + '/' + my_file, nrows=0)
    data_columns = list(data.columns)
    print("N columns: " + str(len(data_columns)))
    print(data_columns)
    
    print("---------------------")

N columns: 68
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_picture_url', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_sc

N columns: 92
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleani

  interactivity=interactivity, compiler=compiler, result=result)


N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

  interactivity=interactivity, compiler=compiler, result=result)


N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

In [6]:
def check_data_cols(files, variables, directory=None):
    
    """ 
    Print, for every data file, which of the requested columns it lacks.

    Parameters
    ----------
    files : iterable of str
        Data file names, relative to ``directory``.
    variables : iterable of str
        Column names to look for in each file.
    directory : str, optional
        Folder that contains ``files``. Defaults to the notebook-level
        ``data_dir``.

    Returns
    -------
    None
        Results are printed: two lines per missing column (the column name,
        then the file name), followed by a separator line after each file.
    """
    
    if directory is None:
        directory = data_dir

    for my_file in files:
        # Only the header row is needed to know which columns a file has
        header = pd.read_csv(os.path.join(directory, my_file), nrows=0)
        data_columns = set(header.columns)
        
        for my_column in variables:
            if my_column not in data_columns:
                print(my_column + " missing from:")
                print(my_file)

        print("----------------------------------")

In [7]:
# Columns to look for in every listings file
candidate_columns = [
    'amenities',
    'instant_bookable',
    'host_listings_count',
    'cleaning_fee',
    'zipcode',
    'days_since_rev',
]

# Report, file by file, which of these columns are absent
check_data_cols(files=numFiles, variables=candidate_columns)

amenities missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
instant_bookable missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
host_listings_count missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
cleaning_fee missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
days_since_rev missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-09-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-11-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-12-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-02-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-04-03_l

  if (await self.run_code(code, result,  async_=asy)):


days_since_rev missing from:
united-states_san-francisco_2016-06-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-07-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-08-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-09-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-10-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-11-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-12-03_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2017-01-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-f

  if (await self.run_code(code, result,  async_=asy)):


days_since_rev missing from:
united-states_san-francisco_2019-11-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2019-12-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-01-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-01-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-02-12_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-03-13_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-04-07_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-05-06_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-f

In [8]:
# Find the columns that exist in every data file, so that later steps only
# rely on variables available for every file.

def common_columns(files, directory=None):
    """
    Return the column names that are present in every file of ``files``.

    Parameters
    ----------
    files : iterable of str
        Data file names, relative to ``directory``.
    directory : str, optional
        Folder that contains ``files``. Defaults to the notebook-level
        ``data_dir``.

    Returns
    -------
    list of str
        The shared column names, in the order in which they appear in the
        first file. Empty when ``files`` is empty.
    """
    if directory is None:
        directory = data_dir

    shared = None
    for my_file in files:
        # Only the header row is read; this is what keeps the loop fast
        header = list(pd.read_csv(os.path.join(directory, my_file), nrows=0).columns)
        if shared is None:
            shared = header
        else:
            present = set(header)
            shared = [column for column in shared if column in present]

    return shared if shared is not None else []

# The previous version also intersected with a hard-coded baseline file
# ('united-states_san-francisco_2015-05-04_listings.csv.gz'). That file is the
# first entry of numFiles in the listing printed earlier, so intersecting over
# numFiles alone gives the same set without tying the cell to one city.
compat = common_columns(numFiles)

In [9]:
# Print out which columns are in the compatible set

wanted_columns = ['amenities', 'instant_bookable','host_listings_count', 'cleaning_fee', 'zipcode']

# One line per wanted column: "Yes" when it is in compat, "NO" otherwise
for column_name in wanted_columns:
    print("Yes" if column_name in compat else "NO")

NO
NO
NO
NO
Yes


In [10]:
# Display files by year

years = np.linspace(2015,2020, 6)

for year in years:
    # linspace yields floats; the file names carry the year as a 4-digit string
    year_label = str(int(year))
    print(year_label)
    for file_name in numFiles:
        # The third '_'-separated field is the date; its first four characters are the year
        if file_name.split('_')[2][0:4] == year_label:
            print(file_name)
    print("--------------------------------------------")

2015
united-states_san-francisco_2015-05-04_listings.csv.gz
united-states_san-francisco_2015-09-02_listings.csv.gz
united-states_san-francisco_2015-11-01_listings.csv.gz
united-states_san-francisco_2015-12-02_listings.csv.gz
--------------------------------------------
2016
united-states_san-francisco_2016-02-02_listings.csv.gz
united-states_san-francisco_2016-04-03_listings.csv.gz
united-states_san-francisco_2016-05-02_listings.csv.gz
united-states_san-francisco_2016-06-02_listings.csv.gz
united-states_san-francisco_2016-07-02_listings.csv.gz
united-states_san-francisco_2016-08-02_listings.csv.gz
united-states_san-francisco_2016-09-02_listings.csv.gz
united-states_san-francisco_2016-10-01_listings.csv.gz
united-states_san-francisco_2016-11-02_listings.csv.gz
united-states_san-francisco_2016-12-03_listings.csv.gz
--------------------------------------------
2017
united-states_san-francisco_2017-01-01_listings.csv.gz
united-states_san-francisco_2017-02-02_listings.csv.gz
united-states_s

In [13]:
def concat_spreadsheets(START, END, files=None, directory=None):
    
    """
    Read a contiguous run of listing files and stack them into one DataFrame.

    Parameters
    ----------
    START, END : int
        Slice bounds into ``files``; ``files[START:END]`` is read, so END is
        exclusive.
    files : sequence of str, optional
        File names to slice. Defaults to the notebook-level ``numFiles``.
    directory : str, optional
        Folder that contains the files. Defaults to the notebook-level
        ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The rows of all selected files stacked vertically with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the slice selects no files.
    """
    
    if files is None:
        files = numFiles
    if directory is None:
        directory = data_dir

    selected = files[START:END]
    if len(selected) == 0:
        # pd.concat would raise a ValueError as well, but without saying
        # which slice was empty
        raise ValueError("No files selected by slice [" + str(START) + ":" + str(END) + "]")

    sheets_df = [
        pd.read_csv(os.path.join(directory, filename), index_col=None, header=0)
        for filename in selected
    ]
    return pd.concat(sheets_df, axis=0, ignore_index=True)

In [14]:
# concat_spreadsheets reads numFiles[START:END] with END exclusive, so this
# loads exactly one file: numFiles[1].
# NOTE(review): the "# 2015" label suggests a whole year was intended - confirm.
sheet15 = concat_spreadsheets(1, 2) # 2015

In [12]:
# Concatenate the listing files year by year. The year is read from each
# file name (third '_'-separated field), so no index range is hard-coded and
# every year present in the range, including 2016, gets its own sheet.

# Same file range as the hard-coded version covered: the first file and the
# last two files are left out.
# NOTE(review): the reason for skipping those files is not stated - confirm.
first_file = 1
last_file = len(numFiles) - 2

# numFiles is sorted, so the files of one year are contiguous and can be
# described by a single [start, stop) slice per year.
year_slices = {}
for position in range(first_file, last_file):
    year = numFiles[position].split('_')[2][0:4]
    start, _ = year_slices.get(year, (position, position))
    year_slices[year] = (start, position + 1)

yearly_sheets = {
    year: concat_spreadsheets(start, stop)
    for year, (start, stop) in year_slices.items()
}

# Keep the per-year names used elsewhere in the notebook (None if a year has no files)
sheet15 = yearly_sheets.get('2015')
sheet16 = yearly_sheets.get('2016')
sheet17 = yearly_sheets.get('2017')
sheet18 = yearly_sheets.get('2018')
sheet19 = yearly_sheets.get('2019')
sheet20 = yearly_sheets.get('2020')

# Unique listing ids across all selected files, in order of first appearance
uniq_all = pd.concat(
    [sheet.id for sheet in yearly_sheets.values()], axis=0, ignore_index=True
).unique()

FileNotFoundError: [Errno 2] No such file or directory: 'united-states_san-francisco_2015-09-02_listings.csv.gz'

In [None]:
# Here I'm seeing if there is an issue with the DataFrame "sheet18"
# NOTE(review): the comment names sheet18 but the code filters sheet16 -
# confirm which frame is intended and that it exists before running this cell.
# Shows a few identifying columns for the listing with id 2134100.
sheet16[sheet16['id']==2134100][['id','host_name','host_since','host_id','last_scraped','price']]

# 1. Create wide form of full data

In [None]:
# HARRISON: Once the unique ids have been correctly concatenated then the wide form function should work.

In [None]:
def wide_form(UNIQ_IDS, START, END, METRICS): 
    
    """
    Build a wide-form table: one row per listing id and one group of columns
    per listings file.

    For every file in ``numFiles[START:END]`` the requested metrics are read
    and attached to the listing ids as columns named ``<metric><n>``, where
    ``n`` is the file's position in ``numFiles``. A ``List_month<n>`` column
    flags (1/0) whether the listing id appears in that file.

    Parameters
    ----------
    UNIQ_IDS : array-like
        Listing ids; one output row is produced per entry.
    START, END : int
        Slice bounds into ``numFiles`` (END exclusive).
    METRICS : sequence of str
        Column names to extract from each file. Must contain 'id'; its
        position in the sequence does not matter.

    Returns
    -------
    pandas.DataFrame
        Column 'id' followed, for each file, by the metric columns and the
        ``List_month<n>`` flag.

    Raises
    ------
    KeyError
        If 'id' is not in METRICS.
    ValueError
        If a requested metric is missing from a file's header.
    """
    
    bnb_metrics = list(METRICS)
    if 'id' not in bnb_metrics:
        raise KeyError("METRICS must contain 'id'")

    # 'id' is the merge key, not a value column. It is excluded by name
    # because METRICS is not guaranteed to list it first.
    value_metrics = [metric for metric in bnb_metrics if metric != 'id']

    listing_df = pd.DataFrame(UNIQ_IDS)
    listing_df.columns = ['id']
    
    print("Number of unique listings: " + str(len(listing_df)))

    # Collected as {column name: values} and attached in one go at the end
    new_columns = {}

    for date_count, filename in enumerate(numFiles[START:END], start=START):

        path = os.path.join(data_dir, filename)

        # The listing files are gzip-compressed, so they must be opened in
        # text mode through gzip. 'utf-8-sig' also strips a BOM if present.
        opener = gzip.open if str(filename).endswith('.gz') else open
        with opener(path, mode='rt', newline='', encoding='utf-8-sig') as handle:
            reader = csv.reader(handle, delimiter=',')
            headers = next(reader)

            # Position of every requested metric in this file's header
            positions = [headers.index(metric) for metric in bnb_metrics]

            # Every cell goes through IntorStr: int when possible, else str
            row_values = [
                [IntorStr(row[position]) for position in positions]
                for row in reader
            ]

        values_df = pd.DataFrame(row_values, columns=bnb_metrics)
        values_df = values_df.drop_duplicates(subset='id', keep='last')

        # A left merge keeps exactly one row per listing id, in the order of
        # listing_df, so the merged columns line up row by row with the
        # output. An outer merge does not guarantee that order.
        merged_df = pd.merge(listing_df, values_df, how='left', on='id')

        for metric in value_metrics:
            new_columns[metric + str(date_count)] = merged_df[metric].to_numpy()

        # See if the observation is in the month data
        in_file = listing_df['id'].isin(np.array(values_df['id'])) * 1
        new_columns['List_month' + str(date_count)] = in_file.to_numpy()

    if not new_columns:
        return listing_df.copy()

    output_df = pd.concat(
        [listing_df, pd.DataFrame(new_columns, index=listing_df.index)], axis=1
    )
    return output_df

In [None]:
# THIS SAVES WIDE FORMAT DATASET
# Builds the wide table for numFiles[1:len(numFiles)-2] using the columns in
# compat, then writes it to Data_wideALL_v2.csv with gzip compression.
# NOTE(review): csv_save_path is not assigned in any cell shown in this
# notebook - confirm where it is defined.
# NOTE(review): the file is gzip-compressed although its name ends in '.csv'.

wideALL = wide_form(uniq_all, 1, len(numFiles)-2, compat)
wideALL.to_csv(os.path.join(csv_save_path,r'Data_wideALL_v2.csv'), compression = 'gzip')

In [None]:
# Column names of the wide-form table, as a plain list
wideALL.columns.tolist()

In [None]:
# Names that are both a wide-form column and an entry of compat
x = set(wideALL.columns)
y = set(compat)
x & y

In [None]:
# Drop 'id' from compat. The membership check makes the cell safe to re-run:
# a second unguarded remove() would raise ValueError.
if 'id' in compat:
    compat.remove('id')

---------

# 2. Create long form of full data

In [None]:
def long_form(df, METRICS):
    """
    Reshape a wide-form table into long form with one row per (id, month).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with an 'id' column and columns named ``<stub><number>``
        for 'List_month' and for each metric.
    METRICS : sequence of str
        Metric names used as column stubs. 'id' is ignored wherever it
        appears, since it is the row identifier rather than a stub.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.wide_to_long`` indexed by ('id', 'month').
    """
    # 'id' is excluded by name, not by position: slicing off the first entry
    # drops a real metric whenever 'id' is not listed first.
    stubnames = ['List_month'] + [metric for metric in METRICS if metric != 'id']
    long_df = pd.wide_to_long(df, stubnames=stubnames, i='id', j='month')
    
    return long_df

In [None]:
# Build the long-form table from the wide-form one. This assignment has to
# run: without it longALL is undefined on a fresh kernel.
longALL = long_form(wideALL, compat)

# Save the long-form dataset gzip-compressed.
# NOTE(review): csv_save_path is not assigned in any cell shown in this
# notebook - confirm where it is defined.
longALL.to_csv(os.path.join(csv_save_path,r'Data_longALL_v3.csv.gz'), compression='gzip')

In [None]:
# Distinct last_scraped values in the long-form table
longALL.reset_index().loc[:, 'last_scraped'].unique()

# Appendix

In [None]:
# Hand-written list of metrics of interest
airbnb_full_metrics = [
    'id', 'last_scraped', 'host_id', 'host_name', 'host_since', 'host_location',
    'host_response_time', 'host_response_rate', 'host_is_superhost', 'host_listings_count',
    'host_total_listings_count', 'neighbourhood', 'neighbourhood_cleansed', 'city', 'street', 'zipcode',
    'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price',
    'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'calendar_updated', 'calendar_last_scraped',
    'has_availability', 'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'requires_license', 'license', 'instant_bookable', 'cancellation_policy',
    'calculated_host_listings_count', 'reviews_per_month', 'amenities',
]

# Column names of a reference listings file, read from the working directory
okchild = pd.read_csv('united-states_austin_2020-03-17_listings.csv')
airbnb_metrics = okchild.columns.tolist()

# Columns to exclude from the reference file's column list
to_remove = [
    'host_total_listings_count', 'security_deposit', 'has_availability', 'cancellation_policy',
    'access', 'interaction', 'house_rules',
    'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'number_of_reviews_ltm', 'is_business_travel_ready',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'summary',
]

# Reference columns minus the excluded ones, original order kept
airbnb_metrics_compat = [name for name in airbnb_metrics if name not in to_remove]#airbnb_full_metrics