# First version created: October 17, 2019

This Notebook concatenates Airbnb listing files from http://insideairbnb.com/ and creates both wide form and long form aggregate datasets.

# 0. Set-up

In [1]:
# Importing Python libraries

import os
import csv
import numpy as np
import pandas as pd
import gzip
from itertools import compress

In [2]:
# Store preliminary directory, use of os should make this compatible for any user with access to the repository
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Make sure repository has a 0. Raw data folder!
data_dir = cwd2 + '/0. Raw data'

# Revert to preliminary directory
os.chdir(cwd1)

In [3]:
# This function converts values into an integer, if it fails return a string.

def IntorStr(value):
    try:
        return int(value)
    except:
        return str(value)

# 1. Collecting listings.csv.gz files

In [4]:
# Collect the listings CSVs

numFiles = []
fileNames = os.listdir(data_dir)
for fileNames in fileNames:
    if fileNames.endswith("_listings.csv.gz"):
        numFiles.append(fileNames)
    
city = numFiles[0].split("_")[1]
print(city)

# Count the number of files
numFiles = np.sort(numFiles)
print(len(numFiles))

# Take a look at the first 10 listing files
print(numFiles[0:10])

san-francisco
59
['united-states_san-francisco_2015-05-04_listings.csv.gz'
 'united-states_san-francisco_2015-09-02_listings.csv.gz'
 'united-states_san-francisco_2015-11-01_listings.csv.gz'
 'united-states_san-francisco_2015-12-02_listings.csv.gz'
 'united-states_san-francisco_2016-02-02_listings.csv.gz'
 'united-states_san-francisco_2016-04-03_listings.csv.gz'
 'united-states_san-francisco_2016-05-02_listings.csv.gz'
 'united-states_san-francisco_2016-06-02_listings.csv.gz'
 'united-states_san-francisco_2016-07-02_listings.csv.gz'
 'united-states_san-francisco_2016-08-02_listings.csv.gz']


In [5]:
#look at full numFiles
print(numFiles)

['united-states_san-francisco_2015-05-04_listings.csv.gz'
 'united-states_san-francisco_2015-09-02_listings.csv.gz'
 'united-states_san-francisco_2015-11-01_listings.csv.gz'
 'united-states_san-francisco_2015-12-02_listings.csv.gz'
 'united-states_san-francisco_2016-02-02_listings.csv.gz'
 'united-states_san-francisco_2016-04-03_listings.csv.gz'
 'united-states_san-francisco_2016-05-02_listings.csv.gz'
 'united-states_san-francisco_2016-06-02_listings.csv.gz'
 'united-states_san-francisco_2016-07-02_listings.csv.gz'
 'united-states_san-francisco_2016-08-02_listings.csv.gz'
 'united-states_san-francisco_2016-09-02_listings.csv.gz'
 'united-states_san-francisco_2016-10-01_listings.csv.gz'
 'united-states_san-francisco_2016-11-02_listings.csv.gz'
 'united-states_san-francisco_2016-12-03_listings.csv.gz'
 'united-states_san-francisco_2017-01-01_listings.csv.gz'
 'united-states_san-francisco_2017-02-02_listings.csv.gz'
 'united-states_san-francisco_2017-03-02_listings.csv.gz'
 'united-state

## 1 (a). Check if a file is missing specific columns
The loop below accepts a list of data file names and a list of column names and then prints if a file is missing a particular variable.

In [6]:
# First take a look at columns in a file

for my_file in numFiles:
    data = pd.read_csv(data_dir + '/' + my_file, encoding= 'iso-8859-1')
    data_columns = list(data.columns)
    print("N columns: " + str(len(data_columns)))
    print(data_columns)
    
    print("---------------------")

N columns: 68
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_picture_url', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_sc

N columns: 92
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleani

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 95
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 96
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mon

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

N columns: 106
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'mo

In [7]:
def check_data_cols(files, variables):
    
    """ 
    This function accepts a list of data file names (strings) 
    and a list of column names (strings) and then prints if 
    a file is missing a particular variable.
    """
    
    for my_file in files:
        data = pd.read_csv(data_dir + '/' + my_file, encoding = 'iso-8859-1')
        data_columns = list(data.columns)
        
        for my_column in variables:
            if my_column not in data_columns:
                print(my_column + " missing from:")
                print(my_file)

        print("----------------------------------")

In [8]:
candidate_columns = ['amenities', 'instant_bookable',
           'host_listings_count', 'cleaning_fee', 
           'zipcode','days_since_rev','price', 'host_id']

check_data_cols(numFiles, candidate_columns)

amenities missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
instant_bookable missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
host_listings_count missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
cleaning_fee missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
days_since_rev missing from:
united-states_san-francisco_2015-05-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-09-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-11-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2015-12-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-02-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-04-03_l

  if (await self.run_code(code, result,  async_=asy)):


days_since_rev missing from:
united-states_san-francisco_2016-06-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-07-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-08-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-09-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-10-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-11-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2016-12-03_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2017-01-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-f

  if (await self.run_code(code, result,  async_=asy)):


days_since_rev missing from:
united-states_san-francisco_2019-11-01_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2019-12-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-01-02_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-01-04_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-02-12_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-03-13_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-04-07_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-francisco_2020-05-06_listings.csv.gz
----------------------------------
days_since_rev missing from:
united-states_san-f

In [9]:
# THIS LOOP IS PRETTY SLOW. I'M NOT SURE WE NEED TO KEEP IT.
def find_compatible_columns(most_missing_columns):
    
    """ 
    This function accepts a filename (string). The function then 
    returns a list of the column names that exist in all files
    in numFiles based on the column names of the passed filename.
    """
    
    a = set(most_missing_columns.columns)
    
    for i in numFiles:
        data = pd.read_csv(data_dir + '/'  + i, encoding = 'iso-8859-1')
        data_columns = set(data.columns)
    
        if i == numFiles[0]:
            compat = data_columns.intersection(a)
        
        compat = compat.intersection(data_columns)
    
    compat = list(compat)
    
    return compat

In [10]:
most_missing = pd.read_csv(data_dir + '/'  + 'united-states_san-francisco_2015-05-04_listings.csv.gz', encoding='iso-8859-1')
compatible = find_compatible_columns(most_missing)
compatible


['calculated_host_listings_count',
 'market',
 'guests_included',
 'longitude',
 'beds',
 'calendar_last_scraped',
 'accommodates',
 'bed_type',
 'price',
 'monthly_price',
 'zipcode',
 'minimum_nights',
 'review_scores_cleanliness',
 'requires_license',
 'maximum_nights',
 'scrape_id',
 'jurisdiction_names',
 'id',
 'neighbourhood_group_cleansed',
 'last_review',
 'picture_url',
 'square_feet',
 'host_id',
 'bathrooms',
 'extra_people',
 'review_scores_checkin',
 'listing_url',
 'calendar_updated',
 'availability_30',
 'neighbourhood_cleansed',
 'street',
 'license',
 'description',
 'host_since',
 'neighbourhood',
 'property_type',
 'city',
 'review_scores_communication',
 'latitude',
 'number_of_reviews',
 'host_name',
 'host_picture_url',
 'is_location_exact',
 'host_is_superhost',
 'host_url',
 'room_type',
 'availability_60',
 'availability_90',
 'review_scores_value',
 'review_scores_rating',
 'last_scraped',
 'review_scores_location',
 'state',
 'host_about',
 'name',
 'space',

In [11]:
# Print out which columns are in the compatible set

for i in ['amenities', 'instant_bookable','host_listings_count', 'cleaning_fee', 'zipcode', 'host_picture_url']:
    if i in compatible:
        print("Yes")
    else:
        print("NO")

NO
NO
NO
NO
Yes
Yes


In [12]:
# Display files by year

years = np.linspace(2015,2020, 6)

for year in years:
    print(str(int(year)))
    for IND in range(len(numFiles)):
            if numFiles[IND].split('_')[2][0:4] == str(int(year)):
                print(numFiles[IND])
    print("--------------------------------------------")

2015
united-states_san-francisco_2015-05-04_listings.csv.gz
united-states_san-francisco_2015-09-02_listings.csv.gz
united-states_san-francisco_2015-11-01_listings.csv.gz
united-states_san-francisco_2015-12-02_listings.csv.gz
--------------------------------------------
2016
united-states_san-francisco_2016-02-02_listings.csv.gz
united-states_san-francisco_2016-04-03_listings.csv.gz
united-states_san-francisco_2016-05-02_listings.csv.gz
united-states_san-francisco_2016-06-02_listings.csv.gz
united-states_san-francisco_2016-07-02_listings.csv.gz
united-states_san-francisco_2016-08-02_listings.csv.gz
united-states_san-francisco_2016-09-02_listings.csv.gz
united-states_san-francisco_2016-10-01_listings.csv.gz
united-states_san-francisco_2016-11-02_listings.csv.gz
united-states_san-francisco_2016-12-03_listings.csv.gz
--------------------------------------------
2017
united-states_san-francisco_2017-01-01_listings.csv.gz
united-states_san-francisco_2017-02-02_listings.csv.gz
united-states_s

In [13]:
def concat_spreadsheets(concat_year):
    
    """
    This function accepts a starting and ending index of numFiles 
    as arguments and returns a cocatanated dataframe of the csv data 
    corresponding to the inputted indexes.
    """  
    yearly_numfiles_bool = []
    for filename in numFiles:
        if (filename.split('_')[2][0:4] == str(int(concat_year))):
            yearly_numfiles_bool.append(True)
        else:
            yearly_numfiles_bool.append(False)
    
    sheets_df = []
    yearly_numfiles = list(compress(numFiles, yearly_numfiles_bool))
    for filename in yearly_numfiles:
        df = pd.read_csv(data_dir + '/' + filename, index_col = None, header=0, compression='gzip', encoding='iso-8859-1') 
        sheets_df.append(df)
        
    sheets_df = pd.concat(sheets_df, axis=0, ignore_index=True)
    return sheets_df

In [14]:
# FOR HARRISON: Make sure yearly concatenates are correct here and that they are no
# longer hard-coded. You shouldn't have to manually enter all of the files
# NOTE FROM HARRISON: I got rid of hardcoding. It's a little slow but not too bad.

# Save the concatenated sheets as separate files
sheet15 = concat_spreadsheets(2015) # 2015
sheet16 = concat_spreadsheets(2016) # 2016
sheet17 = concat_spreadsheets(2017) #2017
sheet18 = concat_spreadsheets(2018) #2018
sheet19 = concat_spreadsheets(2019) # 2019

uniq_all = pd.concat([sheet15.id, sheet16.id, sheet17.id, sheet18.id, sheet19.id], axis=0, ignore_index=True).unique()

In [15]:
uniq_all

array([  295240,  4882829,  4850420, ..., 40547706, 40560328, 40569280])

In [16]:
# Here I'm seeing if there is an issue with the DataFrame "sheet18"
sheet18[sheet18['id']==2134100][['id','host_name','host_since','host_id','last_scraped','price']]

Unnamed: 0,id,host_name,host_since,host_id,last_scraped,price
13,2134100,Marty And Alexis,2013-12-29,10889606,2018-01-10,$130.00
8584,2134100,Marty And Alexis,2013-12-29,10889606,2018-01-17,$115.00
12586,2134100,Marty And Alexis,2013-12-29,10889606,2018-02-02,$80.00
17319,2134100,Marty And Alexis,2013-12-29,10889606,2018-03-04,$100.00
21267,2134100,Marty And Alexis,2013-12-29,10889606,2018-04-06,$100.00
26942,2134100,Marty And Alexis,2013-12-29,10889606,2018-05-09,$121.00
31683,2134100,Marty And Alexis,2013-12-29,10889606,2018-07-05,$121.00
36577,2134100,Marty And Alexis,2013-12-29,10889606,2018-08-06,$121.00
43184,2134100,Marty And Alexis,2013-12-29,10889606,2018-09-08,$121.00
49708,2134100,Marty And Alexis,2013-12-29,10889606,2018-10-03,$121.00


# 1. Create wide form of full data

In [17]:
# HARRISON: Once the unique ids have been correctly concatenated then the wide form function should work.

In [18]:
raw_data_path = data_dir
def wide_form(UNIQ_IDS, START, END, METRICS): 
    
    """
    This function takes a list of Airbnb unique ids as well as start
    and end indexes for said list. Then, for the selected ID's
    it returns a dataframe of the relevant data in a wide format.
    """
    
    listing_df = pd.DataFrame(UNIQ_IDS)
    listing_df.columns = ['id']
    
    print("Number of unique listings: " + str(len(listing_df)))
    
    output_df = listing_df.copy()    
    
    date_count = START
    for i in numFiles[START:END]:
 
        file = open(os.path.join(raw_data_path, i), encoding= 'ISO-8859-1')
        reader = csv.reader(file, delimiter=',')
        headers = next(reader)

        bnb_metrics = METRICS
        
        # Debugging print(headers)
        # Debugging print(type(headers))

        d={}
        for j in bnb_metrics:
            d[str(j)+"_index"] = headers.index(j)
            print(str(j))
            print(d[str(j)+"_index"])

        row_values = []

        for row in reader:
            value_i = []

            for j in bnb_metrics:
                value_j = IntorStr(row[d[str(j)+"_index"]])
                value_i.append(value_j)

            row_values.append(value_i)
        
        values_df = pd.DataFrame(row_values) # Create a dataframe for the row_values      
        values_df.columns = bnb_metrics # Set column titles
        values_df = values_df.drop_duplicates(subset='id', keep='last')

        # Merge the values with their respective id and drop duplicates
        merged_df = pd.merge(listing_df, values_df, how='outer', on='id')
        merged_df = merged_df.drop_duplicates(keep='first')   
        merged_df = merged_df.reset_index()

        for k in bnb_metrics[1:]: # I don't want this loop to include 'id', hence [1:] range is used
            output_df[k + str(date_count)] = merged_df[k]

        output_df['List_month'+str(date_count)] = listing_df['id'].isin(np.array(values_df['id']))*1 # See if the observation is in the month data  

        date_count += 1
        
    return output_df

In [19]:
# THIS SAVES WIDE FORMAT DATASET

wideALL = wide_form(uniq_all, 1, len(numFiles)-2, compatible)

Number of unique listings: 32030


ValueError: 'calculated_host_listings_count' is not in list

In [None]:
wideALL.to_csv(os.path.join(csv_save_path,r'Data_wideALL_v2.csv'), compression = 'gzip')

In [None]:
list(wideALL.columns)

In [None]:
x = set(wideALL.columns)
y = set(compatible)
x.intersection(y)

In [None]:
compatible.remove('id')

---------

# 2. Create long form of full data

In [None]:
def long_form(df, METRICS):
    bnb_metrics = METRICS
    mylist = ['List_month']
    mylist.extend(bnb_metrics[1:])
    long_df = pd.wide_to_long(df, stubnames=mylist, i='id', j='month')
    
    return long_df

In [None]:
#longALL = long_form(wideALL, compat)
#longALL.to_csv(os.path.join(csv_save_path,r'Data_longALL_v1.csv'))
longALL.to_csv(os.path.join(csv_save_path,r'Data_longALL_v3.csv.gz'), compression='gzip')

In [None]:
longALL.reset_index()['last_scraped'].unique()

# Appendix

In [None]:
airbnb_full_metrics = ['id', 'last_scraped', 'host_id', 'host_name', 'host_since', 'host_location', 
                       'host_response_time', 'host_response_rate', 'host_is_superhost', 'host_listings_count', 
                       'host_total_listings_count', 'neighbourhood','neighbourhood_cleansed', 'city', 'street', 'zipcode', 
                       'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates',
                       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price',
                       'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 
                       'minimum_nights', 'maximum_nights', 'calendar_updated', 'calendar_last_scraped', 
                       'has_availability', 'availability_30', 'availability_60', 'availability_90', 
                       'availability_365', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating',
                       'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
                       'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                       'requires_license', 'license', 'instant_bookable', 'cancellation_policy', 
                       'calculated_host_listings_count', 'reviews_per_month', 'amenities']
okchild = pd.read_csv('united-states_austin_2020-03-17_listings.csv')
airbnb_metrics = list(okchild.columns)
to_remove = ['host_total_listings_count', 'security_deposit', 'has_availability',  'cancellation_policy', 'access', 'interaction', 'house_rules'
            ,'minimum_minimum_nights', 'maximum_minimum_nights','minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm',
            'maximum_nights_avg_ntm', 'number_of_reviews_ltm','is_business_travel_ready','calculated_host_listings_count_entire_homes',
            'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms','summary']
airbnb_metrics_compat = list(filter(lambda i: i not in to_remove, airbnb_metrics))#airbnb_full_metrics