# Aggregate Dataset

* This Notebook concatenates Airbnb listing files from http://insideairbnb.com/ and creates both wide form and long form aggregate datasets.
* The datasets are represent a balanced panel for a given set of unique Airbnb listings. When a listing doesn't appear in a given month it is still asssigned an entry in the dataset.

## Import and Setup

In [1]:
import os
import csv
import numpy as np
import pandas as pd
import gzip
from itertools import compress

In [2]:
city_folder = '/united-states_portland'
city_abbrev = 'POR'

In [3]:
# Store preliminary directory, use of os should make this compatible for any user with access to the repository
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Make sure repository has a 0. Raw data folder!
data_dir = cwd2 + '/0. Raw data' + city_folder

# Revert to preliminary directory
os.chdir(cwd1)

In [4]:
# This function converts values into an integer, if it fails return a string.

def IntorStr(value):
    try:
        return int(value)
    except:
        return str(value)

## Collecting listings.csv.gz files

In [5]:
# Collect the listings CSVs

numFiles = []
fileNames = os.listdir(data_dir)
for fileNames in fileNames:
    if fileNames.endswith("_listings.csv.gz"):
        numFiles.append(fileNames)
    
city = numFiles[0].split("_")[1]
print(city)

# Count the number of files
numFiles = np.sort(numFiles)
print(len(numFiles))

# Take a look at the first 5 listing files
print(numFiles[:5])

portland
39
['united-states_portland_2015-03-01_listings.csv.gz'
 'united-states_portland_2015-05-12_listings.csv.gz'
 'united-states_portland_2015-09-02_listings.csv.gz'
 'united-states_portland_2015-11-02_listings.csv.gz'
 'united-states_portland_2015-12-02_listings.csv.gz']


## Check if a file is missing specific columns
The loop below accepts a list of data file names and a list of column names and then prints if a file is missing a particular variable.

In [6]:
# First take a look at columns in first four files

for my_file in numFiles[0:4]:
    data = pd.read_csv(data_dir + '/' + my_file, encoding= 'iso-8859-1')
    data_columns = list(data.columns)
    print("N columns: " + str(len(data_columns)))

    if my_file == numFiles[0]:
        print(data_columns)
    
    print("---------------------")
    
# Delete first two files because they are missing some variables of interest
numFiles = np.delete(numFiles, [0,1])

N columns: 64
['Unnamed: 0', 'id', 'scrape_id', 'last_scraped', 'name', 'picture_url', 'host_id', 'host_name', 'host_since', 'host_picture_url', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'city', 'state', 'zipcode', 'market', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_is_superhost', 'host_about', 'host_neighbourhood', 'host_location', 'host_acceptance_rate', 'host_listings_count', 'host_tot

In [7]:
def check_data_cols(files, variables):
    
    """ 
    This function accepts a list of data file names (strings) 
    and a list of column names (strings) and then prints if 
    a file is missing a particular variable.
    """
    
    for my_file in files:
        data = pd.read_csv(data_dir + '/' + my_file, encoding = 'iso-8859-1')
        data_columns = list(data.columns)
        
        for my_column in variables:
            if my_column not in data_columns:
                print(my_column + " missing from:")
                print(my_file)
        
        print("------")

In [8]:
# This loop is relatively slow. It takes about a minute and a half.
def find_compatible_columns(most_missing_columns):
    
    """ 
    This function accepts a filename (string). The function then 
    returns a list of the column names that exist in all files
    in numFiles based on the column names of the passed filename.
    """
    
    a = set(most_missing_columns.columns)
    
    for i in numFiles:
        data = pd.read_csv(data_dir + '/'  + i, encoding = 'iso-8859-1')
        data_columns = set(data.columns)
    
        if i == numFiles[0]:
            compat = data_columns.intersection(a)
        
        compat = compat.intersection(data_columns)
    
    compat = list(compat)
    
    return compat

In [9]:
# This cell is city-specific. Need to be careful if not using Portland!

# This feeds the file with the most missing columns into the above function to create a compatible column set for all the data..
most_missing = pd.read_csv(data_dir  + '/united-states_portland_2015-09-02_listings.csv.gz', encoding='iso-8859-1')
compatible = find_compatible_columns(most_missing)

# Make id the first cell, I think this is more important to the long_form code than deleting id!
compatible.remove('id')
compatible.insert(0, 'id')

In [10]:
# This code creates and displays a list of important variables not found in the compatible set.

airbnb_metrics = ['id', 'last_scraped', 'host_id', 'host_name', 
                  'host_since', 'host_location', 'host_response_time', 'host_response_rate',
                  'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'neighbourhood',
                  'neighbourhood_cleansed', 'street', 'zipcode', 'latitude', 
                  'longitude', 'is_location_exact', 'property_type', 'room_type', 
                  'accommodates', 'bathrooms', 'bedrooms', 'beds', 
                  'bed_type', 'square_feet', 'price', 'weekly_price',
                  'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included',
                  'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 
                  'calendar_last_scraped', 'has_availability', 'availability_30', 'availability_60', 
                  'availability_90', 'availability_365', 'number_of_reviews', 'first_review', 
                  'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 
                  'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 
                  'requires_license', 'license', 'instant_bookable', 'cancellation_policy',
                  'calculated_host_listings_count', 'reviews_per_month', 'amenities']

airbnb_metrics_vs_compatible = list(filter(lambda i: i not in compatible, airbnb_metrics))
airbnb_metrics_vs_compatible

[]

In [11]:
# Uncomment to print which datasets are missing the airbnb metrics data.
# check_data_cols(numFiles, airbnb_metrics_vs_compatible)

In [12]:
# Display files by year

years = np.linspace(2015,2019, 6)

for year in years:
    print(str(int(year)))
    for IND in range(len(numFiles)):
            if numFiles[IND].split('_')[2][0:4] == str(int(year)):
                print(numFiles[IND])
    print("--------------------------------------------")

2015
united-states_portland_2015-09-02_listings.csv.gz
united-states_portland_2015-11-02_listings.csv.gz
united-states_portland_2015-12-02_listings.csv.gz
--------------------------------------------
2015
united-states_portland_2015-09-02_listings.csv.gz
united-states_portland_2015-11-02_listings.csv.gz
united-states_portland_2015-12-02_listings.csv.gz
--------------------------------------------
2016
united-states_portland_2016-01-01_listings.csv.gz
united-states_portland_2016-02-03_listings.csv.gz
united-states_portland_2016-04-05_listings.csv.gz
united-states_portland_2016-05-03_listings.csv.gz
united-states_portland_2016-06-03_listings.csv.gz
united-states_portland_2016-07-04_listings.csv.gz
united-states_portland_2016-08-04_listings.csv.gz
united-states_portland_2016-09-04_listings.csv.gz
united-states_portland_2016-11-06_listings.csv.gz
united-states_portland_2016-12-08_listings.csv.gz
--------------------------------------------
2017
united-states_portland_2017-01-04_listings.cs

In [13]:
def concat_spreadsheets(concat_year):
    
    """
    This function accepts a starting and ending index of numFiles 
    as arguments and returns a cocatanated dataframe of the csv data 
    corresponding to the inputted indexes.
    """  
    yearly_numfiles_bool = []
    for filename in numFiles:
        if (filename.split('_')[2][0:4] == str(int(concat_year))):
            yearly_numfiles_bool.append(True)
        else:
            yearly_numfiles_bool.append(False)
    
    sheets_df = []
    yearly_numfiles = list(compress(numFiles, yearly_numfiles_bool))
    for filename in yearly_numfiles:
        df = pd.read_csv(data_dir + '/' + filename, index_col = None, header=0, encoding='iso-8859-1') 
        sheets_df.append(df)
        
    sheets_df = pd.concat(sheets_df, axis=0, ignore_index=True)
    return sheets_df

In [14]:
# This code concatenates all datframes for a given year.

# Save the concatenated frames as separate files
sheet15 = concat_spreadsheets(2015) # 2015
sheet16 = concat_spreadsheets(2016) # 2016
sheet17 = concat_spreadsheets(2017) # 2017
sheet18 = concat_spreadsheets(2018) # 2018
sheet19 = concat_spreadsheets(2019) # 2019

# Identify all unique listings ids across datasets
uniq_all = pd.concat([sheet15.id, sheet16.id, 
                      sheet17.id, sheet18.id, 
                      sheet19.id], axis=0, ignore_index=True).unique()

print(uniq_all)

[ 4986792  3883718  7092722 ... 32129035 32139925 32164834]


## Create wide form of full data

In [15]:
def wide_form(UNIQ_IDS, START, END, METRICS): 
    
    """
    This function takes a list of Airbnb unique ids as well as start
    and end indexes for said list. Then, for the selected ID's
    it returns a dataframe of the relevant data in a wide format.
    """
    
    listing_df = pd.DataFrame(UNIQ_IDS)
    listing_df.columns = ['id']
    
    print("Number of unique listings: " + str(len(listing_df)))
    
    output_df = listing_df.copy()    
    
    date_count = START
    for i in numFiles[START:END]:
 
        # Read in gzip compressed files
        file = gzip.open(os.path.join(data_dir, i), 'rt')  
        reader = csv.reader(file, delimiter=',')
        headers = next(reader)

        bnb_metrics = METRICS
        
        d={}
        for j in bnb_metrics:
            d[str(j)+"_index"] = headers.index(j)
#             print(str(j))
#             print(d[str(j)+"_index"])

        row_values = []

        for row in reader:
            value_i = []

            for j in bnb_metrics:
                value_j = IntorStr(row[d[str(j)+"_index"]])
                value_i.append(value_j)

            row_values.append(value_i)
        
        values_df = pd.DataFrame(row_values) # Create a dataframe for the row_values      
        values_df.columns = bnb_metrics # Set column titles
        values_df = values_df.drop_duplicates(subset='id', keep='last')

        # Merge the values with their respective id and drop duplicates
        merged_df = pd.merge(listing_df, values_df, how='outer', on='id')
        merged_df = merged_df.drop_duplicates(keep='first')   
        merged_df = merged_df.reset_index()

        for k in bnb_metrics[1:]: 
            output_df[k + str(date_count)] = merged_df[k]

        output_df['List_month'+str(date_count)] = listing_df['id'].isin(np.array(values_df['id']))*1 # See if the observation is in the month data  

        date_count += 1
        
    return output_df

-----

In [16]:
# Wide format for all
wideALL = wide_form(uniq_all, 0, len(numFiles), airbnb_metrics)

Number of unique listings: 11307


  output_df[k + str(date_count)] = merged_df[k]
  output_df['List_month'+str(date_count)] = listing_df['id'].isin(np.array(values_df['id']))*1 # See if the observation is in the month data


In [17]:
# Save wide dataframe to local directory
wideALL.to_csv(city_abbrev + '_Data_wideALL.csv.gz', 
               compression = 'gzip', index=False)

---------

# Create long form of full data and save

In [18]:
def long_form(df, METRICS):
    """
    Creates a long form data frame when provided with 
    a wide form dataset.
    """
    bnb_metrics = METRICS
    mylist = ['List_month']
    mylist.extend(bnb_metrics[1:])
    long_df = pd.wide_to_long(df, stubnames=mylist, i='id', j='month')
    
   
    return long_df

In [19]:
# Full long dataframe creation.
longALL = long_form(wideALL, airbnb_metrics)

# Resets the long dataframe's index
longALL = longALL.reset_index()

# Save long dataframe to local directory.
longALL.to_csv(city_abbrev + '_Data_longALL.csv.gz', 
               compression='gzip', index=False)