In [None]:
#### Dataset download

This notebook downloads the required data sets from Inside Airbnb. For the calendar data, multiple calendar data sets are put together, in order to cover the whole year 2019 in one calendar datat set.

In detail, the following data sets are downloaded:
* All available 'listings', 'reviews' and 'calendar' files for Munich and Berlin in 2018 and 2019.
* 'Neighbourhood' geojson files for Munich and Berlin 

In [5]:
import requests
import numpy as np
import pandas as pd
import os
import datetime


In [6]:
# Set up URL and file names for download

base_url_munich_2019 = "http://data.insideairbnb.com/germany/bv/munich/2019-"
base_url_munich_2018 = "http://data.insideairbnb.com/germany/bv/munich/2018-"

base_url_berlin_2019 = "http://data.insideairbnb.com/germany/be/berlin/2019-"
base_url_berlin_2018 = "http://data.insideairbnb.com/germany/be/berlin/2018-"


listings_details_file = "listings.csv.gz"
calendar_file = "calendar.csv.gz"
reviews_details_file = "reviews.csv.gz"
listings_file = "listings.csv"
reviews_file = "reviews.csv"
neighbourhoods_file = "neighbourhoods.csv"
neigbourhoods_json_file = "neighbourhoods.geojson"

file_names = [listings_details_file, calendar_file, reviews_details_file, listings_file, reviews_file, 
              neighbourhoods_file, neigbourhoods_json_file]


In [9]:
# We don't know the exact date the data were made available, so we need to try out each day in the year
# Set up arrays for months and days to iterate through

months = np.array(range(1, 13)).astype(str)
days = np.array(range(1, 32)).astype(str)

def add_leading_zero(arr):
    for i, el in enumerate(arr):
        if (int(el) < 10):
            arr[i] = "0" + el

add_leading_zero(months)
add_leading_zero(days)


import urllib.request
import gzip, shutil
def download_airbnb_files(base_url, file_name, folder_name, year_string):
    for m in reversed(months):
        for d in reversed(days):
            url = base_url + m + '-' + d + '/data/' + file_name
            prefix = folder_name + year_string + '_' + m + '_' + d + '_'
            try:
                #print(prefix + file_name)
                #print(url)
                with urllib.request.urlopen(url) as response, open(prefix + file_name, 'wb') as out_file:
                    #print(url)
                    #print("...downloaded file")
                    data = response.read() 
                    out_file.write(data)
                    out_file.close()
                    if (file_name != calendar_file): 
                        # only for the calender, we need all available files
                        # for the other files, the latest file is enough
                        return
            except Exception as e:
                pass


for f in file_names:
    download_airbnb_files(base_url_munich_2018, f, './Munich/', '2018')
    download_airbnb_files(base_url_munich_2019, f, './Munich/', '2019')
    download_airbnb_files(base_url_berlin_2018, f, './Berlin/', '2018')
    download_airbnb_files(base_url_berlin_2019, f, './Berlin/', '2019')
            


In [4]:
import gzip
import shutil
import os

def extract_files(folder):
    directory = os.fsencode(folder)
    for file_name in os.listdir(directory):
        file_name = str(file_name).strip("b\'")
        parts = file_name.split('_')
        print(parts)
        name = parts[3]
            
        if (name == "calendar.csv.gz"):
            print(name)
            # extract file, add date to file name
            date = parts[0] + '_' + parts[1] + '_' + parts[2]
            
            with gzip.open(folder + file_name, 'r') as f_in, open(folder + date + '_calendar.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    
        elif (name == "listings.csv.gz"):
            print(name)
            # extract file
            with gzip.open(folder + file_name, 'r') as f_in, open(folder + 'listings.csv', 'wb')as f_out:
                shutil.copyfileobj(f_in, f_out)
        
        elif (name == "reviews.csv.gz"):
            # extract file
            with gzip.open(folder + file_name, 'r') as f_in, open(folder + 'reviews.csv', 'wb')as f_out:
                shutil.copyfileobj(f_in, f_out)


extract_files('./Munich/')
extract_files('./Berlin/')

['2019', '03', '15', 'calendar.csv']
['2019', '03', '15', 'calendar.csv.gz']
calendar.csv.gz
['2019', '05', '22', 'calendar.csv']
['2019', '05', '22', 'calendar.csv.gz']
calendar.csv.gz
['2019', '06', '24', 'calendar.csv']
['2019', '06', '24', 'calendar.csv.gz']
calendar.csv.gz
['2019', '07', '16', 'calendar.csv']
['2019', '07', '16', 'calendar.csv.gz']
calendar.csv.gz
['2019', '08', '24', 'calendar.csv']
['2019', '08', '24', 'calendar.csv.gz']
calendar.csv.gz
['2019', '09', '24', 'calendar.csv']
['2019', '09', '24', 'calendar.csv.gz']
calendar.csv.gz
['2019', '10', '20', 'calendar.csv']
['2019', '10', '20', 'calendar.csv.gz']
calendar.csv.gz
['2019', '11', '25', 'calendar.csv']
['2019', '11', '25', 'calendar.csv.gz']
calendar.csv.gz
['2019', '12', '26', 'calendar.csv']
['2019', '12', '26', 'calendar.csv.gz']
calendar.csv.gz
['2019', '12', '26', 'listings.csv']
['2019', '12', '26', 'listings.csv.gz']
listings.csv.gz
['2019', '12', '26', 'reviews.csv']
['2019', '12', '26', 'reviews.csv.

IndexError: list index out of range

In [None]:
# The neighbourhood information is not included, download from 2021
berlin_neighbourhoods = "http://data.insideairbnb.com/germany/be/berlin/2021-02-20/visualisations/neighbourhoods.csv"
berlin_neighbourhoods_json = "http://data.insideairbnb.com/germany/be/berlin/2021-02-20/visualisations/neighbourhoods.geojson"
munich_neighbourhoods = "http://data.insideairbnb.com/germany/bv/munich/2021-02-23/visualisations/neighbourhoods.csv"
munich_neighbourhoods_json = "http://data.insideairbnb.com/germany/bv/munich/2021-02-23/visualisations/neighbourhoods.geojson"

def download_neighbourhood_airbnb_file(url, file_name, folder):
    with urllib.request.urlopen(url) as response, open(folder + file_name, 'wb') as out_file:
        data = response.read() 
        out_file.write(data)
        out_file.close()

download_neighbourhood_airbnb_file(berlin_neighbourhoods, 'neighbourhoods.csv', './Berlin/')
download_neighbourhood_airbnb_file(berlin_neighbourhoods_json, 'neighbourhoods.geojson', './Berlin/')
download_neighbourhood_airbnb_file(munich_neighbourhoods, 'neighbourhoods.csv', './Munich/')
download_neighbourhood_airbnb_file(munich_neighbourhoods_json, 'neighbourhoods.geojson', './Munich/')
    

In [None]:
# Putting together the calendar information for one year:
def get_dates_and_file_names_of_cal_csvs(folder):
    '''
    Based on all calendar files in the given folder, 
    returns a dictionary whith a mapping from a datetime to the corresponding calendar file name.
    '''
    date_to_file_name = dict()
    
    # Build dict with the dates of each calendar files as key and the file name as value
    directory = os.fsencode(folder)
    for file_name in os.listdir(directory):
        file_name = str(file_name).strip("b\'")
        if (file_name.endswith('calendar.csv')):
            parts = file_name.split('_')
            name = parts[3]
            date = datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
            date_to_file_name[date] = file_name
    return date_to_file_name

In [None]:

def read_cal_dfs(folder):
    '''
    Returns a DataFrame put together based on all available calendar files in the given folder.
    Considers from each calendar file only those days before the next calendar file begins.
    The DateFrame lists for each date of the year the average price, number of available listings, 
    and number of bookings, as well as the occupancy rate (bookings / listings)
    '''
    print(folder)
     
    # Build dict with dates and corresponding file names
    date_to_file_name = get_dates_and_file_names_of_cal_csvs(folder)
        
    # For each date, read in the corresponding csv file
    # Consider only data points before next file begins
    # Do some data cleaning and wrangling
    dates = list(date_to_file_name.keys())
    year_df = pd.DataFrame([])
    for i in range(len(dates)):
        
        date = dates[i]
        df = pd.read_csv(folder + date_to_file_name[date], parse_dates=['date'])
           
        if (i < len(dates)-1):
            next_date = dates[i+1]
            
            df_next = pd.read_csv(folder + date_to_file_name[next_date], parse_dates=['date'])
            next_date_in_csv = df_next['date'].min()
            print(next_date_in_csv)
            
            # only the listings until the next dataframe begins
            df = df[df['date'] < next_date_in_csv]
        else:
            df = df[df['date'] < datetime.datetime(2020, 1, 1)]
        
        # Convert 'price' and 'adjusted_price' to float
        if 'price' in df.columns:
            df['price'] = df['price'].str.replace(',', '').str.strip('$').astype(float)
        if 'adjusted_price' in df.columns:
            df['adjusted_price'] = df['adjusted_price'].str.replace(',', '').str.strip('$').astype(float)
                        
        # Convert 'available' to boolean
        df['available'] = df['available'].map({'t': True, 'f': False})
        
        # Add dummy column for count aggregation
        df['listings'] = 1
        # group by date, keep mean for 'available' and 'price' and count for 'listings'
        df = df.groupby('date').agg({'available': ['mean'], 'price': ['mean'], 'listings': ['count']})
        df['available'] = 1-df['available']
        df = df.rename(columns={'available':'occupancy'})
        
        # Add column indicating the date of the calendar file used
        df['calendar_file_date'] = date
        
        df.columns = df.columns.get_level_values(0)
        
        # append this to year_df
        year_df = year_df.append(df)
        
    # Retrieve number of bookings from occupancy rate and number of listings 
    year_df['bookings'] = year_df['occupancy'] * year_df['listings']
    return year_df

year_df_m = read_cal_dfs('./Munich/')
year_df_b = read_cal_dfs('./Berlin/')

In [None]:
date_to_file_name = get_dates_and_file_names_of_cal_csvs('./Munich/')
dates = list(date_to_file_name.keys())
dates

In [None]:
year_df_m.tail(10)

In [None]:
year_df_m

In [None]:
# Write calendar files to csv
year_df_m.to_csv('./data/year_2019_Munich.csv')

year_df_b_2019 = year_df_b[year_df_b.index > datetime.datetime(2018, 12, 1)]
year_df_b_2019.to_csv('./data/year_2019_Berlin.csv')