# Download raw data from original sources
- ~10GB downloaded, 7GB after extracting selected data
- Sources:
    - Airbnb data: http://insideairbnb.com/get-the-data.html, https://public.opendatasoft.com
    - Weather data: www.ecad.eu
- Links at http://data.insideairbnb.com/ are maintained for up to a year; see http://insideairbnb.com/get-the-data.html for more information. This project uses data from January to March 2021.
- Weather archives from ECAD at www.ecad.eu are updated regularly to include new measurements. This project uses weather data up to the end of March 2021.


In [1]:
import pathlib
import os
import requests
import gzip
from zipfile import ZipFile

In [2]:
# Root directory that every download target in this notebook is resolved against.
data_parent = "data"

# Inside Airbnb snapshots: for each city, three monthly "listings" archives
# followed by three monthly "reviews" archives (newest first).
# Each entry pairs the gzip archive URL with the path, relative to
# `data_parent`, where the decompressed CSV is stored.
data_cities = [
    {"url": url, "filepath": filepath}
    for url, filepath in (
        # --- Amsterdam: listings ---
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-03-04/data/listings.csv.gz",
         "cities/Amsterdam/2021-03/listings.csv"),
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-02-08/data/listings.csv.gz",
         "cities/Amsterdam/2021-02/listings.csv"),
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-01-09/data/listings.csv.gz",
         "cities/Amsterdam/2021-01/listings.csv"),
        # --- Amsterdam: reviews ---
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-03-04/data/reviews.csv.gz",
         "cities/Amsterdam/2021-03/reviews.csv"),
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-02-08/data/reviews.csv.gz",
         "cities/Amsterdam/2021-02/reviews.csv"),
        ("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2021-01-09/data/reviews.csv.gz",
         "cities/Amsterdam/2021-01/reviews.csv"),
        # --- Berlin: listings ---
        ("http://data.insideairbnb.com/germany/be/berlin/2021-03-12/data/listings.csv.gz",
         "cities/Berlin/2021-03/listings.csv"),
        ("http://data.insideairbnb.com/germany/be/berlin/2021-02-20/data/listings.csv.gz",
         "cities/Berlin/2021-02/listings.csv"),
        ("http://data.insideairbnb.com/germany/be/berlin/2021-01-19/data/listings.csv.gz",
         "cities/Berlin/2021-01/listings.csv"),
        # --- Berlin: reviews ---
        ("http://data.insideairbnb.com/germany/be/berlin/2021-03-12/data/reviews.csv.gz",
         "cities/Berlin/2021-03/reviews.csv"),
        ("http://data.insideairbnb.com/germany/be/berlin/2021-02-20/data/reviews.csv.gz",
         "cities/Berlin/2021-02/reviews.csv"),
        ("http://data.insideairbnb.com/germany/be/berlin/2021-01-19/data/reviews.csv.gz",
         "cities/Berlin/2021-01/reviews.csv"),
        # --- London: listings ---
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-03-05/data/listings.csv.gz",
         "cities/London/2021-03/listings.csv"),
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-02-09/data/listings.csv.gz",
         "cities/London/2021-02/listings.csv"),
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-01-11/data/listings.csv.gz",
         "cities/London/2021-01/listings.csv"),
        # --- London: reviews ---
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-03-05/data/reviews.csv.gz",
         "cities/London/2021-03/reviews.csv"),
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-02-09/data/reviews.csv.gz",
         "cities/London/2021-02/reviews.csv"),
        ("http://data.insideairbnb.com/united-kingdom/england/london/2021-01-11/data/reviews.csv.gz",
         "cities/London/2021-01/reviews.csv"),
        # --- Paris: listings ---
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-03-04/data/listings.csv.gz",
         "cities/Paris/2021-03/listings.csv"),
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-02-08/data/listings.csv.gz",
         "cities/Paris/2021-02/listings.csv"),
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-01-10/data/listings.csv.gz",
         "cities/Paris/2021-01/listings.csv"),
        # --- Paris: reviews ---
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-03-04/data/reviews.csv.gz",
         "cities/Paris/2021-03/reviews.csv"),
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-02-08/data/reviews.csv.gz",
         "cities/Paris/2021-02/reviews.csv"),
        ("http://data.insideairbnb.com/france/ile-de-france/paris/2021-01-10/data/reviews.csv.gz",
         "cities/Paris/2021-01/reviews.csv"),
    )
]

# Single CSV export of the worldwide Airbnb listings dataset hosted on
# opendatasoft; "filepath" is relative to `data_parent`.
data_global_listings = {
    "url": "https://public.opendatasoft.com/explore/dataset/airbnb-listings/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B",
    "filepath": "airbnb-listings.csv",
}

# ECA&D zip archives. For each archive only the members named in "files" are
# extracted, into the "filepath" directory (relative to `data_parent`).
# NOTE(review): TG / RR presumably denote daily mean temperature and
# precipitation, and the four station IDs presumably map to the four cities
# above -- confirm against the ECA&D documentation.
data_weather = [
    {
        "url": "https://knmi-ecad-assets-prd.s3.amazonaws.com/download/ECA_blend_tg.zip",
        "filepath": "weather/ECA_blend_tg",
        "files": [
            "TG_STAID000041.txt",
            "TG_STAID000593.txt",
            "TG_STAID001860.txt",
            "TG_STAID011249.txt",
        ],
    },
    {
        "url": "https://knmi-ecad-assets-prd.s3.amazonaws.com/download/ECA_blend_rr.zip",
        "filepath": "weather/ECA_blend_rr",
        "files": [
            "RR_STAID000041.txt",
            "RR_STAID000593.txt",
            "RR_STAID001860.txt",
            "RR_STAID011249.txt",
        ],
    },
]

## 1. Download monthly listings and reviews data from Inside Airbnb

In [3]:
# Create folder structure: one directory per city/month so that every
# extracted CSV has an existing destination.
for item in data_cities:
    pathlib.Path(data_parent, item['filepath']).parent.mkdir(parents=True, exist_ok=True)

# Download data, extract and save
# Local reviews and listings
for item in data_cities:
    # The archive is stored in the data root under the URL's last path
    # segment, so each download overwrites the previous archive of that name.
    filename = item['url'].split('/')[-1]
    archive_path = pathlib.Path(data_parent, filename)

    # Stream the body to disk instead of buffering it in memory, and close
    # the connection deterministically when the block exits.
    with requests.get(item['url'], allow_redirects=True, stream=True) as response:
        # Abort on 4xx/5xx rather than saving an error page that would later
        # fail inside gzip with a misleading "not a gzipped file" error.
        response.raise_for_status()
        with open(archive_path, 'wb') as archive_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)

    # Decompress in fixed-size chunks so the full CSV is never held in memory.
    with gzip.open(archive_path, 'rb') as compressed:
        with open(pathlib.Path(data_parent, item['filepath']), 'wb') as extracted:
            for chunk in iter(lambda: compressed.read(1024 * 1024), b''):
                extracted.write(chunk)

## 2. Download global listings from opendatasoft

In [None]:
# Global listings: download the opendatasoft CSV export into the data root.
pathlib.Path(data_parent).mkdir(parents=True, exist_ok=True)

# Stream the response to disk: the export is large, so holding the whole body
# in memory via `response.content` is avoided.
with requests.get(data_global_listings['url'], allow_redirects=True, stream=True) as response:
    # Abort on 4xx/5xx instead of silently saving an error body as the CSV.
    response.raise_for_status()
    with open(pathlib.Path(data_parent, data_global_listings['filepath']), 'wb') as csv_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            csv_file.write(chunk)

## 3. Download weather data from ecad.eu

In [3]:
# Weather data
# Make sure the parent folder of every extraction directory exists; the
# extraction directory itself is created by ZipFile.extract.
for item in data_weather:
    pathlib.Path(data_parent, item['filepath']).parent.mkdir(parents=True, exist_ok=True)

for item in data_weather:
    # The archive is stored in the data root under the URL's last path segment.
    filename = item['url'].split('/')[-1]
    archive_path = pathlib.Path(data_parent, filename)

    # Stream the zip to disk instead of buffering it in memory, and close the
    # connection deterministically when the block exits.
    with requests.get(item['url'], allow_redirects=True, stream=True) as response:
        # Abort on 4xx/5xx rather than saving an error page that ZipFile
        # would later reject with a misleading "not a zip file" error.
        response.raise_for_status()
        with open(archive_path, 'wb') as archive_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)

    # Extract only the selected station files; every other archive member is
    # skipped. A set makes the per-member membership test O(1).
    wanted = set(item['files'])
    target_dir = pathlib.Path(data_parent, item['filepath'])
    with ZipFile(archive_path, 'r') as archive:
        # A distinct loop name avoids shadowing the archive `filename` above.
        for member in archive.namelist():
            if member in wanted:
                archive.extract(member, target_dir)