# Ingest Airbnb Data and Upload to S3

In [1]:
import pandas as pd
import boto3
import os
from dotenv import load_dotenv

import requests
from bs4 import BeautifulSoup

# Load environment variables (by default from a local .env file) into the process
load_dotenv()

True

## 1. Initialize boto3 client

In [2]:
# boto3 will initialize connection using environment variables
# NOTE(review): upload_to_s3 below creates its own boto3 client, so this
# resource handle looks unused in the visible cells - confirm before removing.
s3 = boto3.resource('s3')

## 2. Create functions
#### 2.1 Function to retrieve URLs for listings, reviews, and geospatial data from Inside Airbnb

In [3]:
def download_airbnb_urls(market, timeout=30):
    """Collect the data-file URLs for one market from the Inside Airbnb page.

    Downloads the "Get the Data" page and returns, in page order, every link
    that ends with ``.csv.gz`` or ``.geojson`` and contains the market name
    (case-insensitive substring match), excluding ``calendar.csv.gz`` files.

    Args:
        market: Market name as it appears in the download URLs,
            e.g. ``'los-angeles'``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block forever on a stalled connection.

    Returns:
        A list of matching ``href`` strings; empty if nothing matches.

    Raises:
        ValueError: If the page responds with a status code other than 200.
        requests.RequestException: If the request fails or times out.
    """
    url = "https://insideairbnb.com/get-the-data/"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to download Inside Airbnb page. Status code: {response.status_code}")

    market_key = market.lower()  # loop-invariant, so lower-case it once
    soup = BeautifulSoup(response.content, 'html.parser')

    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Only compressed CSVs and GeoJSON files are of interest
        if not href.endswith(('.csv.gz', '.geojson')):
            continue
        # The link must belong to the requested market
        if market_key not in href.lower():
            continue
        # Calendar files are deliberately excluded
        if 'calendar.csv.gz' in href:
            continue
        urls.append(href)
    return urls

#### 2.2 Function to download data from URLs

In [4]:
def download_data(market, urls, timeout=60):
    """Download every URL and return the raw payloads keyed by URL.

    The download is best-effort: a URL that fails (network error or non-200
    status) is reported on stdout and left out of the result, and the
    remaining URLs are still attempted.

    Args:
        market: Market name, used only for the progress message.
        urls: Iterable of URLs to download.
        timeout: Seconds to wait on the server per request before giving up.

    Returns:
        dict mapping each successfully downloaded URL to its content (bytes).
    """
    print(f'Download starting for {market}')

    data = {}  # store downloaded data in a dictionary
    for url in urls:
        print(f"Downloading file from: {url}")
        try:
            # No stream=True: the whole body is needed in memory anyway, and a
            # non-streamed request always releases its connection, even when
            # the response is an error whose body is never read.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Exception occurred while downloading {url}: {str(e)}")
            continue

        if response.status_code == 200:
            data[url] = response.content
        else:
            print(f"Failed to download data from {url}. Status code: {response.status_code}")
    return data

#### 2.3 Function to upload data to S3

In [5]:
def upload_to_s3(data, bucket_name, market_name):
    """Upload downloaded files to S3 under a per-file-type key layout.

    Only the three recognised file names are uploaded:

    * ``listings.csv.gz``        -> ``raw/listings/<market>-listings.csv.gz``
    * ``neighbourhoods.geojson`` -> ``raw/geospatial/<market>-neighbourhoods.geojson``
    * ``reviews.csv.gz``         -> ``raw/reviews/<market>-reviews.csv.gz``

    Any other file name is skipped with a message. Previously such an entry
    either raised ``UnboundLocalError`` or was written under the key of the
    preceding file, silently overwriting it.

    Args:
        data: dict mapping source URL to the file content to upload.
        bucket_name: Name of the destination S3 bucket.
        market_name: Market name embedded in each object key.
    """
    print('Upload starting...')
    s3 = boto3.client('s3')

    # s3_key represents unique identifier for the file in S3
    s3_keys = {
        'listings.csv.gz': f"raw/listings/{market_name}-listings.csv.gz",
        'neighbourhoods.geojson': f"raw/geospatial/{market_name}-neighbourhoods.geojson",
        'reviews.csv.gz': f"raw/reviews/{market_name}-reviews.csv.gz",
    }

    for url, content in data.items():
        # Extract filename from URL
        filename = url.rsplit('/', 1)[-1]

        s3_key = s3_keys.get(filename)
        if s3_key is None:
            print(f"Skipping '{url}': unrecognised file name '{filename}'")
            continue

        s3.put_object(Bucket=bucket_name, Key=s3_key, Body=content)
        print(f"Data uploaded to S3 bucket '{bucket_name}' with key '{s3_key}'")

## 3. Ingest and upload market data to S3 bucket
Provide the list of markets to ingest in the cell below. Downloading the market data and uploading it takes an estimated 15 to 20 minutes.

In [6]:
# Market names to ingest, spelled the way they appear in Inside Airbnb URLs
markets = [
    'albany',
    'los-angeles',
    'san-francisco',
    'new-york-city',
    'chicago',
    'seattle',
    'washington-dc',
]

In [7]:
# S3 bucket where the data is uploaded; identical for every market, so set it once
bucket_name = 'airbnb-capstone-project'

for market in markets:
    # upload_to_s3 embeds this value in each object key as the market name
    s3_prefix = market

    # Download Inside Airbnb page for URLs ending with .csv.gz or .geojson for the specified market
    downloaded_urls = download_airbnb_urls(market)
    if not downloaded_urls:
        # Nothing matched (for example a misspelled market name): say so
        # instead of running an empty download and upload.
        print(f"No files found for market '{market}', skipping")
        data = {}  # keep `data` defined for the cleanup cell at the end
        continue

    # Download data from the downloaded URLs
    data = download_data(market, downloaded_urls)

    # Upload data to S3
    upload_to_s3(data, bucket_name, s3_prefix)

Download starting for albany
Downloading file from: https://data.insideairbnb.com/united-states/ny/albany/2024-06-07/data/listings.csv.gz
Downloading file from: https://data.insideairbnb.com/united-states/ny/albany/2024-06-07/data/reviews.csv.gz
Downloading file from: https://data.insideairbnb.com/united-states/ny/albany/2024-06-07/visualisations/neighbourhoods.geojson
Upload starting...
Data uploaded to S3 bucket 'airbnb-capstone-project' with key 'raw/listings/albany-listings.csv.gz'
Data uploaded to S3 bucket 'airbnb-capstone-project' with key 'raw/reviews/albany-reviews.csv.gz'
Data uploaded to S3 bucket 'airbnb-capstone-project' with key 'raw/geospatial/albany-neighbourhoods.geojson'
Download starting for los-angeles
Downloading file from: https://data.insideairbnb.com/united-states/ca/los-angeles/2024-06-07/data/listings.csv.gz
Downloading file from: https://data.insideairbnb.com/united-states/ca/los-angeles/2024-06-07/data/reviews.csv.gz
Downloading file from: https://data.insid

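## 4. Delete downloaded data
Release the in-memory copy of the last market's downloaded files. The cells above keep downloads in memory only; nothing is written to local disk.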

In [None]:
# Drop the reference to the last market's downloaded payloads so the memory can be reclaimed
del data