In [315]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random

In [316]:
# NOTE(review): this install cell comes after the import cell above, so on a
# fresh kernel `from bs4 import BeautifulSoup` runs before the package is
# installed. Consider moving this cell to the top of the notebook.
!pip install bs4



In [317]:
def scrape_page(page_url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        page_url: URL fetched with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed by BeautifulSoup with ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never silently parsed as if it were real content.
        requests.RequestException: On connection failures or timeouts.
    """
    answer = requests.get(page_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    answer.raise_for_status()
    return BeautifulSoup(answer.content, features='html.parser')

In [318]:
def extract_listing(page_url):
    """Fetch a search page, log when it was fetched, and return the parsed HTML.

    Args:
        page_url: URL of the page to download (passed to ``scrape_page``).

    Returns:
        The whole parsed page as returned by ``scrape_page``; individual
        listings are not extracted here.
    """
    page_soup = scrape_page(page_url)
    # datetime.now() is already the local wall-clock time; converting it to a
    # timestamp and back is unnecessary.
    print("Extracted Data:\t", datetime.now())
    return page_soup

In [319]:
def process_anchors(className, anchors, printData):
    """Collect a ``{anchor text: href}`` mapping for the given anchors.

    Args:
        className: Class name the anchors were looked up by; only used in the
            printed header.
        anchors: Iterable of anchor elements accepted by ``get_anchor_data``.
        printData: When truthy, print a header and one line per anchor.

    Returns:
        dict mapping each anchor's text to its href. Anchors that share the
        same text overwrite one another (the last one wins).
    """
    if printData:
        print("FOR className={}:".format(className))
        print("ANCHOR_TEXT::\tANCHOR_DATA")

    collected = {}
    for node in anchors:
        label, href = get_anchor_data(node)
        if printData:
            print("{}::\t{}".format(label, href))
        collected[label] = href
    return collected

def get_anchor_data(anchor):
    """Return ``(text, href)`` for an anchor element.

    The text is the anchor's visible text; when that is exactly the empty
    string, the ``aria-label`` attribute is used instead (which may be None).
    """
    label = anchor.get_text()
    if label == "":
        # Anchors with no visible text fall back to their aria-label.
        label = anchor.get("aria-label")
    return label, anchor.get("href")

In [320]:
def extract_anchors(soup, classes, printData=False):
    """Look up ``<a>`` elements by class and summarise them per class.

    Args:
        soup: Parsed page exposing ``find_all``.
        classes: Class names to search for, one lookup per name.
        printData: Forwarded to ``process_anchors`` to print each anchor.

    Returns:
        A list with one ``process_anchors`` result per entry in ``classes``,
        in the same order.

    Raises:
        ValueError: If the resulting list is empty (i.e. ``classes`` was empty).
    """
    processed_classes = []
    for className in classes:
        matches = soup.find_all("a", {"class": className})
        processed_classes.append(process_anchors(className, matches, printData))

    if len(processed_classes) == 0:
        raise ValueError("processed_anchors empty!")

    print("DONE")
    return processed_classes

In [321]:
# Search-results URL that the scrape starts from (the same literal is also the
# first entry of the list built by get_urls).
page_url = "https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST"

# Download and parse that page; `soup` is used below to find the pagination links.
soup = extract_listing(page_url)

Extracted Data:	 2022-11-22 04:30:01.593342


In [322]:
# Class names of the pagination anchors to look up. get_urls treats the first
# result as the numbered page links and the second as the "Next" link.
# NOTE(review): these look like generated class names and may change — confirm
# against the live page before re-running.
button_classes = ["_833p2h", "_1bfat5l"]
anchor_data = extract_anchors(soup, button_classes)

DONE


In [323]:
def get_urls(anchor_data, start_url=None):
    """Build the list of search-result page URLs to scrape.

    Args:
        anchor_data: Sequence whose first item is a dict mapping the text of
            each numbered pagination anchor to its relative href (the shape
            returned by ``extract_anchors``). Later items are not needed.
        start_url: URL placed first in the result. Defaults to the New York
            search URL this notebook starts from.

    Returns:
        list of str: ``start_url`` followed by one absolute URL per numbered
        anchor, in the dict's iteration order.

    Raises:
        IndexError: If ``anchor_data`` is empty.
    """
    base_url = "https://www.airbnb.com"
    if start_url is None:
        start_url = "https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST"

    number_anchors = anchor_data[0]
    urls = [start_url]
    # Anchors without an href cannot be turned into a URL, so they are skipped.
    # The "Next" anchor is deliberately not consulted: its URL was never used,
    # and looking it up made the function fail on a page that has no Next link.
    urls.extend(base_url + href for href in number_anchors.values() if href is not None)
    return urls

In [324]:
# Starting URL plus one URL per numbered pagination link found on the first page.
urls = get_urls(anchor_data)

In [325]:
# Display the collected page URLs for inspection.
urls

['https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST',
 'https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip

In [326]:
# Download and parse every page in `urls`, one request after another.
# Note: urls[0] is the same starting URL already fetched into `soup` above, so
# that page is downloaded a second time here.
soups =  [extract_listing(url) for url in urls]

Extracted Data:	 2022-11-22 04:30:03.285098
Extracted Data:	 2022-11-22 04:30:04.581487
Extracted Data:	 2022-11-22 04:30:05.892860
Extracted Data:	 2022-11-22 04:30:07.986073
Extracted Data:	 2022-11-22 04:30:09.288517


In [327]:
# For each parsed page, collect the <div> elements with this class; each one is
# later passed to extract_listing_data as a single listing.
# NOTE(review): the class string looks generated and may change — confirm.
listing_pages = [soup.find_all("div", {"class": "g1qv1ctd cb4nyux dir dir-ltr"}) for soup in soups]

In [328]:
def _first_content(listing, tag, css_class, default="NONE"):
    """Return the first child of the first matching element, or ``default``."""
    element = listing.find(tag, {"class": css_class})
    if element is None or not element.contents:
        return default
    return element.contents[0]


def extract_listing_data(listing):
    """Pull the displayed fields out of one listing element.

    Each field is the first child of the first element matching a fixed
    tag/class pair. A field whose element is missing (or has no children) is
    reported as the string "NONE" — the sentinel the rating field already
    used — so one incomplete listing no longer aborts the whole scrape.

    Args:
        listing: Element exposing ``find(name, attrs)``.

    Returns:
        dict with keys ``title``, ``description``, ``num_bed``, ``price_val``,
        ``price_per_night`` and ``average_rating_num_rating``.
    """
    price_per_night_data = _first_content(
        listing, "span", "a8jt5op dir dir-ltr", default=None
    )
    if price_per_night_data is None:
        price_per_night = "NONE"
    else:
        # The first child is iterated and its items joined with spaces.
        # NOTE(review): if that child is a plain string this spaces out its
        # characters — confirm against the page markup.
        price_per_night = " ".join(str(s) for s in price_per_night_data)

    return {
        "title": _first_content(listing, "div", "t1jojoys dir dir-ltr"),
        "description": _first_content(listing, "span", "t6mzqp7 dir dir-ltr"),
        "num_bed": _first_content(listing, "span", "dir dir-ltr"),
        "price_val": _first_content(listing, "span", "_tyxjp1"),
        "price_per_night": price_per_night,
        # Average rating and number of ratings, as displayed.
        "average_rating_num_rating": _first_content(
            listing, "span", "r1dxllyb dir dir-ltr"
        ),
    }

In [329]:
def get_df_data(listing_pages):
    """Turn pages of listing elements into column-oriented dicts.

    Args:
        listing_pages: Iterable of pages, each an iterable of listing elements
            accepted by ``extract_listing_data``.

    Returns:
        list of dict: one dict per page, mapping each output column name to
        the list of that field's values in listing order. A progress line is
        printed after each page.
    """
    # (output column name, key in the dict returned by extract_listing_data)
    column_sources = (
        ("title", "title"),
        ("description", "description"),
        ("num_beds", "num_bed"),
        ("price_val", "price_val"),
        ("price_per_night", "price_per_night"),
        ("average_rating_num_rating", "average_rating_num_rating"),
    )

    df_data = []
    for page_number, listings in enumerate(listing_pages, start=1):
        columns = {column: [] for column, _ in column_sources}
        for listing in listings:
            listing_data = extract_listing_data(listing)
            for column, source_key in column_sources:
                columns[column].append(listing_data[source_key])
        df_data.append(columns)
        print("Processed soup: {}".format(page_number))
    return df_data

In [330]:
# One column-oriented dict per scraped page, ready to be turned into a DataFrame.
df_data = get_df_data(listing_pages) 

Processed soup: 1
Processed soup: 2
Processed soup: 3
Processed soup: 4
Processed soup: 5


In [332]:
# Write one CSV file per scraped page into the current working directory.
# The file name combines the 1-based page counter with a random 128-bit number,
# so the names differ on every run and re-running this cell adds new files
# instead of replacing the earlier ones.
count=1
for i in df_data:
    # `i` is the dict of column lists for one page (see get_df_data).
    file_name = "data__{}__{}.csv".format(count, random.getrandbits(128))
    pd.DataFrame(i).to_csv(file_name)
    print("Generated_file: {}".format(file_name))
    count+=1

Generated_file: data__1__31795517795597892881540672643605775433.csv
Generated_file: data__2__104957571114765982144869969930715074396.csv
Generated_file: data__3__101110249331255956243337430045466564266.csv
Generated_file: data__4__81442602203516615734056991693959166110.csv
Generated_file: data__5__63430756976567569162947827422023531925.csv
