In [27]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd

In [6]:
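# One-time setup, kept commented out so it does not re-run on every execution.
# The `bs4` package is a thin wrapper whose install pulls in `beautifulsoup4`.
# !pip install bs4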

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
Collecting soupsieve>1.2
  Using cached soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=36660442cf8b984f78594ad54c5332d880087c00e8475b8deeb0ac39843d6cd9
  Stored in directory: /home/jovyan/.cache/pip/wheels/75/78/21/68b124549c9bdc94f822c02fb9aa3578a669843f9767776bca
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1


In [8]:
def scrape_page(page_url, timeout=30):
    """Download a web page and parse its HTML.

    Args:
        page_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response body with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with a 4xx/5xx status.
    """
    answer = requests.get(page_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were the requested content.
    answer.raise_for_status()

    return BeautifulSoup(answer.content, features='html.parser')

In [9]:
def extract_listing(page_url):
    """Fetch a search page and return its parsed HTML.

    Prints the local time at which the page was retrieved so the output
    records when the data was scraped.

    Args:
        page_url: URL of the search results page to download.

    Returns:
        The parsed page as returned by ``scrape_page`` (the whole document,
        not yet narrowed down to individual listings).
    """
    page_soup = scrape_page(page_url)
    # datetime.now() already yields the local wall-clock time; converting it
    # to a float timestamp and back adds nothing and can lose precision.
    formatted_date = datetime.now()
    print("Extracted Data:\t", formatted_date)
    return page_soup

In [10]:
# Search URL to scrape: New York homes, flexible dates, weekend trips,
# with every month from November through October selected.
# Adjacent string literals are joined by Python into one URL string.
page_url = (
    "https://www.airbnb.com/s/New-York/homes"
    "?place_id=ChIJOwg_06VPwokRYv534QaPC8g"
    "&refinement_paths%5B%5D=%2Fhomes"
    "&flexible_trip_dates%5B%5D=november"
    "&flexible_trip_dates%5B%5D=december"
    "&flexible_trip_dates%5B%5D=january"
    "&flexible_trip_dates%5B%5D=february"
    "&flexible_trip_dates%5B%5D=march"
    "&flexible_trip_dates%5B%5D=april"
    "&flexible_trip_dates%5B%5D=may"
    "&flexible_trip_dates%5B%5D=june"
    "&flexible_trip_dates%5B%5D=july"
    "&flexible_trip_dates%5B%5D=august"
    "&flexible_trip_dates%5B%5D=september"
    "&flexible_trip_dates%5B%5D=october"
    "&flexible_trip_lengths%5B%5D=weekend_trip"
    "&date_picker_type=flexible_dates"
    "&adults=0"
    "&children=0"
    "&infants=0"
    "&pets=0"
    "&search_type=AUTOSUGGEST"
)

# Download the page and keep the parsed HTML for the cells below
soup = extract_listing(page_url)

Extracted Data:	 2022-11-16 02:06:10.963081


In [89]:
# Collect the outermost <div> of every listing card on the page.
# NOTE(review): these class names look machine-generated, so they may change
# whenever the site is redeployed -- confirm them against the live HTML.
listings = soup.find_all("div", {"class": "g1qv1ctd cb4nyux dir dir-ltr"}) # Most outer div

# find_all returns an empty result when nothing matches; stop here with an
# explicit message rather than an opaque IndexError on listings[0].
if not listings:
    raise RuntimeError(
        "No listing containers matched the expected CSS classes; "
        "the page layout may have changed or the request may have been blocked."
    )

# Show the first card's raw HTML to help pick selectors for individual fields
print(listings[0])
data = []  # not used by the cells visible here; kept in case later cells rely on it

<div class="g1qv1ctd cb4nyux dir dir-ltr"><div class="t1jojoys dir dir-ltr" id="title_42672172">Private room in New York</div><div class="f15liw5s s1cjsi4j dir dir-ltr"></div><div class="nquyp1l s1cjsi4j dir dir-ltr"><span class="t6mzqp7 dir dir-ltr" lang="en">Near Gramercy Park/ Private Entrance</span></div><div class="f15liw5s s1cjsi4j dir dir-ltr"><span class="dir dir-ltr">1 double bed</span></div><div class="f15liw5s s1cjsi4j dir dir-ltr"><span class="dir dir-ltr">Feb 17 – 19</span></div><div class="p11pu8yw dir dir-ltr"><div style="--pricing-guest-display-price-alignment:flex-start;--pricing-guest-display-price-flex-wrap:wrap;--pricing-guest-primary-line-font-size:15px;--pricing-guest-primary-line-line-height:19px;--pricing-guest-primary-line-unit-price-font-weight:var(--jx-zk-pv);--pricing-guest-primary-line-trailing-content-font-size:14px;--pricing-guest-secondary-line-font-size:15px;--pricing-guest-secondary-line-line-height:19px;--pricing-guest-secondary-line-color:#717171;--p

In [86]:
# Walk through every listing card and pull out the elements holding the
# fields of interest ("title", "price", ...). One list per output column;
# each card appends exactly one entry to every list so the rows stay aligned.


def _first_content(element):
    """Return the first child of ``element``, or None if it is missing or empty.

    ``Tag.find`` returns None when nothing matches, so cards lacking a field
    yield a None cell instead of aborting the whole extraction.
    """
    if element is None or not element.contents:
        return None
    return element.contents[0]


titles = []
descriptions = []
num_beds = []
price_vals = []
price_per_nights = []
average_rating_num_ratings = []

for x in listings:
    title = x.find("div", {"class": "t1jojoys dir dir-ltr"}) # title

    description = x.find("span", {"class": "t6mzqp7 dir dir-ltr"}) # Span description

    # NOTE(review): this takes the first span whose class is exactly
    # "dir dir-ltr". In the sample card the stay-dates span uses the same
    # class, so a card without bed info appears to yield its dates here
    # (e.g. "Jan 13 – 15") -- confirm and tighten the selector if so.
    num_bed = x.find("span", {"class": "dir dir-ltr"}) # div num beds

    price_val = x.find("span", {"class": "_tyxjp1"}) # price_val

    price_per_night_data = x.find("span", {"class": "a8jt5op dir dir-ltr"}) # price per night

    # Join every child node of the price line into one string; None when the
    # card has no such element.
    if price_per_night_data is None:
        price_per_night = None
    else:
        price_per_night = " ".join(str(s) for s in price_per_night_data.contents)

    average_rating_num_rating = x.find("span", {"class": "r1dxllyb dir dir-ltr"}) # Average rating and num of ratings

    # Append one value per column for this card, keeping all lists in lockstep
    titles.append(_first_content(title))
    descriptions.append(_first_content(description))
    num_beds.append(_first_content(num_bed))
    price_vals.append(_first_content(price_val))
    price_per_nights.append(price_per_night)
    average_rating_num_ratings.append(_first_content(average_rating_num_rating))

# Bundle the column lists into one mapping, ready to be turned into a dataframe
df_data = {
    "title": titles,
    "description": descriptions,
    "num_beds": num_beds,
    "price_val": price_vals,
    "price_per_night": price_per_nights,
    "average_rating_num_rating": average_rating_num_ratings,
}


In [87]:
# Build the table: each key of df_data becomes a column, each listing a row
df = pd.DataFrame(data=df_data)
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,title,description,num_beds,price_val,price_per_night,average_rating_num_rating
0,Private room in New York,Near Gramercy Park/ Private Entrance,1 double bed,,"$60 per night, originally $75",4.41 (115)
1,Hotel room in New York,"Staypineapple New York, Fashionista King",1 king bed,$157,"$157 per night, originally $196",4.74 (272)
2,Apartment in New York,UNTITLED at Freeman - Queen Studio 915,1 queen bed,$180,"$180 per night, originally $225",4.9 (42)
3,Private room in New York,spacious chelsea room,1 double bed,$77,$77 per night,4.9 (95)
4,Private room in New York,Minimal room 1 SofaBed + loft bed in Manhattan,2 beds,$68,"$68 per night, originally $75",4.57 (123)
5,Private room in New York,Chic Designer’s Room & 2 Beds in Manhattan LES,2 beds,$95,$95 per night,4.57 (355)
6,Private room in New York,Near Times Square and Hell's Kitchen,2 beds,$120,$120 per night,4.75 (576)
7,Apartment in New York,Guest Room with 1 King Bed at Holiday Inn New ...,1 king bed,$129,$129 per night,4.75 (4)
8,Private room in New York,LARGE BEDROOM CLOSE TO MANHATTAN,Jan 13 – 15,$47,$47 per night,4.67 (73)
9,Apartment in New York,Gramercy park Apartment (2beds),2 beds,$90,$90 per night,4.53 (110)
