In [3]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random

In [2]:
!pip install bs4

Collecting bs4
  Using cached bs4-0.0.1-py3-none-any.whl
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
Collecting soupsieve>1.2
  Using cached soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.11.1 bs4-0.0.1 soupsieve-2.3.2.post1


In [4]:
"""Extracts HTML from a webpage""" 
def scrape_page(page_url):
    answer = requests.get(page_url)
    soup = BeautifulSoup(answer.content, features='html.parser')
    return soup

In [5]:
"""
Return soup object from url_obj 
where url_obj is:
{page_num <int> : href <str>}
e.g {1: "www.airbnb.com/1/"}

"""
def extract_listing(url_obj):
    page_url = ""
    pag_key = ""
    for key in url_obj:
        page_url = url_obj[key]
        page_key = key
    """Extracts listings from an Airbnb search page"""
    page_soup = scrape_page(page_url)
    formatted_date =  datetime.fromtimestamp(datetime.now().timestamp())
    print("Extracted Data: {}--{}\t".format(page_url, formatted_date))
    return page_soup

In [6]:
soup = extract_listing({1: "https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST"})

Extracted Data: https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST--2022-12-04 00:23:29.463847	


In [7]:
"""
soup: page soup object
className: attribute id that has anchor tag
count: page number count we are currently extracting
"""
def extract_anchors(soup, className, count=0):       
    processed_classes = process_anchor(className, soup.find_all("a", {"class": className}), count) 
    print("Done extracting anchor {}".format(count))
    return processed_classes

def process_anchor(className, anchors, count=0):
    overall_data = {}
    for anchor in anchors:
        text, data = get_anchor_data(anchor) # Get anchor page number<str> and href value<str>
        if text == None and data == None:
            continue
        if count > 0 and text == str(count):
            overall_data[text] = data
        if count == 0:
            overall_data[text] = data
    print("OVERALL_DATA", overall_data)
    return overall_data

def get_anchor_data(anchor):
    anchor_text = anchor.get_text()
    if len(anchor_text) != 0 and anchor_text != "15":
        text = anchor.get_text()
        data = anchor.get("href")
        return text, data
    return None, None

In [8]:
class_name = "_833p2h"
anchor_data = extract_anchors(soup, class_name)

OVERALL_DATA {'2': '/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&search_type=AUTOSUGGEST&tab_id=home_tab&query=New%20York&price_filter_input_type=0&price_filter_num_nights=2&federated_search_session_id=b00169f9-041f-494c-8c12-430c4049d4f0&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MiwiaXRlbXNfb2Zmc2V0IjoyMCwidmVyc2lvbiI6MX0%3D', '3': '/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%

In [9]:
"""
Construct list of url objects containing url_obj
{page_num<int>: href<str>}

"""
def get_urls(anchor_data, urls):
    base_url = "https://www.airbnb.com"
    for key, value in anchor_data.items():
        url_obj = {}
        url_obj[key] = str(base_url + anchor_data[key]) 
        urls.append(url_obj)
    return urls

In [10]:
urls_list = [{1: "https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST"}] 
# Get urls 1-4
urls = get_urls(anchor_data, urls_list)
urls

[{1: 'https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST'},
 {'2': 'https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&fl

In [11]:
# Custom function to get url pages 5-14
def get_remaining_urls(urls):
    count = 5
    class_name=  "_833p2h"
    url_data = [urls[-1]]
    while count <= 15:
        count_soup = extract_listing(url_data[-1])
        count_anchor = extract_anchors(count_soup, class_name, count)
        url_data = get_urls(count_anchor, url_data)
        count+=1
    return url_data

In [12]:
# Get Remaining urls 5-14
remaining_urls = get_remaining_urls(urls)

Extracted Data: https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&search_type=AUTOSUGGEST&tab_id=home_tab&query=New%20York&price_filter_input_type=0&price_filter_num_nights=2&federated_search_session_id=b00169f9-041f-494c-8c12-430c4049d4f0&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MiwiaXRlbXNfb2Zmc2V0Ijo2MCwidmVyc2lvbiI6MX0%3D--2022-12-04 00:23:31.661713	
OVERALL_DATA {'5': '/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2

In [13]:
# Create list of all urls 1-14
all_urls = urls + remaining_urls[1:]
all_urls

[{1: 'https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST'},
 {'2': 'https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&fl

In [14]:
# Extract soup objects and query for div that has listing data
listing_pages = [extract_listing(urlObj).find_all("div", {"class": "g1qv1ctd cb4nyux dir dir-ltr"}) for urlObj in all_urls]

Extracted Data: https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=september&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&adults=0&children=0&infants=0&pets=0&search_type=AUTOSUGGEST--2022-12-04 00:23:50.654835	
Extracted Data: https://www.airbnb.com/s/New-York/homes?place_id=ChIJOwg_06VPwokRYv534QaPC8g&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=december&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%

In [15]:
# Returns object for all data where key=column_name, value=column_row_value
def get_df_data(listing_pages):
    a = []
    b = []
    c = []
    d = []
    e = []
    f = []
    for listings in listing_pages:
    # Containers used to store div data, clear for each listing
        for listing in listings:
            listing_data = extract_listing_data(listing) # Use helper func to map soup -> obj
            print("listing_data", listing_data)
            if listing_data == None:
                continue
            a.append(listing_data["title"])
            b.append(listing_data["description"])
            c.append(listing_data["num_bed"])
            d.append(listing_data["price_val"])
            e.append(listing_data["price_per_night"])
            f.append(listing_data["average_rating_num_rating"])
    df_data = {
        "title": a,
        "description": b,
        "num_beds": c ,
        "price_val": d ,
        "price_per_night": e,
        "average_rating_num_rating": f
    }
    return df_data

# Helper function for mapping soup data to obj for row data
def extract_listing_data(listing):
    listing_data = {}
    if listing.find("span", {"class": "_tyxjp1"}) == None:
        return None
    listing_data["title"] = listing.find("div", {"class": "t1jojoys dir dir-ltr"}).contents[0] # title
    listing_data["description"] = listing.find("span", {"class": "t6mzqp7 dir dir-ltr"}).contents[0] # Span description
    listing_data["num_bed"] = listing.find("span", {"class": "dir dir-ltr"}).contents[0] # div num beds
    listing_data["price_val"] = listing.find("span", {"class": "_tyxjp1"}).contents[0] # price_val
    price_per_night_data = listing.find("span", {"class": "a8jt5op dir dir-ltr"}).contents[0] # price per night
    listing_data["price_per_night"] = " ".join(str(s) for s in price_per_night_data)
    listing_data["average_rating_num_rating"] = listing.find("span", {"class": "r1dxllyb dir dir-ltr"}).contents[0] if listing.find("span", {"class": "r1dxllyb dir dir-ltr"}) else "NONE"  # Average rating and num of ratings
    return listing_data

In [16]:
# Extract all the data from the listing pages
df_data = get_df_data(listing_pages) 

listing_data None
listing_data {'title': 'Private room in New York', 'description': 'spacious chelsea room', 'num_bed': '1 double bed', 'price_val': '$77', 'price_per_night': '$ 7 7   p e r   n i g h t', 'average_rating_num_rating': '4.91 (98)'}
listing_data {'title': 'Private room in New York', 'description': 'Minimal room 4 SofaBed + loft bed in Manhattan', 'num_bed': '2 beds', 'price_val': '$79', 'price_per_night': '$ 7 9   p e r   n i g h t', 'average_rating_num_rating': '4.44 (117)'}
listing_data {'title': 'Private room in New York City', 'description': 'Comfortable and clean bedroom', 'num_bed': '1 double bed', 'price_val': '$65', 'price_per_night': '$ 6 5   p e r   n i g h t', 'average_rating_num_rating': '4.84 (151)'}
listing_data {'title': 'Private room in New York City', 'description': "Cozy, quiet room in artist's apartment", 'num_bed': '1 single bed', 'price_val': '$49', 'price_per_night': '$ 4 9   p e r   n i g h t', 'average_rating_num_rating': '4.78 (120)'}
listing_data 

In [55]:
# Load data into a data frame - NEED TO DO DATA CLEANING HERE BEFORE EXPORTING TO CSV!
df = pd.DataFrame(df_data)
df.head()

Unnamed: 0,title,description,num_beds,price_val,price_per_night,average_rating_num_rating
0,Private room in New York,spacious chelsea room,1 double bed,$77,$ 7 7 p e r n i g h t,4.91 (98)
1,Private room in New York,Minimal room 4 SofaBed + loft bed in Manhattan,2 beds,$79,$ 7 9 p e r n i g h t,4.44 (117)
2,Private room in New York City,Comfortable and clean bedroom,1 double bed,$65,$ 6 5 p e r n i g h t,4.84 (151)
3,Private room in New York City,"Cozy, quiet room in artist's apartment",1 single bed,$49,$ 4 9 p e r n i g h t,4.78 (120)
4,Private room in New York,[Manhattan Heart] 800m to Time Square/Broadway,1 bed,$126,$ 1 2 6 p e r n i g h t,4.91 (23)


In [56]:
df["title"] = df["title"].astype("string")
df["description"] = df["description"].astype("string")
df["num_beds"] = df["num_beds"].astype("string")

df["price_val_usd"] = df["price_val"].apply(lambda x: int(x[1:]))
df["price_val_usd"] = df["price_val_usd"].astype("int32")

In [57]:
def get_total_stars(x):
    left_parenthesis = x.find("(")
    if left_parenthesis == -1:
        return 0
    right_parenthesis = len(x) - 1
    total_star = float(x[:left_parenthesis-1])
    return total_star

def get_num_review(x):
    left_parenthesis = x.find("(")
    if left_parenthesis == -1:
        return 0
    right_parenthesis = len(x) - 1
    num_review = int(x[left_parenthesis+1: right_parenthesis])
    
    return num_review

In [58]:
df["total_star"] = df["average_rating_num_rating"].apply(get_total_stars)
df["num_review"] =   df["average_rating_num_rating"].apply(get_num_review).astype("int32")

In [59]:
df = df.drop(columns=["price_per_night", "average_rating_num_rating", "price_val"])
df

Unnamed: 0,title,description,num_beds,price_val_usd,total_star,num_review
0,Private room in New York,spacious chelsea room,1 double bed,77,4.91,98
1,Private room in New York,Minimal room 4 SofaBed + loft bed in Manhattan,2 beds,79,4.44,117
2,Private room in New York City,Comfortable and clean bedroom,1 double bed,65,4.84,151
3,Private room in New York City,"Cozy, quiet room in artist's apartment",1 single bed,49,4.78,120
4,Private room in New York,[Manhattan Heart] 800m to Time Square/Broadway,1 bed,126,4.91,23
...,...,...,...,...,...,...
207,Apartment in New York,🌟Elite Manhattan Studio with Gym and Roof deck🌟,1 bed,124,4.70,44
208,Apartment in New York,UNTITLED at Freeman - Double Studio 317,1 double bed,228,4.86,63
209,Private room in New York,Big Bedrm in Manhattan near Central Park & Subway,1 bed,79,4.86,22
210,Apartment in Brooklyn,Artsy Bsmnt Apt in Bklyn Brownstone,1 double bed,115,4.83,143


In [60]:
df.head()

Unnamed: 0,title,description,num_beds,price_val_usd,total_star,num_review
0,Private room in New York,spacious chelsea room,1 double bed,77,4.91,98
1,Private room in New York,Minimal room 4 SofaBed + loft bed in Manhattan,2 beds,79,4.44,117
2,Private room in New York City,Comfortable and clean bedroom,1 double bed,65,4.84,151
3,Private room in New York City,"Cozy, quiet room in artist's apartment",1 single bed,49,4.78,120
4,Private room in New York,[Manhattan Heart] 800m to Time Square/Broadway,1 bed,126,4.91,23


In [61]:
df[df["total_star"] == 0]

Unnamed: 0,title,description,num_beds,price_val_usd,total_star,num_review
23,Apartment in New York,Studio with 1 Queen Bed at Candlewood Suites T...,1 queen bed,109,0.0,0
39,Private room in New York,Cozy room at LOWER EAST SIDE,1 bed,88,0.0,0
45,Private room in New York,Cozy room at LOWER EAST SIDE,1 bed,88,0.0,0
54,Private room in Brooklyn,Private Room Near Major Transportation,1 bed,53,0.0,0
61,Apartment in New York,Adorable one bedroom in upper east side,1 bed,135,0.0,0
64,Apartment in New York,Studio Artitaje in Harlem,2 beds,93,0.0,0
66,Apartment in New York,Hell’s Kitchen 1BR Holiday Deal!,1 bed,143,0.0,0
78,Private room in New York City,Private Room Near Major Transportation,1 bed,53,0.0,0
79,Apartment in New York,Cozy Studio in the Heart of Midtown Manhattan!,1 bed,171,0.0,0
107,Apartment in New York,A Upper East Side Flat with a Home Office,1 bed,140,0.0,0


In [62]:
df[df["total_star"] == 0]

Unnamed: 0,title,description,num_beds,price_val_usd,total_star,num_review
23,Apartment in New York,Studio with 1 Queen Bed at Candlewood Suites T...,1 queen bed,109,0.0,0
39,Private room in New York,Cozy room at LOWER EAST SIDE,1 bed,88,0.0,0
45,Private room in New York,Cozy room at LOWER EAST SIDE,1 bed,88,0.0,0
54,Private room in Brooklyn,Private Room Near Major Transportation,1 bed,53,0.0,0
61,Apartment in New York,Adorable one bedroom in upper east side,1 bed,135,0.0,0
64,Apartment in New York,Studio Artitaje in Harlem,2 beds,93,0.0,0
66,Apartment in New York,Hell’s Kitchen 1BR Holiday Deal!,1 bed,143,0.0,0
78,Private room in New York City,Private Room Near Major Transportation,1 bed,53,0.0,0
79,Apartment in New York,Cozy Studio in the Heart of Midtown Manhattan!,1 bed,171,0.0,0
107,Apartment in New York,A Upper East Side Flat with a Home Office,1 bed,140,0.0,0


In [23]:
# # Export CSV - commented out temporarily
# count=1
# file_name = "airbnb_scraped_listings_{}.csv".format(random.getrandbits(128))
# pd.DataFrame(df_data).to_csv(file_name, encoding="utf-8")
# print("Generated_file: {}".format(file_name))