***Parsing script***

In [None]:
# Import of libraries

from bs4 import BeautifulSoup
import requests
import selenium
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time


# Options for selenium driver

options = webdriver.ChromeOptions()
# Request headless mode through the Chrome command-line switch rather than
# the ``options.headless`` setter: the setter is deprecated in recent
# Selenium 4 releases, while the switch is accepted by every version.
options.add_argument('--headless')


# List of countries & cities that are scraped

# Mapping of country slug -> city slugs; both are interpolated verbatim into
# the Airbnb search URL below.
# NOTE(review): 'Gaaga' looks like a transliteration of The Hague -- confirm
# that Airbnb resolves this spelling to the intended city.
locations = {
    'Italy': ['Rome', 'Milan', 'Florence', 'Venice', 'Como'],
    'France': ['Paris', 'Nice', 'Monaco', 'Marseille'],
    'Netherlands': ['Amsterdam', 'Rotterdam', 'Gaaga', 'Maastricht'],
    'Belgium': ['Bruxelles', 'Ghent', 'Bruges'],
    'Spain': ['Barcelona', 'Madrid', 'Ibiza', 'Seville'],
    'Germany': ['Munich', 'Dresden', 'Bonn', 'Berlin'],
    'Austria': ['Vienna', 'Salzburg', 'Innsbruck'],
    'Hungary': ['Budapest'],
    'Czech-Republic': ['Prague', 'Brno'],
    'Slovakia': ['Bratislava'],
    'Croatia': ['Zagreb', 'Cavtat'],
    'Portugal': ['Lisbon', 'Porto'],
    'Great-Britain': ['London', 'York', 'Lancaster'],
    'Ireland': ['Dublin'],
    'Sweden': ['Stockholm', 'Gothenburg'],
    'Finland': ['Helsinki'],
    'Denmark': ['Copenhagen'],
}


# Starters for pagination

# One first-page search URL per (city, country) pair, in dictionary order.
link_starters = [
    f'https://www.airbnb.com/s/{city}--{country_name}/homes?adults=2'
    for country_name, city_names in locations.items()
    for city in city_names
]
        
        
# Pages

def build_links(url, listings_per_page=20, pages=15):
    """Return the URLs of the first ``pages`` result pages for a search.

    The first entry is ``url`` itself; every following entry is ``url`` with
    an ``&items_offset=`` query parameter appended, the offset growing by
    ``listings_per_page`` per page.
    """
    following_pages = [
        url + f'&items_offset={listings_per_page * page_index}'
        for page_index in range(1, pages)
    ]
    return [url] + following_pages


# Main features extraction

def extract_main_features(listing):
    """Collect the headline fields of one search-result card.

    Args:
        listing: parsed markup of a single card; must offer ``find_all`` and
            ``find`` in the BeautifulSoup style.

    Returns:
        dict with the keys ``url``, ``name``, ``type``, ``price``,
        ``rating``, ``badge`` and ``district`` (in that order).  ``rating``
        and ``badge`` are ``''`` when the card has no such element.

    Raises:
        IndexError: the card holds fewer than three ``<meta>`` tags.
        AttributeError: the title or price element is missing.
    """

    def optional_text(tag, css_class):
        # Rating and badge are not present on every card.
        node = listing.find(tag, css_class)
        return node.get_text() if node is not None else ''

    meta_tags = listing.find_all('meta')
    name = meta_tags[0].get('content')
    url = meta_tags[2].get('content')
    type_ = listing.find('div', 't1jojoys dir dir-ltr').get_text()
    price = listing.find('span', 'a8jt5op dir dir-ltr').get_text()

    # The title reads "<type> in <district>".  Split on the *word* " in "
    # (last occurrence) so that an "in" inside a word ("Tiny home", "Cabin")
    # does not truncate the type and a multi-word district stays whole.
    kind, separator, district = type_.rpartition(' in ')
    if not separator:
        # No " in " at all: keep the full title as the type and fall back to
        # its last word as the district.
        kind = type_
        words = type_.split()
        district = words[-1] if words else ''

    features = {}
    features['url'] = url
    features['name'] = name
    features['type'] = kind
    features['price'] = price
    features['rating'] = optional_text('span', 'r1dxllyb dir dir-ltr')
    features['badge'] = optional_text('div', 't1mwk1n0 dir dir-ltr')
    features['district'] = district
    return features


# Additional features extraction

def append_additional_features(features_dict, browser):
    """Add the guests / bedrooms / beds / baths summary of a listing page.

    Args:
        features_dict: feature mapping of the listing; updated in place.
        browser: driver positioned on the listing page; only its
            ``find_elements`` method is used.

    Returns:
        The same ``features_dict`` with the keys ``guests``, ``bedroms``,
        ``beds`` and ``baths`` set to the text of the first four summary
        items.  When the page exposes fewer than four items the remaining
        keys are set to ``''`` instead of raising ``IndexError``.
    """
    summary_items = browser.find_elements("xpath", "//li[@class='l7n4lsf dir dir-ltr']")
    texts = [item.text for item in summary_items]
    # 'bedroms' (sic) is kept as-is: it is the column name that ends up in
    # the exported data, so renaming it would change the output schema.
    for position, key in enumerate(('guests', 'bedroms', 'beds', 'baths')):
        features_dict[key] = texts[position] if position < len(texts) else ''
    return features_dict


# Dealing with pop-up windows

def check_exist_by_xpath(xpath, browser):
    """Best-effort click on an optional element (used to dismiss pop-ups).

    Sleeps 7 seconds, then waits up to 7 more seconds for the element
    matching ``xpath`` to become clickable and clicks it.  When the element
    never shows up or cannot be clicked, the function returns silently.

    Args:
        xpath: XPath expression locating the element to click.
        browser: Selenium driver positioned on the page to inspect.
    """
    from selenium.common.exceptions import WebDriverException

    # Kept outside the ``try`` so that interrupting the pause is never
    # mistaken for "pop-up not found".
    time.sleep(7)
    try:
        WebDriverWait(browser, 7).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
    except WebDriverException:
        # Covers the wait timing out (no pop-up) and click failures.  Unlike
        # a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
        pass

    
# Creating dictionary with categorical variables

def create_categorical_variables(substring, split, input_):
    """Split scraped text items into valued variables and presence flags.

    Items that contain ``substring`` are cut at the first occurrence of
    ``split``: the part before it (with every ``':'`` removed) becomes the
    key and the stripped remainder the value.  An item that contains
    ``substring`` but not ``split`` is stored with an empty value rather
    than raising ``ValueError``.  All other items become flags with the
    value ``1.0``.

    Args:
        substring: marker identifying items that carry a value.
        split: separator between the name and the value of such an item.
        input_: iterable of text items.

    Returns:
        ``(cat_dict, output_dict)``: the name -> value mapping and the
        item -> ``1.0`` flag mapping, each in input order.
    """
    cat_dict = {}
    output_dict = {}
    for item in input_:
        if substring in item:
            name, _, value = item.partition(split)
            cat_dict[name.replace(':', '')] = value.strip()
        else:
            output_dict[item] = 1.0

    return cat_dict, output_dict


# Amenities extraction

def extract_amenities(features_dict, browser):
    """Open the amenities dialog, record its entries and close it again.

    Waits up to 10 seconds for an element whose text contains ``'amenit'``
    to become clickable, clicks it, and reads the text of every
    ``div._jro6t0`` element.  Empty texts and texts starting with
    ``'Unavailable'``, ``'CHECK-IN'`` or ``'GUESTS'`` are dropped.  The rest
    is classified by ``create_categorical_variables`` on the newline
    character: single-line entries become ``1.0`` flags and two-part entries
    become name -> description pairs.  Flags are merged into
    ``features_dict`` first, the name -> description pairs second.

    Returns the updated ``features_dict``.
    """
    open_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'amenit')]"))
    )
    open_button.click()
    time.sleep(4)

    ignored_prefixes = ('Unavailable', 'CHECK-IN', 'GUESTS')
    entry_texts = [
        element.text
        for element in browser.find_elements("xpath", "//div[@class='_jro6t0']")
    ]
    kept_entries = [
        text for text in entry_texts
        if text and not text.startswith(ignored_prefixes)
    ]

    described, flags = create_categorical_variables('\n', '\n', kept_entries)
    features_dict.update(flags)
    features_dict.update(described)

    # Close amenities
    close_button = WebDriverWait(browser, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@class='_pa35zs']"))
    )
    close_button.click()
    time.sleep(4)
    return features_dict


# Rules extraction

def extract_rules(features_dict, browser, show_more):
    """Open the rules dialog, record its entries and close it again.

    Clicks the fourth-from-last element of ``show_more`` and reads the text
    of every ``div.c1r78wbb`` element.  Every entry that does not contain
    ``'Check-in'`` is stored as a ``1.0`` flag.  Then the first name -> value
    pair obtained from the ``'Check-in'`` entries and all pairs obtained
    from the ``'Checkout'`` entries (each split at the first space) are
    stored on top.

    Returns the updated ``features_dict``.
    """
    show_more[-4].click()
    time.sleep(4)

    rule_texts = [
        element.text
        for element in browser.find_elements("xpath", "//div[@class='c1r78wbb dir dir-ltr']")
    ]

    checkin_pairs, rule_flags = create_categorical_variables('Check-in', ' ', rule_texts)
    # Only the valued part of the 'Checkout' split is used; its flag part
    # is discarded.
    checkout_pairs, _ = create_categorical_variables('Checkout', ' ', rule_texts)

    features_dict.update(rule_flags)
    first_checkin = next(iter(checkin_pairs.items()), None)
    if first_checkin is not None:
        key, value = first_checkin
        features_dict[key] = value
    features_dict.update(checkout_pairs)

    # Close rules
    close_button = WebDriverWait(browser, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@class='_pa35zs']"))
    )
    close_button.click()
    time.sleep(4)
    return features_dict


# Safety & property extraction

def extract_safety(features_dict, browser, show_more):
    """Open the safety & property dialog and record its entries.

    Clicks the third-from-last element of ``show_more`` and reads the text
    of every ``div.c1r78wbb`` element.  The texts are classified by
    ``create_categorical_variables`` on the newline character: single-line
    entries become ``1.0`` flags, two-part entries become
    name -> description pairs (these win when a key occurs in both).

    The dialog is left open; this function does not click a close button.

    Returns the updated ``features_dict``.
    """
    show_more[-3].click()
    time.sleep(4)

    safety_texts = [
        element.text
        for element in browser.find_elements("xpath", "//div[@class='c1r78wbb dir dir-ltr']")
    ]
    described, flags = create_categorical_variables('\n', '\n', safety_texts)
    # Flags first, descriptions second, so descriptions take precedence.
    features_dict.update({**flags, **described})
    return features_dict


# Creating dataframe

def form_dataframe(dictionary):
    """Turn a feature mapping into a one-row DataFrame.

    The (key, value) pairs are loaded as a two-column frame and transposed,
    so the keys form the first row; that row is promoted to the column
    labels and then dropped, leaving a single row that holds the values.
    """
    pairs = list(dictionary.items())
    transposed = pd.DataFrame(pairs).transpose()
    header_row = transposed.iloc[0]
    transposed.columns = header_row
    return transposed.iloc[1:]


# Main function

def main():
    """Scrape every configured search and return the listings as a DataFrame.

    For each URL in ``link_starters`` the result pages from ``build_links``
    are downloaded with ``requests``.  Every listing card found on a page is
    opened in a Chrome session, where its summary, amenities, rules and
    safety details are collected.  Each listing becomes one row; values
    missing for a listing are filled with ``0.0``.

    Side effects:
        Starts a Chrome session through the driver at
        ``C:\\Study\\selenium_driver\\chromedriver`` and writes the result to
        ``C:\\Study\\parsing_airbnb\\airbnb_.csv``.

    Returns:
        pandas.DataFrame with one row per scraped listing (empty when no
        listing was found).

    Raises:
        Any exception raised while downloading or scraping propagates to
        the caller; the browser session is shut down first.
    """
    # ``Service`` replaces the ``executable_path`` keyword, which is
    # deprecated in Selenium 4.  Imported here to keep the block
    # self-contained.
    from selenium.webdriver.chrome.service import Service

    # Raw strings: the backslashes of the Windows paths are not escapes.
    driver_service = Service(r'C:\Study\selenium_driver\chromedriver')
    browser = webdriver.Chrome(service=driver_service, options=options)
    frames = []
    try:
        for start in link_starters:
            for page in build_links(start):
                # Timeout so a stalled connection cannot hang the whole run.
                response = requests.get(page, timeout=30)
                soup = BeautifulSoup(response.content, 'html.parser')
                for listing in soup.find_all('div', 'c1l1h97y dir dir-ltr'):
                    features = extract_main_features(listing)
                    browser.get('https://' + features['url'])
                    check_exist_by_xpath("//button[@aria-label='Close']", browser)
                    features = append_additional_features(features, browser)
                    features = extract_amenities(features, browser)
                    shows = browser.find_elements("xpath", "//*[contains(text(), 'Show more')]")
                    features = extract_rules(features, browser, shows)
                    features = extract_safety(features, browser, shows)
                    frames.append(form_dataframe(features))
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window) and now also runs when scraping fails.
        browser.quit()

    if frames:
        # One concat at the end instead of re-copying the growing frame for
        # every listing.
        df = pd.concat(frames, ignore_index=True)
        df.fillna(0.0, inplace=True)
    else:
        df = pd.DataFrame()
    df.to_csv(r'C:\Study\parsing_airbnb\airbnb_.csv')

    return df


# Perform

# Runs the complete scrape as soon as this statement executes: main() starts
# Chrome, downloads the search pages, writes the CSV and returns the frame.
df = main()

# Show the result

# Lift pandas' column limit so that every scraped column is displayed.
pd.set_option('display.max_columns', None)
# Preview of the first rows; as a bare expression its value is only rendered
# by an interactive/notebook front end and is discarded in a plain script.
df.head()

  options.headless = True
  browser = webdriver.Chrome(executable_path = 'C:\Study\selenium_driver\chromedriver', options = options)


Unnamed: 0,url,name,type,price,rating,badge,district,guests,bedroms,beds,baths,Hair dryer,Shampoo,Hot water,Free washer,Hangers,Iron,Central air conditioning,Heating,Smoke alarm,Carbon monoxide alarm,Fire extinguisher,First aid kit,Wifi,Refrigerator,Dishwasher,Free street parking,Paid parking off premises,Host greets you,Essentials,Lock on bedroom door,Kitchen,Cooking basics,Dishes and silverware,Elevator,Checkout before 10:00 AM,No parties or events,Check-in,Checkout,Carbon monoxide alarm installed,City skyline view,Courtyard view,Bidet,Free washer – In unit,TV,Coffee maker,Patio or balcony,Outdoor furniture,Self check-in,Bed linens,Security cameras on property,Dedicated workspace,Keypad,3 guests maximum,Check-in: 4:00 PM - 12:00 AM,Self check-in with keypad,Quiet hours,Cleaning products,Body soap,Shower gel,Extra pillows and blankets,Safe,Clothing storage: closet,"42"" HDTV",Window AC unit,Indoor fireplace: wood-burning,Freezer,Stove,Oven,Hot water kettle,Wine glasses,Toaster,Dining table,Paid parking garage off premises,Smart lock,Private entrance,Single level home,Luggage dropoff allowed,Long term stays allowed,2 guests maximum,Check-in after 3:00 PM,Checkout before 11:00 AM,Self check-in with smart lock,Pack ’n play/Travel crib - available upon request,AC - split type ductless system,Microwave,Stainless steel stove,Baking sheet,Flexible check-in,Room-darkening shades,Drying rack for clothing,Coffee maker: espresso machine,Coffee,Lockbox,4 guests maximum,Self check-in with lockbox,Washer,Dryer,Clothing storage: walk-in closet,Crib,Pack ’n play/Travel crib,High chair,Air conditioning,Coffee maker: Nespresso
0,www.airbnb.com/rooms/40634794?adults=2&childre...,Private room & private bathroom !10mins@ Termini,Private room,$48 per night,4.31 (81),,Esquilino,2 guests,· 1 bedroom,· 2 beds,· 1 private bath,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Towels, bed sheets, soap, and toilet paper",Private room can be locked for safety and privacy,Space where guests can cook their own meals,"Pots and pans, oil, salt and pepper","Bowls, chopsticks, plates, cups, etc.",The home or building has an elevator that’s at...,1.0,1.0,4:00 PM - 12:00 AM,before 10:00 AM,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,www.airbnb.com/rooms/14024883?adults=2&childre...,"A cozy quite room + balcony,10 mins to Colosseo!",Private room,$48 per night,4.51 (410),,Esquilino,3 guests,· 1 bedroom,· 2 beds,· 2 shared baths,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,"Towels, bed sheets, soap, and toilet paper",Private room can be locked for safety and privacy,Space where guests can cook their own meals,"Pots and pans, oil, salt and pepper","Bowls, chopsticks, plates, cups, etc.",The home or building has an elevator that’s at...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Cotton linens,In the corridor,In a common space,Check yourself into the home with a door code,1.0,1.0,1.0,11:00 PM - 7:00 AM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,www.airbnb.com/rooms/49807366?adults=2&childre...,Trevi Fountain Square View Luxury Apartment,Condo,$316 per night,4.91 (137),Superhost,Trevi,,· 1 bedroom,· 1 bed,· 1.5 baths,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,"Towels, bed sheets, soap, and toilet paper",0.0,Space where guests can cook their own meals,"Pots and pans, oil, salt and pepper","Bowls, chopsticks, plates, cups, etc.",0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,Cotton linens,0.0,In a room with a door,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Separate street or building entrance,No stairs in home,For guests' convenience when they have early a...,Allow stay for 28 days or more,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,www.airbnb.com/rooms/41933425?adults=2&childre...,"Lovely, Modern Apartment near St. Peter",Apartment,"$122 per night, originally $165",4.72 (112),,Trastevere,2 guests,· 1 bedroom,· 1 bed,· 1.5 baths,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,"Towels, bed sheets, soap, and toilet paper",0.0,Space where guests can cook their own meals,"Pots and pans, oil, salt and pepper","Bowls, chopsticks, plates, cups, etc.",0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,For guests' convenience when they have early a...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,www.airbnb.com/rooms/38451993?adults=2&childre...,Elegant and central apartment by the Vatican,Apartment,"$103 per night, originally $174",4.76 (86),,Prati,4 guests,· 1 bedroom,· 2 beds,· 1 bath,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,"Towels, bed sheets, soap, and toilet paper",0.0,Space where guests can cook their own meals,"Pots and pans, oil, salt and pepper","Bowls, chopsticks, plates, cups, etc.",0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,For guests' convenience when they have early a...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
