In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
class BookingScrapper():
  """Scrape guest reviews of a single hotel from Booking.com's review-list endpoint.

  Args:
    hotel_id: Value sent as the ``pagename`` query parameter of the endpoint.
    n_reviews: Upper bound for the ``offset`` query parameter; offsets
      0, 25, 50, ... are requested while they do not exceed this value.
    country_code: Value sent as the ``cc1`` query parameter.
    timeout: Value passed to ``requests.get(timeout=...)`` so a stalled
      connection cannot block the scrape forever. Defaults to 30.
  """

  # Offset increment between two consecutive requests.
  # NOTE(review): assumes the endpoint serves 25 reviews per page - confirm.
  PAGE_STEP = 25

  def __init__(self, hotel_id, n_reviews, country_code, timeout=30):
    self.hotel_id = hotel_id
    self.n_reviews = n_reviews
    self.timeout = timeout
    # Reviews' endpoint; the page offset is appended to this string by scrap().
    self.base_url = "https://www.booking.com/reviewlist.es.html?cc1=" + country_code + "&dist=1&pagename=" + hotel_id + "&type=total&offset="
    self.headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }

  def scrap(self):
    """Download every review page of the hotel and collect its reviews.

    Each requested URL is printed. Pages whose HTTP status is not 200 are
    reported and skipped.

    Returns:
      Three parallel lists ``(countries, positive_texts, negative_texts)``
      with one entry per review block that contains a country label.

    Raises:
      requests.exceptions.RequestException: propagated from ``requests.get``
        (for example when the timeout expires).
    """
    country_list = []
    positive_text_list = []
    negative_text_list = []

    for offset in range(0, self.n_reviews + 1, self.PAGE_STEP):
      url = self.base_url + str(offset)
      response = requests.get(url, headers=self.headers, timeout=self.timeout)
      print(url)
      if response.status_code != 200:
        # Make dropped pages visible instead of skipping them silently.
        print("  skipped, HTTP status", response.status_code)
        continue

      # Name the parser explicitly so the result does not depend on which
      # optional parsers happen to be installed.
      soup = BeautifulSoup(response.text, "html.parser")

      for review_block in soup.select("div .bui-grid"):
        country, positive_text, negative_text = self.scrap_review_block(review_block)
        if country is not None:
          country_list.append(country)
          positive_text_list.append(positive_text)
          negative_text_list.append(negative_text)

    return country_list, positive_text_list, negative_text_list

  def scrap_review_block(self, review_block):
    """Extract the reviewer's country and the two review texts from one block.

    Args:
      review_block: Parsed element offering ``find`` / ``find_all``.

    Returns:
      ``(country, positive_text, negative_text)``. ``country`` is ``None``
      when the block has no ``bui-avatar-block__subtitle`` span. Both texts
      stay ``""`` unless the block holds at least two ``c-review__body``
      spans, in which case the first one is returned as the positive text and
      the second one as the negative text.
    """
    country_tag = review_block.find("span", class_="bui-avatar-block__subtitle")  # Country of the person who wrote the review
    if country_tag is None:
      return None, "", ""

    country = country_tag.text.replace("\n", "")
    positive_text = ""
    negative_text = ""

    reviews = review_block.find_all("span", class_="c-review__body")
    if len(reviews) >= 2:
      positive_text = reviews[0].text
      negative_text = reviews[1].text

    return country, positive_text, negative_text


### Castellano

In [None]:
# Booking.com page names of the Spanish hotels to scrape.
# We get these IDs manually from Booking.com
hoteles_esp = [
    "gran-luna-de-granada",
    "be-free-granada",
    "saray",
    "carmengranada",
    "andaluciacenter",
    "alixares",
    "nazaries-business-spa",
    "macia-real-de-la-alhambra",
    "puerta-de-los-aljibes",
    "donjuan",
    "corona-de-granada",
    "riu-plaza-espana",
    "fiesta-gran-colon",
    "nuevomadrid",
    "chamartin",
    "confortel-alcala-norte",
    "courtyardprincesa",
    "agumarmadrid",
    "confortel-atrium",
    "confortelpioxii",
    "castillaplaza",
    "silken-ciudad-de-gijon",
    "principe-de-asturias",
    "suite-1907-gijon",
    "blue-santa-rosa",
    "hernan-cortes",
    "gijon",
    "apartamento-puerto-deportivo-gijon",
    "parador-de-gijon-molino-viejo",
    "acomar",
    "begona",
    "spa-senator-cadiz",
    "tatan",
    "apartamentos-tandem-torres-de-cadiz",
    "tandem-palacio-veedor-de-galeras",
    "casa-el-consejero-de-indias",
    "atico-falla",
    "la-casa-del-consul",
    "plazas-de-ca-diz-apartamentos",
    "candelaria10",
    "apartamento-casa-patio-del-panadero",
]

# Parallel accumulators: one entry per scraped review.
country, positive, negative = [], [], []

# Scrape every hotel in turn and pool its reviews into the accumulators.
for hotel in hoteles_esp:
    bs_esp = BookingScrapper(hotel_id=hotel, n_reviews=5000, country_code="es")
    c, p, n = bs_esp.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# One row per scraped review; keep only rows where at least one of the two texts is non-empty.
df_esp = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_esp.Positiva != "") | (df_esp.Negativa != "")
# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_esp.loc[mask_not_empty].to_excel("espana.xlsx", index=False, engine="xlsxwriter")

### Mexicano

In [None]:
# Booking.com page names of the Mexican hotels to scrape.
hoteles_mex = [
    "camino-real-aeropuerto-mexico",
    "galeria-plaza-mexico-city",
    "intercontinental-presidente-mexico-city",
    "cityexpress-el-angel",
    "courtyard-by-marriott-mexico-city-airport",
    "geneve-cd-de-mexico",
    "siesta-express",
    "nh-t2-aeropuerto-mexico",
    "casa-decu",
    "grand-prix",
    "wyndham-garden-polanco",
    "camino-real-mexico",
    "sheraton-maria-isabel-towers",
    "we-hotel-aeropuerto",
    "holiday-inn-express-suites-monterrey-valle",
    "hilton-garden-inn-monterrey-obispado",
    "city-express-plus-monterrey-nuevo-sur",
    "nh-monterrey",
    "kavia-monterrey",
    "holiday-inn-monterrey-valle",
    "fiesta-inn-monterrey-fundidora",
    "riu-plaza-guadalajara",
    "wyndham-garden-guadalajara-acueducto",
    "camino-real-guadalajara",
    "one-guadalajara-centro-historico",
    "velvet-plaza",
    "hangar-inn",
    "krystal-urban-guadalajara",
    "plaza-diana",
    "one-guadalajara-periferico-vallarta",
    "presidente-intercontinental-guadalajara",
    "hampton-inn-by-hilton-guadalajara-aeropuerto",
    "quinta-real-guadalajara",
    "fiesta-americana-grand-guadalajara-country-club",
    "hyatt-regency-andares-guadalajara",
    "la-estancia-tapataa",
    "selina-zone",
    "ocean-view-apartment-yal-mak-an",
    "aloft-cancun",
    "fairfield-inn-suites-by-marriott-cancun-airport",
    "majestic-elegance-playa-mujeres",
    "casamagna-cancun-marriott-resort",
    "senor-frogs-hostel",
]

# Parallel accumulators: one entry per scraped review.
country, positive, negative = [], [], []

# Scrape every hotel in turn and pool its reviews into the accumulators.
for hotel in hoteles_mex:
    bs_mex = BookingScrapper(hotel_id=hotel, n_reviews=2500, country_code="mx")
    c, p, n = bs_mex.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# One row per scraped review; keep only rows where at least one of the two texts is non-empty.
df_mex = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_mex.Positiva != "") | (df_mex.Negativa != "")
# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_mex.loc[mask_not_empty].to_excel("mexico.xlsx", index=False, engine="xlsxwriter")

### Austral

In [None]:
# Booking.com page names scraped with country code "ar".
hoteles_arg = [
    "dazzler-tower-recoleta",
    "hilton-buenos-aires",
    "broadway-suites",
    "dazzler-palermo",
    "awwa-suites-spa",
    "thames-suites-buenos-aires",
    "pestana-buenos-aires",
    "duque-boutique",
    "nh-latino",
    "hermoso-departamento-tres-ambientes-en-belgrano",
    "plaza-francia",
    "arc-recoleta-boutique-amp-spa",
    "nh-city",
    "recoleta-caba",
]

# Parallel accumulators shared by the three loops of this cell.
country, positive, negative = [], [], []

for hotel in hoteles_arg:
    bs_arg = BookingScrapper(hotel_id=hotel, n_reviews=3250, country_code="ar")
    c, p, n = bs_arg.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)


# Booking.com page names scraped with country code "uy".
hoteles_urg = [
    "dazzler-montevideo",
    "costanero-montevideo-mgallery",
    "hampton-by-hilton-montevideo-carrasco",
    "ibis-montevideo-montevideo",
    "nh-columbia-montevideo",
    "hyatt-centric-montevideo",
    "pocitos-plaza",
    "soro-montevideo-curio-collection-by-hilton",
    "armon-suites",
    "bit-design",
    "mercure-montevideo-punta-carretas",
    "cala-di-volpe-boutique",
    "regency-golf",
]

for hotel in hoteles_urg:
    bs_urg = BookingScrapper(hotel_id=hotel, n_reviews=3250, country_code="uy")
    c, p, n = bs_urg.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)


# Booking.com page names scraped with country code "py".
hoteles_par = [
    "esplendor-by-wyndham-asuncion",
    "dazzler-asuncion",
    "holiday-inn-express-asuncion-aviadores",
    "resort-yacht-y-golf-club-paraguayo",
    "sheraton-asuncion",
    "the-hub",
    "de-las-torres",
    "la-mision-boutique",
    "la-alondra-factoria",
    "best-location-dpto-de-2-habitaciones-en-asuncion",
    "casino-guarani-esplendor",
    "pantanal-inn",
    "asuncion-rent-suites",
    "granhotelpy",
    "villa-floreal-boutique",
    "crowne-plaza-asuncion",
]

for hotel in hoteles_par:
    bs_par = BookingScrapper(hotel_id=hotel, n_reviews=3250, country_code="py")
    c, p, n = bs_par.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# One row per scraped review; keep only rows where at least one of the two texts is non-empty.
df_aus = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_aus.Positiva != "") | (df_aus.Negativa != "")
# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_aus.loc[mask_not_empty].to_excel("austral.xlsx", index=False, engine="xlsxwriter")

### Andino

In [None]:
# Parallel accumulators: one entry per scraped review.
# NOTE(review): the following cell re-initialises country/positive/negative, so
# whatever is gathered here is dropped on a top-to-bottom run unless it was
# saved in between - confirm this is intended.
country, positive, negative = [], [], []

# Booking.com page names scraped with country code "ec" (first batch).
hoteles_ecu = [
    "arte",
    "montanita-chill-out-house-montanita",
    "stubel-suites-cafe",
    "colonial-house-quito",
    "juana-de-arco",
    "hostal-la-rosa",
    "wyndham-quito-airport",
    "hostal-rosita-latacunga",
    "hostal-kundalini",
    "oro-verde-manta",
    "hostal-la-orquidea",
    "crespo",
    "selina-quito",
    "hostal-los-nevados",
    "four-points-by-sheraton-cuenca",
    "airport",
]

for hotel in hoteles_ecu:
    bs_ecu = BookingScrapper(hotel_id=hotel, n_reviews=1500, country_code="ec")
    c, p, n = bs_ecu.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# Parallel accumulators shared by the three loops of this cell.
# NOTE(review): this reset discards anything a previous cell left in these
# lists - confirm earlier batches were saved before running this cell.
country = []
positive = []
negative = []

# Booking.com page names scraped with country code "ec" (second batch).
# "sonesta-guayaquil" and "radisson-guayaquil" used to be listed twice, which
# requested exactly the same URLs a second time; each id now appears once.
hoteles_ecu = [
    "finlandia-quito",
    "sheraton-quito-quito",
    "reina-isabel",
    "jw-marriott-quito",
    "dann-carlton-quito",
    "swissotel-quito",
    "ibis-quito",
    "plaza-grande",
    "radisson-quito",
    "hilton-colon-quito",
    "adamas-house-quito1",
    "la-casona-de-la-ronda",
    "the-penthouse-lodge",
    "wyndham-garden-quito",
    "old-town-quito-suites-quito",
    "casadelaspena",
    "wyndham-guayaquil",
    "grand-guayaquil",
    "sonesta-guayaquil",
    "unipark",
    "radisson-guayaquil",
    "hilton-colon-guayaquil",
]

for hotel in hoteles_ecu:
    bs_ecu = BookingScrapper(hotel, 1500, "ec")
    c, p, n = bs_ecu.scrap()

    country  += c
    positive += p
    negative += n


# Booking.com page names scraped with country code "pe".
hoteles_per = [
    "ramada-costa-del-sol-lima-airport",
    "trendy-host-mid-miraflores",
    "nm-lima-lima",
    "iberostar-selection-miraflores",
    "jw-marriott-lima",
    "waterfront-miraflores-next-to-marriott",
    "casa-republica-barranco",
    "belma-boutique-bed-and-breakfast",
    "costa-del-sol-wyndham-salaverry",
    "ac-lima-miraflores",
    "la-luna-inn",
    "aloft-lima-miraflores",
    "pullman-lima-miraflores",
    "country-club-lima",
    "courtyard-by-marriott-lima-miraflores",
    "arts-boutique-b",
    "quechua-hostal",
    "peramada-by-wyndham-costa-del-sol-cusco",
    "sonesta-cusco",
    "amaru-hostal-i",
    "cusco-cusco",
    "antigua-casona-san-blas",
    "casa-matara-cusco",
    "quechua-san-blas-ii",
    "casa-andina-private-collection-cusco",
    "hostal-el-triunfo",
    "ankawa",
    "waynapicchu",
    "heidinger",
    "la-hosteria",
    "diamond",
    "akas-apartments",
    "el-refugio-de-vichayito",
    "amp-casino-boulevard",
    "el-mariscal-cusco",
    "casona-plaza",
    "loki-del-mar",
    "del-pilar-miraflores",
    "monte-real",
    "cozy-room-cusco",
]

for hotel in hoteles_per:
    bs_per = BookingScrapper(hotel, 1500, "pe")
    c, p, n = bs_per.scrap()

    country  += c
    positive += p
    negative += n


# Booking.com page names scraped with country code "bo".
# "la-siesta" used to be listed twice; it now appears once.
hoteles_bol = [
    "terrandes",
    "sagarnaga",
    "york-b-amp-b",
    "jumari",
    "de-sal-casa-andina",
    "hostal-la-casa-del-sol",
    "onkel-inn-wagon-sleepbox-uyuni",
    "jardines-de-uyuni",
    "la-siesta",
    "residencial-ikandire-ii",
    "wild-rover-la-paz",
    "360-grados",
    "la-casona-boutique",
    "radisson-santa-cruz",
    "the-adventure-brew-b-amp-b",
    "kulturberlin",
    "apart-regina",
    "cesar-s-plaza",
    "moraine",
    "america-santa-cruz-de-la-sierra",
    "camino-real",
    "mi-pueblo-samary-boutique",
    "ritz-apart",
    "americana",
    "las-brisas",
    "hostal-ggranny",
    "selina-la-paz",
    "utama",
    "luxstone-executive-amp-suites",
    "le-ciel-d-uyuni",
]

for hotel in hoteles_bol:
    bs_bol = BookingScrapper(hotel, 1500, "bo")
    c, p, n = bs_bol.scrap()

    country  += c
    positive += p
    negative += n

In [None]:
# Reviews saved by an earlier run, if any. On a first run the workbook does not
# exist yet, so start from an empty frame instead of crashing.
try:
    df_and_original = pd.read_excel("andino.xlsx")
except FileNotFoundError:
    df_and_original = pd.DataFrame(columns=["País", "Positiva", "Negativa"])

# One row per freshly scraped review; keep only rows where at least one text is non-empty.
df_and = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_and.Positiva != "") | (df_and.Negativa != "")
df_and = df_and.loc[mask_not_empty]

# Merge with the saved reviews (new rows first) and drop exact duplicate rows.
if not df_and_original.empty:
    df_and = pd.concat([df_and, df_and_original])
df_and = df_and.drop_duplicates()

# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_and.to_excel("andino.xlsx", index=False, engine="xlsxwriter")

### Caribeño

In [None]:
# Parallel accumulators shared by the two loops of this cell.
# NOTE(review): the following cell re-initialises country/positive/negative, so
# whatever is gathered here is dropped on a top-to-bottom run unless it was
# saved in between - confirm this is intended.
country, positive, negative = [], [], []

# Booking.com page names scraped with country code "do".
hoteles_dom = [
    "barcelo-bavaro-palace-deluxe",
    "bavaro-princess-all-suites-resort-spa-casino",
    "barcelo-punta-cana",
    "catalonia-santo-domingo",
    "catalonia-punta-cana",
    "be-live-punta-cana",
    "weston-suites",
    "boca-chica-calle-duarte-numero-1",
    "occidental-grand-punta-cana",
    "be-live-grand-marien",
    "catalonia-royal-bavaro",
    "majestic-colonial-punta-cana",
    "alsol-luxury-village-all-inclusive",
    "hard-rock-hotel-and-casino-punta-cana-all-inclusive",
]

for hotel in hoteles_dom:
    bs_dom = BookingScrapper(hotel_id=hotel, n_reviews=3250, country_code="do")
    c, p, n = bs_dom.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)


# Booking.com page names scraped with country code "pr".
# NOTE(review): "san-juan-plaza.es" looks like "san-juan-plaza" with a locale
# suffix left in - verify that it is a valid page name.
hoteles_pr = [
    "best-western-plus-condado-palm-inn-amp-suites",
    "hyatt-place-san-juan-city-center",
    "coral-by-the-sea",
    "acacia-boutique",
    "la-concha-renaissance-san-juan-resort",
    "hyatt-house-san-juan",
    "courtyard-isla-verde-beach-resort",
    "san-juan-airport-and-casino",
    "atwindchimesinn",
    "san-juan-marriott-resort-and-stellaris-casino",
    "condado-vanderbilt-san-juan2",
    "sheraton-old-san-juan-casino",
    "el-canario-inn-condado",
    "embassy-suites-san-juan-casino",
    "condado-plaza-hilton",
    "tryp-by-wyndham-isla-verde",
    "san-juan-plaza.es",
    "boho-beach-club",
    "casa-grande-mountain-retreat",
    "alojamientos-casa-de-campo",
    "howard-johnson-ponce",
    "tropica-beach",
    "comfort-inn-suites-san-juan",
    "perichis",
    "que-chevere",
    "hyatt-place-bayamon",
    "four-points-by-sheraton-caguas-real",
    "hyatt-place-manati",
    "embassy-suites-dorado-del-mar-beach-golf-resort",
    "hyatt-hacienda-del-mar",
    "wyndham-grand-rio-mar-beach-resort-amp-spa",
    "hatillo-road-2-kilometer-84",
    "arecibo-inn",
    "candelero-beach-resort",
    "san-juan-plaza",
]

for hotel in hoteles_pr:
    bs_pr = BookingScrapper(hotel_id=hotel, n_reviews=3250, country_code="pr")
    c, p, n = bs_pr.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# Parallel accumulators: one entry per scraped review.
country, positive, negative = [], [], []

# Booking.com page names scraped with country code "pr" in this cell.
hoteles_pr = [
    "boho-beach-club",
    "casa-grande-mountain-retreat",
    "alojamientos-casa-de-campo",
    "howard-johnson-ponce",
    "tropica-beach",
    "comfort-inn-suites-san-juan",
    "perichis",
    "que-chevere",
    "hyatt-place-bayamon",
    "four-points-by-sheraton-caguas-real",
    "hyatt-place-manati",
    "embassy-suites-dorado-del-mar-beach-golf-resort",
    "hyatt-hacienda-del-mar",
    "wyndham-grand-rio-mar-beach-resort-amp-spa",
    "hatillo-road-2-kilometer-84",
    "arecibo-inn",
    "candelero-beach-resort",
    "san-juan-plaza",
]

for hotel in hoteles_pr:
    bs_pr = BookingScrapper(hotel_id=hotel, n_reviews=1500, country_code="pr")
    c, p, n = bs_pr.scrap()

    country.extend(c)
    positive.extend(p)
    negative.extend(n)

In [None]:
# Reviews saved by an earlier run, if any. On a first run the workbook does not
# exist yet, so start from an empty frame instead of crashing.
try:
    df_car_original = pd.read_excel("caribeno.xlsx")
except FileNotFoundError:
    df_car_original = pd.DataFrame(columns=["País", "Positiva", "Negativa"])

# One row per freshly scraped review; keep only rows where at least one text is non-empty.
df_car = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_car.Positiva != "") | (df_car.Negativa != "")
df_car = df_car.loc[mask_not_empty]

# Merge with the saved reviews (new rows first) and drop exact duplicate rows.
if not df_car_original.empty:
    df_car = pd.concat([df_car, df_car_original])
df_car = df_car.drop_duplicates()

# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_car.to_excel("caribeno.xlsx", index=False, engine="xlsxwriter")

### Chileno

In [None]:
# Parallel accumulators: one entry per scraped review.
country = []
positive = []
negative = []

# Booking.com page names scraped with country code "cl".
# "novapark" used to be listed twice, which requested exactly the same URLs a
# second time; it now appears once.
hoteles_chile = [
    "nodo",
    "novapark",
    "blue-tree-hotels-fundador",
    "brasilia-santiago",
    "hub-providencia",
    "suagon-suites-santiago",
    "apart-b",
    "apart-fraga",
    "ibis-budget-providencia",
    "crowne-plaza-santiago",
    "magnolia",
    "lindo-departamento-a-pasos-del-centro-comercial-costanera-center",
    "ibis-santiago-manquehue-norte",
    "arauco-studio-apartment",
    "santiago-park-plaza",
    "estudio-en-el-golf",
    "capital-bellet",
    "barcela3-suites",
    "nh-collection-casacostanera",
    "director-vitacura",
    "holiday-inn-santiago-airport",
    "kennedy-apartments-los-militares",
    "santiago-rent-apart-santiago",
    "lotus-blanc-studio-w-mountain-view-pool-and-gym",
    "180-boutique",
    "alto-del-sol-mejillones-express",
    "casablanca-spa-amp-wine",
    "keo-ovalle-casino-resort",
    "limari",
    "casa-wilson",
    "gran-germania",
]

for hotel in hoteles_chile:
    bs_chl = BookingScrapper(hotel, 1500, "cl")
    c, p, n = bs_chl.scrap()

    country  += c
    positive += p
    negative += n

In [None]:
# Reviews saved by an earlier run, if any. On a first run the workbook does not
# exist yet, so start from an empty frame instead of crashing.
try:
    df_chl_original = pd.read_excel("chileno.xlsx")
except FileNotFoundError:
    df_chl_original = pd.DataFrame(columns=["País", "Positiva", "Negativa"])

# One row per freshly scraped review; keep only rows where at least one text is non-empty.
df_chl = pd.DataFrame(list(zip(country, positive, negative)), columns=["País", "Positiva", "Negativa"])
mask_not_empty = (df_chl.Positiva != "") | (df_chl.Negativa != "")
df_chl = df_chl.loc[mask_not_empty]

# Merge with the saved reviews (new rows first) and drop exact duplicate rows.
if not df_chl_original.empty:
    df_chl = pd.concat([df_chl, df_chl_original])
df_chl = df_chl.drop_duplicates()

# `encoding` is no longer passed: DataFrame.to_excel deprecated it in pandas 1.5
# and removed it in pandas 2.0, where passing it raises a TypeError.
df_chl.to_excel("chileno.xlsx", index=False, engine="xlsxwriter")