### Import libraries

In [None]:
import os
import scrapy
import logging
from scrapy.crawler import CrawlerProcess

### Scrape Booking.com

In [None]:
# Destinations to search on Booking.com, in crawl order. A city's 1-based
# position in this list is what the spider stores as the "fkey" of each hotel.
cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [None]:
class BookingSpider(scrapy.Spider):
    """Collect hotel listings from Booking.com for every entry in ``cities``.

    Cities are handled one after another:

    * ``parse`` submits the landing-page search form for the current city,
    * ``after_search`` walks the result pages and schedules one request per
      hotel card,
    * ``parse_page`` emits the final item from the hotel's own page.

    NOTE(review): the CSS class names used as selectors look auto-generated
    and are presumably unstable across site releases -- verify before a run.
    """

    name = "Booking"
    start_urls = ["https://www.booking.com/index.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQm4ARfIAQzYAQPoAQGIAgGoAgO4AqbJwIsGwAIB0gIkNmVmMzgzMDMtNzU2OS00Yjg1LTlkNTctY2YzZDE2YWRjMzkw2AIE4AIB;sid=02025d674e66a5c598e16c66ad1e8862;keep_landing=1;sb_price_type=total;sig=v1RAqdEvO5&"]

    # Increment applied to the "offset" query parameter between result pages.
    PAGE_SIZE = 25
    # Keep paginating a city until at least this many hotels were scheduled.
    MIN_HOTELS_PER_CITY = 30

    # Hotels scheduled so far for the current city (reset on city change).
    count_hotel = 0
    # Index into ``cities`` of the city being crawled; -1 means "not started".
    count_city = -1

    def parse(self, response):
        """Advance to the next city and submit the search form for it.

        Returns a ``FormRequest`` built from the landing page's form, with the
        ``ss`` field set to the city name, handled by ``after_search``.
        """
        self.count_city += 1

        return scrapy.FormRequest.from_response(
            response,
            formdata={"ss": cities[self.count_city]},
            callback=self.after_search,
        )

    def after_search(self, response):
        """Schedule hotel pages from one result page, then paginate or move on.

        Yields one request per hotel card (handled by ``parse_page``), followed
        by at most one navigation request: the next result page of the same
        city, or the landing page again to start the next city.
        """
        hotels = response.css("div._5d6c618c8")

        # Both values are constant for the whole page, so compute them once.
        fkey = self.count_city + 1
        city = cities[self.count_city]

        for hotel in hotels:
            # ``.attrib`` is an empty dict when the link is missing; indexing
            # it would raise and abort the rest of this callback.
            href = hotel.css("a.fb01724e5b").attrib.get("href")
            if not href:
                continue

            self.count_hotel += 1

            yield scrapy.Request(
                # Resolve relative links; absolute ones pass through unchanged.
                response.urljoin(href),
                callback=self.parse_page,
                meta={
                    "h_fkey": fkey,
                    "h_city": city,
                    "h_name": hotel.css("div.fde444d7ef::text").get(),
                    "h_href": href,
                    "h_desc": hotel.css("div._29c344764 div._4abc4c3d5::text").get(),
                    "h_scor": hotel.css("div.bd528f9ea6::text").get(),
                },
            )

        if hotels and self.count_hotel < self.MIN_HOTELS_PER_CITY:
            # Build every page URL from the first result URL so the offset
            # parameter is set once per request instead of being appended
            # again and again to the previous page's URL.
            search_url = response.meta.get("search_url", response.url)
            offset = response.meta.get("offset", 0) + self.PAGE_SIZE

            yield response.follow(
                "{}&offset={}".format(search_url, offset),
                callback=self.after_search,
                meta={"search_url": search_url, "offset": offset},
            )
            return

        if not hotels:
            # An empty page means there is nothing left to paginate; without
            # this stop the spider would request new offsets forever.
            logging.info(
                "No hotel found for %s at %s; stopping pagination",
                city,
                response.url,
            )

        if self.count_city < len(cities) - 1:
            self.count_hotel = 0

            # Same landing URL as the start request, so bypass the dupe filter.
            yield response.follow(
                self.start_urls[0],
                callback=self.parse,
                dont_filter=True,
            )

    def parse_page(self, response):
        """Emit the item for one hotel page.

        Listing data comes from ``response.meta`` (set in ``after_search``);
        latitude and longitude are read from the page's
        ``data-atlas-latlng`` attribute and are ``None`` when that attribute
        is missing or does not contain two comma-separated values.
        """
        latlng = response.css("a.jq_tooltip::attr(data-atlas-latlng)").get()

        lati = long_ = None
        if latlng:
            parts = latlng.split(",")
            if len(parts) >= 2:
                lati, long_ = parts[0], parts[1]

        yield {
            "fkey": response.meta.get("h_fkey"),
            "city": response.meta.get("h_city"),
            "name": response.meta.get("h_name"),
            "hurl": response.meta.get("h_href"),
            "scor": response.meta.get("h_scor"),
            "desc": response.meta.get("h_desc"),
            "lati": lati,
            "long": long_,
        }

In [None]:
# Feed file the scraped items are exported to.
filename = "src/scrape_booking.json"

# Remove the output of a previous run before exporting again.
# ``os.listdir("./")`` only returns bare entry names of the current directory,
# so it can never contain a path such as "src/..."; test the path itself.
if os.path.isfile(filename):
    os.remove(filename)

process = CrawlerProcess(
    settings={
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
        "LOG_LEVEL": logging.INFO,
        "FEEDS": {filename: {"format": "json"}}
    }
)

# Register the spider, then block until the crawl is finished.
process.crawl(BookingSpider)
process.start()