In [1]:
from scrapy import Spider
from scrapy.crawler import CrawlerProcess

import logging
import os

In [2]:
def try_remove(filename):
    """Delete ``filename`` on a best-effort basis; never raises ``OSError``.

    A missing file is the expected case and is ignored silently. Any other
    ``OSError`` (for example a permission problem, or the path being a
    directory) is still swallowed so callers keep their best-effort
    semantics, but it is logged as a warning so that a stale file left on
    disk does not go unnoticed.

    Args:
        filename: Path of the file to delete.

    Returns:
        None.
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Nothing to clean up.
        pass
    except OSError as exc:
        # Use a named logger rather than logging.warning(): the latter would
        # call basicConfig() and install a root handler as a side effect.
        logging.getLogger(__name__).warning(
            'Could not remove %r: %s', filename, exc)

In [3]:
class BookingComOpinions(Spider):
    """Crawl Booking.com search results and emit one item per guest review.

    Crawl flow: search-results page(s) -> hotel page -> reviews page(s).
    The caller supplies ``start_urls`` and ``json_file`` as spider
    arguments; ``json_file`` is referenced by the ``FEED_URI`` setting.

    Each yielded item is a dict with the keys ``hotel``, ``publish_date``,
    ``rev_count``, ``rating`` and ``tags``. Missing scalar fields are
    emitted as empty strings.
    """
    name = 'BookingComOpinionsSpider'

    custom_settings = {
        # %(json_file)s is resolved from the spider argument of that name.
        'FEED_URI': '%(json_file)s',
        'FEED_FORMAT': 'json',
        'DOWNLOAD_DELAY': 3,
        'LOG_LEVEL': logging.DEBUG,
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 1
        },
        # Threshold that triggers the CloseSpider extension.
        'CLOSESPIDER_ITEMCOUNT': 10
    }

    def parse(self, response):
        """Handle a search-results page.

        Yields one request per hotel link (handled by ``parse_hotel``, with
        the hotel name carried in ``meta``) and one request per "next page"
        link (handled by this method again).
        """
        for hotel in response.css('div.sr_item'):
            # Default to '' so a listing without a name element does not
            # raise AttributeError on None.strip().
            hotel_name = hotel.css('span.sr-hotel__name::text').get('').strip()
            for hotel_link in hotel.css('.hotel_name_link.url'):
                yield response.follow(hotel_link, self.parse_hotel,
                                      meta={'hotel': hotel_name})

        for link in response.css('a[data-page-next]'):
            yield response.follow(link, self.parse)

    def parse_hotel(self, response):
        """Handle a hotel page: follow its "show all reviews" link(s)."""
        hotel_name = response.meta['hotel']
        for reviews_link in response.css('a.show_all_reviews_btn'):
            yield response.follow(reviews_link, self.parse_reviews,
                                  meta={'hotel': hotel_name})

    def parse_reviews(self, response):
        """Handle a reviews page.

        Yields one item dict per ``li.review_item`` element, then follows
        the next reviews page, if any.
        """
        hotel_name = response.meta['hotel']
        for item in response.css('li.review_item'):
            publish_date = item.css('meta[itemprop="datePublished"]'
                                    '::attr(content)').get('')
            reviewer = item.css('div.review_item_reviewer')
            rev_count = reviewer.css(
                'div.review_item_user_review_count::text').get('')

            review = item.css('div.review_item_review')
            rating = review.css('meta[itemprop="ratingValue"]'
                                '::attr(content)').get('')
            raw_tags = review.css('li.review_info_tag::text').getall()
            # Trim each tag and drop the ones that were whitespace only.
            tags = [tag for tag in map(str.strip, raw_tags) if tag]

            yield {
                'hotel': hotel_name,
                'publish_date': publish_date,
                'rev_count': rev_count,
                'rating': rating,
                'tags': tags
            }

        for next_page in response.css('a#review_next_page_link'):
            # Pass only our own key instead of the whole response.meta, so
            # keys added to this response's meta by Scrapy are not copied
            # onto the new request.
            yield response.follow(next_page, self.parse_reviews,
                                  meta={'hotel': hotel_name})

In [4]:
# Search-results page the crawl starts from.
ENTRY_URL = 'https://www.booking.com/searchresults.pl.html?ss=Podstrana'
# Output path for the scraped items; handed to the spider as `json_file`.
json_file = 'hotels.json'
# Delete any output left over from a previous run before crawling again.
try_remove(json_file) 

In [5]:
# Create the crawler process and schedule the spider on it.
process = CrawlerProcess()
# start_urls and json_file are forwarded to the spider as keyword arguments.
# As the last expression of the cell, the return value of crawl() is displayed.
process.crawl(BookingComOpinions, start_urls=[ENTRY_URL], json_file=json_file)

2019-04-10 23:48:34 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-10 23:48:34 [scrapy.utils.log] INFO: Versions: lxml 4.3.1.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 18.9.0, Python 3.7.2 (default, Dec 29 2018, 06:19:36) - [GCC 7.3.0], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Linux-4.9.0-8-amd64-x86_64-with-debian-9.8
2019-04-10 23:48:34 [scrapy.crawler] INFO: Overridden settings: {'CLOSESPIDER_ITEMCOUNT': 10, 'DOWNLOAD_DELAY': 3, 'FEED_FORMAT': 'json', 'FEED_URI': '%(json_file)s', 'LOG_LEVEL': 10}
2019-04-10 23:48:34 [scrapy.extensions.telnet] INFO: Telnet Password: ef9d2a3651d06c75
2019-04-10 23:48:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.closespider.CloseSpi

<Deferred at 0x7f6c0fc9b518>

In [6]:
# Run the crawl scheduled above; the crawl log is this cell's output.
# NOTE(review): re-running this cell in the same kernel presumably fails
# because the underlying Twisted reactor cannot be restarted — confirm.
process.start()

2019-04-10 23:48:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.booking.com/searchresults.pl.html?ss=Podstrana> (referer: None)
2019-04-10 23:48:38 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.booking.com/hotel/hr/sanantoniopodstrana.pl.html> from <GET https://www.booking.com/hotel/hr/sanantoniopodstrana.pl.html?from=searchresults%0A#hotelTmpl>
2019-04-10 23:48:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.booking.com/searchresults.pl.html?dest_id=-92454&dest_type=city&ss=Podstrana&offset=15> (referer: https://www.booking.com/searchresults.pl.html?ss=Podstrana)
2019-04-10 23:48:45 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.booking.com/hotel/hr/apartments-johnny.pl.html> from <GET https://www.booking.com/hotel/hr/apartments-johnny.pl.html?from=searchresults%0A#hotelTmpl>
2019-04-10 23:48:47 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.b

2019-04-10 23:50:42 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.booking.com/hotel/hr/apartments-mate-podstrana.pl.html> from <GET https://www.booking.com/hotel/hr/apartments-mate-podstrana.pl.html?from=searchresults%0A#hotelTmpl>
2019-04-10 23:50:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.booking.com/reviews/hr/hotel/apartments-9.pl.html> (referer: https://www.booking.com/hotel/hr/apartments-9.pl.html)
2019-04-10 23:50:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartments-9.pl.html>
{'hotel': 'Apartments 9', 'publish_date': '2018-08-31', 'rev_count': '\n3 opinie\n', 'rating': '8.8', 'tags': ['Wyjazd wakacyjny', 'Rodzina z małymi dziećmi', 'Apartament z widokiem na morze', '8 noclegów']}
2019-04-10 23:50:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartments-9.pl.html>
{'hotel': 'Apartments 9', 'publish_date': '2018-08-16', 'rev_count

2019-04-10 23:51:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartmani-nera.pl.html>
{'hotel': 'Apartments Nera', 'publish_date': '2018-06-09', 'rev_count': '\n5 opinii\n', 'rating': '10', 'tags': ['Wyjazd wakacyjny', 'W parze', 'Studio', '4 noclegi']}
2019-04-10 23:51:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartmani-nera.pl.html>
{'hotel': 'Apartments Nera', 'publish_date': '2018-07-17', 'rev_count': '\n38 opinii\n', 'rating': '10', 'tags': ['Rodzina z małymi dziećmi', '2 pokoje', '7 noclegów', 'Wysłana przez urządzenie mobilne']}
2019-04-10 23:51:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartmani-nera.pl.html>
{'hotel': 'Apartments Nera', 'publish_date': '2017-09-21', 'rev_count': '\n5 opinii\n', 'rating': '9.2', 'tags': ['Wyjazd wakacyjny', 'W parze', 'Studio z balkonem i widokiem na morze', '13 noclegów']}
2019-04-10 23:51:01 [scrapy.c

2019-04-10 23:51:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/split.pl.html>
{'hotel': 'Hotel Split', 'publish_date': '2017-06-12', 'rev_count': '\n50 opinii\n', 'rating': '5.8', 'tags': ['Wyjazd służbowy', 'Grupa', 'Pokój dwuosobowy typu Superior z 1 lub 2 łóżkami i widokiem na morze i balkonem', '2 noclegi']}
2019-04-10 23:51:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/split.pl.html>
{'hotel': 'Hotel Split', 'publish_date': '2017-10-09', 'rev_count': '\n3 opinie\n', 'rating': '9.5', 'tags': ['Wyjazd wakacyjny', 'W parze', 'Pokój Dwuosobowy typu Deluxe z widokiem na morze i tarasem', '5 noclegów']}
2019-04-10 23:51:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/split.pl.html>
{'hotel': 'Hotel Split', 'publish_date': '2017-05-08', 'rev_count': '\n32 opinie\n', 'rating': '8.8', 'tags': ['Wyjazd wakacyjny', 'Rodzina z małymi dziećmi', 'Pokój dwuosobowy 

2019-04-10 23:51:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.booking.com/reviews/hr/hotel/apartments-johnny.pl.html>
{'hotel': 'Apartments Johnny', 'publish_date': '2015-08-13', 'rev_count': '\n6 opinii\n', 'rating': '10', 'tags': ['Wyjazd wakacyjny', 'Grupa']}
2019-04-10 23:51:34 [scrapy.extensions.logstats] INFO: Crawled 26 pages (at 10 pages/min), scraped 66 items (at 66 items/min)
2019-04-10 23:51:35 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.booking.com/hotel/hr/apartment-rino.pl.html> from <GET https://www.booking.com/hotel/hr/apartment-rino.pl.html?from=searchresults%0A#hotelTmpl>
2019-04-10 23:51:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.booking.com/hotel/hr/apartments-mate-podstrana.pl.html> (referer: https://www.booking.com/searchresults.pl.html?dest_id=-92454&dest_type=city&ss=Podstrana&offset=15)
2019-04-10 23:51:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.booking.com/searchres

In [11]:
import pandas as pd;

In [8]:
# Load the scraped review items into a DataFrame.
# NOTE(review): this path duplicates the `json_file` value defined earlier —
# keep the two in sync if the output file is renamed.
df = pd.read_json('hotels.json')

In [12]:
# Turn the scraped reviewer review-count strings (e.g. '3 opinie', '38 opinii',
# surrounded by newlines) into numbers.
# Every non-digit character is dropped instead of stripping a hand-picked
# character set, so the result does not depend on which letters the
# word form happens to contain.
rev_count_digits = df['rev_count'].str.replace(r'\D+', '', regex=True)
# errors='coerce' maps entries with no digits at all (the spider emits '' when
# the count is missing) to NaN instead of raising. When every entry is valid
# the resulting dtype is still integer.
df['rev_count'] = pd.to_numeric(rev_count_digits, errors='coerce')

In [14]:
# Keep only the columns needed for the rating-per-tag analysis.
# The spider emits '' when a review has no rating; a single such value leaves
# the whole column as strings after loading, which would break the mean()
# computed later. Coerce explicitly so unparseable ratings become NaN.
# assign() also returns a new frame rather than a view of `df`.
subdf = df[['rating', 'tags']].assign(
    rating=lambda frame: pd.to_numeric(frame['rating'], errors='coerce')
)

In [17]:
# One 0/1 indicator column per distinct tag: join each row's tag list with
# '|' (the default separator get_dummies splits on), then expand.
tags_joined = subdf['tags'].str.join('|')
dum = tags_joined.str.get_dummies()

In [22]:
# Place the rating column side by side with the tag indicator columns;
# axis=1 concatenates column-wise, aligning rows on the index.
indicators = pd.concat([subdf['rating'], dum], axis=1)

In [24]:
# Reshape wide -> long: one row per (review, tag) pair, with the tag name in
# 'variable' and its 0/1 indicator in 'value'; 'rating' is repeated per row.
melted = indicators.melt(id_vars=['rating'])

In [28]:
# Boolean mask selecting the rows where the review actually carries the tag.
positive = melted['value'].eq(1)

In [29]:
# Keep only the (review, tag) rows flagged by the mask.
selected = melted.loc[positive]

In [31]:
# Per tag: average rating and number of rated reviews carrying that tag.
rating_by_tag = selected.groupby('variable')['rating']
aggregated = rating_by_tag.agg(['mean', 'count'])

In [32]:
# Show the ten most frequent tags, in ascending order of frequency.
aggregated.sort_values('count').iloc[-10:]

Unnamed: 0_level_0,mean,count
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Pokój Dwuosobowy z balkonem i widokiem na morze,9.1,5
Apartament – parter,9.84,5
6 noclegów,9.25,6
2 noclegi,7.557143,7
7 noclegów,9.42,10
Grupa,8.672727,11
Wysłana przez urządzenie mobilne,8.9625,16
Rodzina z małymi dziećmi,9.557143,21
W parze,9.013793,29
Wyjazd wakacyjny,9.153226,62
