## I. Practice 'yield'

In [1]:
def practice():
    for i in range(10):
        print('hello')
        yield i

In [2]:
practice()

<generator object practice at 0x1084292a0>

In [3]:
list(practice())

hello
hello
hello
hello
hello
hello
hello
hello
hello
hello


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [4]:
for q in practice():
    print(q)

hello
0
hello
1
hello
2
hello
3
hello
4
hello
5
hello
6
hello
7
hello
8
hello
9


## II. Implement Scrapy

In [5]:
import scrapy
from scrapy.crawler import CrawlerProcess


class TouristTheSpider(scrapy.Spider):
    name = "Tourist"
    
    
    start_urls = [
        'https://www.tourradar.com/deals/'
        ]
    def parse(self, response):
        links= response.xpath('//div[contains(@class, "con")]//a[@class="pad"]/@href').extract()
        for link in links:
            #crawl the first ten pages only
            for number in range(1, 10, 1):
                base_url = 'https://www.tourradar.com'
                page_url = base_url + link +'?page=' +str(number)
                request = scrapy.Request(page_url, callback=self.parse_page)
                yield request
                
    def parse_page(self, response):

        # Identifying the information we want from each page
        days = response.xpath('//div[contains(@class, "br ")]//dd[contains(@class,"mob")]/text()').extract()
        texts = response.xpath('//div[contains(@class, "bm")]//h4[contains(@itemprop,"name")]/text()').extract()
        savings = response.xpath('//div[contains(@class, "br ")]//dd[contains(@class,"reg")]/text()').extract()
        prices = response.xpath('//div[contains(@class, "br ")]//span[contains(@class,"prv")]/text()').extract()
        reviews = response.xpath('//div[contains(@class, "bm")]//a[contains(@class, "blank tourLink")]//span/text()').extract()
       
        #there is only one headline per page
        country = response.xpath('//div[contains(@class, "stat")]//h2/text()').extract_first()
                # Yield a dictionary with the values we want.
            
        
        for i in range(len(texts)):
            day = days[i]
            text = texts[i]
            saving = savings[i]
            price = prices[i]
            review = reviews[i]
            out = {'Headline': text,
                   'Duration': day,
                   'Price': price,
                   'Savings': saving,
                    'Reviews': review,
                    'Region': country}
            yield out 
                   

                    
                    
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'deals_spider.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'Inna Munroe\'s Crawler (innafomina43@gmai.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,    

})
                                         

# Starting the crawler with our spider.
process.crawl(TouristTheSpider)
process.start()
print('Done!')

Done!


In [6]:
import pandas as pd


deals=pd.read_json('deals_spider.json', orient='records')

In [7]:
#convert it our data to a dataframe
df = pd.DataFrame(deals)

In [8]:
df.shape

(748, 6)

In [13]:
df.head(50)

Unnamed: 0,Duration,Headline,Price,Region,Reviews,Savings
0,13,Myanmar Experience,983,\n Page 3 of 250+ Asia Special Offers\n,9 reviews - Excellent,\n $382\n ...
1,14,Discover Japan,3631,\n Page 2 of 250+ Asia Special Offers\n,20 reviews - Good,\n $819\n ...
2,10,One Life Adventures - Philippines,990,\n 250+ Asia Special Offers\n,873 reviews - Excellent,\n $95\n ...
3,10,Bali Bucket List - 10 Day Tour #TickItOffWithUs,677,\n Page 3 of 250+ Asia Special Offers\n,29 reviews - Excellent,\n $73\n ...
4,11,Epic Japan,2121,\n Page 3 of 250+ Asia Special Offers\n,8 reviews - Excellent,\n $478\n ...
5,15,Best of Myanmar,2056,\n Page 3 of 250+ Asia Special Offers\n,8 reviews - Good,\n $199\n ...
6,15,Everest Base Camp,1190,\n Page 3 of 250+ Asia Special Offers\n,135 reviews - Excellent,\n $360\n ...
7,6,Explore Golden Triangle,406,\n Page 3 of 250+ Asia Special Offers\n,19 reviews - Excellent,\n $39\n ...
8,11,Vietnam Traveller,665,\n Page 3 of 250+ Asia Special Offers\n,6 reviews - Excellent,\n $150\n ...
9,15,Essential India,1264,\n Page 3 of 250+ Asia Special Offers\n,14 reviews - Good,\n $285\n ...


In [14]:
df.tail(50)

Unnamed: 0,Duration,Headline,Price,Region,Reviews,Savings
698,14,Jakarta to Ubud,1745,\n Page 7 of 250+ Asia Special Offers\n,5 reviews - Good,\n $275\n ...
699,15,Unforgettable India,1391,\n Page 7 of 250+ Asia Special Offers\n,40 reviews - Excellent,\n $134\n ...
700,14,Iran Adventure,3239,\n Page 7 of 250+ Asia Special Offers\n,20 reviews - Excellent,\n $731\n ...
701,10,Southern India Tour with Beaches and Houseboat...,838,\n Page 7 of 250+ Asia Special Offers\n,14 reviews - Excellent,\n $62\n ...
702,13,Mountains & Mystics of India,1213,\n Page 7 of 250+ Asia Special Offers\n,14 reviews - Excellent,\n $117\n ...
703,15,China Active Adventure,1631,\n Page 7 of 250+ Asia Special Offers\n,5 reviews - Good,\n $368\n ...
704,14,Bangkok To Singapore Adventure,2012,\n Page 7 of 250+ Asia Special Offers\n,23 reviews - Excellent,\n $405\n
705,9,Best of Java,978,\n Page 7 of 250+ Asia Special Offers\n,2 reviews - Excellent,\n $221\n ...
706,27,Big Indochina Adventure (23 destinations),2869,\n Page 6 of 250+ Asia Special Offers\n,25 reviews - Excellent,\n $578\n
707,8,Classic Bali,733,\n Page 6 of 250+ Asia Special Offers\n,8 reviews - Excellent,\n $165\n ...
