# Task
#### Scrape Japanese whiskies from [The Whisky Exchange – Japanese whisky](https://www.thewhiskyexchange.com/c/35/japanese-whisky)


## Imports

In [58]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import csv
import os
import re
%config Completer.use_jedi= False

## Catalog class

In [6]:
class Catalog:
    """A single catalog (listing) page and the product links found on it.

    Constructing an instance immediately downloads ``link``; the parsed page
    is stored in ``soup`` and the absolute product URLs in ``whisky_links``.
    """

    # Prefix prepended to every product href found on the page.
    BASE_URL = 'https://www.thewhiskyexchange.com'
    # Browser-like User-Agent sent with the page request.
    HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    # Seconds to wait for the server; without a timeout requests can block forever.
    TIMEOUT = 30

    def __init__(self, link):
        self.link = link
        self.soup = self.get_soup()
        self.whisky_links = self.get_links()

    def get_soup(self):
        """Download ``self.link`` and return it parsed as a BeautifulSoup tree.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status,
                so an error page is not silently parsed as an empty catalog.
        """
        response = requests.get(self.link, headers=self.HEADERS, timeout=self.TIMEOUT)
        response.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        return BeautifulSoup(response.text, 'html.parser')

    def get_links(self):
        """Return the absolute URL of every ``a.product-card`` on the page.

        Anchors without an ``href`` attribute are skipped.
        """
        hrefs = (atag.get('href') for atag in self.soup.select('a.product-card'))
        return [self.BASE_URL + href for href in hrefs if href]

## Whisky class

In [54]:
class Whisky:
    """A single product page and the data scraped from it.

    Constructing an instance immediately downloads ``link``; the parsed page
    is stored in ``soup`` and the extracted fields in ``info``.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # Seconds to wait for the server; without a timeout requests can block forever.
    TIMEOUT = 30
    # Characters that are path separators or not allowed in file names on
    # common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]+')

    def __init__(self, link):
        self.link = link
        self.soup = self.get_soup()
        self.info = self.get_info()

    def get_soup(self):
        """Download ``self.link`` and return it parsed as a BeautifulSoup tree.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        response = requests.get(self.link, headers=self.HEADERS, timeout=self.TIMEOUT)
        response.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        return BeautifulSoup(response.text, 'html.parser')

    def _optional_text(self, selector, default):
        """Return the stripped text of the first match of ``selector``, or ``default`` if nothing matches."""
        element = self.soup.select_one(selector)
        if element is None:
            return default
        return element.get_text(strip=True)

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` with characters that are unsafe in file names replaced by ``_``."""
        return Whisky._UNSAFE_FILENAME_CHARS.sub('_', name).strip() or 'unnamed'

    def get_info(self):
        """Extract the product fields from the parsed page.

        Returns:
            dict with the keys ``name``, ``whisky_type``, ``concentration``,
            ``rating``, ``review_count``, ``price`` and ``description``;
            all values are strings.

        Raises:
            AttributeError: if the name, type, data or price element is
                missing from the page (these fields are required).
        """
        # Required fields: a missing element fails loudly.
        name = self.soup.select_one('.product-main__name').get_text(strip=True)
        whisky_type = self.soup.select_one('.product-main__meta').get_text(strip=True)
        concentration = self.soup.select_one('.product-main__data').get_text(strip=True)
        price = self.soup.select_one('.product-action__price').get_text(strip=True)

        # Optional fields: only a *missing element* falls back to the default;
        # any other error propagates instead of being swallowed.
        description = self._optional_text('.product-main__description p', 'no description')
        rating = self._optional_text('.review-overview__content span', 'no rating')

        # The count is the run of digits immediately before a non-breaking space.
        count_element = self.soup.select_one('.review-overview__count')
        count_match = None
        if count_element is not None:
            count_match = re.search(r'(\d+)\xa0', count_element.get_text())
        review_count = count_match.group(1) if count_match else '0'

        whisky = {
            'name':name,
            'whisky_type':whisky_type,
            'concentration':concentration,
            'rating':rating,
            'review_count':review_count,
            'price':price,
            'description':description
            }

        return whisky

    def save_image(self, path):
        """Download the product image into directory ``path``.

        The file is named after the product (unsafe characters replaced by
        ``_``) with a ``.jpg`` extension.

        Raises:
            requests.HTTPError: if the image request returns a 4xx/5xx status.
        """
        img_link = self.soup.select_one('img.product-main__image')['src']
        # Download before opening the file so a failed request does not leave
        # an empty .jpg behind.
        response = requests.get(img_link, headers=self.HEADERS, timeout=self.TIMEOUT)
        response.raise_for_status()
        file_name = self._safe_filename(self.info['name']) + '.jpg'
        with open(os.path.join(path, file_name), mode='wb') as f:
            f.write(response.content)
        

In [56]:
# Rows are written with csv.DictWriter, one dict per whisky
CSV_FILE_NAME = 'Whisky.csv'

# column names of the CSV file, in output order
keys = ['name', 'whisky_type', 'concentration', 'rating', 'review_count', 'price', 'description']

# open the output file; it stays open so the scraping cell can write rows to it
f = open(CSV_FILE_NAME, 'w', newline='', encoding='utf-8')

# build the writer on top of the open file and emit the header row
writer = csv.DictWriter(f, fieldnames=keys)
writer.writeheader()

70

In [57]:
# folder for the product images and the range of catalog pages to scrape
IMAGES_DIR = 'images'
FIRST_PAGE, LAST_PAGE = 1, 3

# create images folder; exist_ok lets this cell be re-run without failing
os.makedirs(IMAGES_DIR, exist_ok=True)

try:
    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        catalog_url = f'https://www.thewhiskyexchange.com/c/35/japanese-whisky?pg={page_num}'
        catalog = Catalog(catalog_url)
        links = catalog.whisky_links

        # create whisky objects concurrently (each one downloads its own page)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            whisky_objects = list(executor.map(Whisky, links))

        # output data of each whisky into the csv file
        for whisky in whisky_objects:
            writer.writerow(whisky.info)

        print(f'Page {page_num} Completed info ✔', end=' ')

        # save images concurrently; keep the futures so failed downloads are
        # reported instead of being silently discarded
        with concurrent.futures.ThreadPoolExecutor() as executor:
            image_jobs = [executor.submit(whisky.save_image, IMAGES_DIR) for whisky in whisky_objects]

        # all jobs are finished once the executor's with-block has exited
        failed = sum(1 for job in image_jobs if job.exception() is not None)
        if failed:
            print(f'images ✘ ({failed} of {len(image_jobs)} failed)')
        else:
            print('images ✔')
finally:
    # close the csv file even if a page fails part-way through
    f.close()

Page 1 Completed info ✔ images ✔
Page 2 Completed info ✔ images ✔
Page 3 Completed info ✔ images ✔
