In [1]:
import requests
import re
import time
import math
import threading
import logging

from tqdm import trange, tqdm

from bs4 import BeautifulSoup
from IPython.display import clear_output

import csv
import json
import pprint
import pandas as pd

In [2]:
# Logging setup: INFO and above go to myLog.log, which is overwritten on each run.

logging_format = '%(asctime)s : %(message)s'
logging.basicConfig(
    filename='myLog.log',
    filemode='w',
    format=logging_format,
    level=logging.INFO,
)

## Preparing for Crawling

In [3]:
# Browser-like HTTP request headers.
my_headers = {
    # Adjacent string literals are joined by the parser, so the value stays a
    # single clean line. (A backslash continuation inside the quotes would
    # embed the next source line's indentation in the header value.)
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.8",
}

In [4]:
#Getting the number of pages

RESULTS_PER_PAGE = 50  # divisor that turns the total result count into a page count

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as results.
html = requests.get('https://distiller.com/search?official_status=official',
                    headers=my_headers, timeout=30)
html.raise_for_status()
bsObj = BeautifulSoup(html.text)

# The total is read as the second-to-last space-separated token of the
# pagination description text.
pagination_text = bsObj.find('span',
    {'class':'pagination-control__description'}).getText().strip()
total_results = int(pagination_text.split(' ')[-2])

page_count = math.ceil(total_results / RESULTS_PER_PAGE)

print(page_count)

166


## Link Crawler

In [8]:
# Address prefix of the paginated search listing; the crawler appends the page number.
url = "https://distiller.com/search?official_status=official&page="
# Accumulator for the spirit links found by the crawler.
url_list = list()

In [9]:
def getUrls(url):
    """Collect spirit detail links from every search-result page.

    Requests ``url`` + page number for pages 1..``page_count`` (module-level)
    and appends each absolute ``/spirits/...`` link to the module-level
    ``url_list``. A page that fails is logged, printed and skipped.

    Parameters
    ----------
    url : str
        Listing URL prefix; the page number is appended to it.
    """
    spirit_href = re.compile('^(/spirits/)')
    parsed = 0  # pages handled successfully so far (used in the log message)

    for page in trange(1, page_count + 1):

        try:
            # Timeout so one stalled connection cannot block the whole crawl;
            # raise_for_status so an HTTP error is reported instead of being
            # parsed as an (empty) result page.
            html = requests.get(url + str(page), timeout=30)
            html.raise_for_status()
            bsObj = BeautifulSoup(html.text)

            # Filtering on href means every returned tag carries that attribute.
            for link in bsObj.findAll('a', href=spirit_href):
                url_list.append('https://distiller.com{}'.format(link.attrs['href']))

            parsed += 1
            logging.info('Parsed {} of {} pages'.format(parsed, page_count))

            # Pause between successful requests.
            time.sleep(1)

        except Exception as e:
            # Best effort: record the failure and move on to the next page.
            logging.exception(e)
            print(e)


In [10]:
# Run the link crawler; found links accumulate in the module-level url_list.
getUrls(url)

100%|██████████| 166/166 [05:51<00:00,  2.12s/it]


In [11]:
# Collapse duplicate links, then report the unique count to the log and the cell output.
url_list = [*set(url_list)]
links_saved_msg = '{} links saved'.format(len(url_list))
logging.info(links_saved_msg)
print(links_saved_msg)

8297 links saved


In [12]:
# Wrap every link in its own one-element list (one CSV row per link),
# then display the second row as a spot check.
new_url_list = [[link] for link in url_list]
new_url_list[1]

['https://distiller.com/spirits/appleton-special-white-rum']

### Exporting as CSV

In [13]:
# Display the first collected link.
url_list[0]

'https://distiller.com/spirits/wheatley-vodka'

In [14]:
# Write the links to links.csv: a 'url' header row followed by one link per row
with open('links.csv', mode='w', newline='') as links_file:
    csv.writer(links_file, delimiter=',').writerows([['url']] + new_url_list)

## Main Crawler

In [21]:
# Shared scraper state: the collected records and the count of processed links.
data, exec_count = list(), 0

In [19]:
# Load the saved links back from links.csv (first column of every row)
with open('links.csv', 'r', newline='') as file:
    url_list = [row[0] for row in csv.reader(file)]

# Drop the header cell
url_list.remove('url')

In [22]:
# One lock shared by every worker thread. A lock created inside main() would be
# a brand-new object on each use and could never block another thread.
_data_lock = threading.Lock()


def _parse_flavor_profile(bsObj):
    """Return ``{flavor_name: int_score}`` read from the flavor-chart canvas tag.

    The text between the first ``{`` and the first ``}`` of the tag's markup is
    split into word tokens (letters, digits, underscore) that are paired up as
    name, score, name, score, ...

    Raises if the canvas or its ``{...}`` payload is missing, if no token is
    found, or if a score token is not an integer; the caller records ``None``.
    """
    canvas_markup = str(bsObj.find('canvas', {'class': ['js-flavor-profile-chart']}))
    raw_text = canvas_markup.split('{')[1].split('}')[0]

    # Tokenising the whole payload captures the last score in full, whatever
    # its number of digits.
    tokens = re.findall(r'\w+', raw_text)
    if not tokens:
        raise ValueError('flavor payload is empty')

    return dict(zip(tokens[0::2], (int(score) for score in tokens[1::2])))


def _scrape_spirit(url):
    """Download one spirit page and return its fields as a dict.

    Raises when the request fails or when a required field (name, type,
    brand/origin heading, cost level, ABV entry) cannot be located. Optional
    fields fall back to ``None``.
    """
    spirit_info = {
        'name': '',
        'type': '',
        'brand_name': '',
        'origin': '',
        'cost_level': 0,
        'age': 0,
        'abv': 0,
        'expert_rating': 0,
        'average_user_rating': 0,
        'user_comments': 0,
        'description': '',
        'tasting_notes': '',
        'reviewer': '',
        'flavor_profile': '',
    }

    # Timeout so a stalled connection cannot hang the worker; raise_for_status
    # so an HTTP error is logged as such rather than as a parsing failure.
    html = requests.get(url, timeout=30)
    html.raise_for_status()
    bsObj = BeautifulSoup(html.text)

    #name
    spirit_info['name'] = bsObj.find('h1', {'itemprop': 'name'}).string.strip()

    #type
    spirit_info['type'] = bsObj.find('h2',
        {'class': 'ultra-mini-headline type'}).string.strip()

    #brand-name and origin share one heading, separated by ' // '.
    # Without the separator, both fields receive the whole heading text.
    brand_heading = bsObj.find('h2', {'itemprop': 'brand_name'}).string.strip()
    heading_parts = brand_heading.split(' // ')
    spirit_info['brand_name'] = heading_parts[0]
    spirit_info['origin'] = heading_parts[1] if len(heading_parts) > 1 else brand_heading

    #cost-level: the single character following 'cost-' in the value div's markup
    value_markup = str(bsObj.find('div', {'class': 'value'}))
    spirit_info['cost_level'] = int(value_markup[value_markup.index('cost-') + 5])

    #age: last space-separated token of the age entry
    try:
        spirit_info['age'] = int(bsObj.find('li',
            class_='detail age').getText().strip().split(' ')[-1])
    except Exception:
        spirit_info['age'] = None

    #abv: the entry text with its first five characters dropped
    abv = bsObj.find('li', class_='detail abv').getText()[5:].strip()
    # str.isnumeric() is False for text containing a decimal point, so match
    # "digits[.digits]" explicitly; anything else is kept as the raw text.
    if re.fullmatch(r'\d+(?:\.\d+)?', abv):
        spirit_info['abv'] = float(abv)
    else:
        spirit_info['abv'] = abv

    #expert-rating
    # NOTE(review): reads exactly two characters, starting two positions after
    # the first '>' of the span markup -- presumably assumes a two-digit score;
    # confirm against the page layout.
    try:
        rating_markup = str(bsObj.find('span', {'class': 'expert-rating'}))
        rating_index = rating_markup.index('>') + 2
        spirit_info['expert_rating'] = int(rating_markup[rating_index:rating_index + 2])
    except Exception:
        spirit_info['expert_rating'] = None

    #average-user-rating: the page value multiplied by 20, rounded to 2 decimals
    try:
        spirit_info['average_user_rating'] = round(float(bsObj.find('span',
            {'itemprop': 'ratingValue'}).string) * 20, 2)
    except Exception:
        spirit_info['average_user_rating'] = None

    #user-comments
    try:
        spirit_info['user_comments'] = int(bsObj.find(('a', 'span'), {'class': 'count'}).string)
    except Exception:
        # Reset this field only; the rating parsed above must be left intact.
        spirit_info['user_comments'] = None

    #description
    try:
        spirit_info['description'] = bsObj.find('p', {'itemprop': 'description'}).string
    except Exception:
        spirit_info['description'] = None

    #tasting-notes
    try:
        spirit_info['tasting_notes'] = bsObj.find('p', {'itemprop': 'reviewBody'}).string.strip('"')
    except Exception:
        spirit_info['tasting_notes'] = None

    #reviewer
    try:
        spirit_info['reviewer'] = bsObj.find('a', {'itemprop': 'author'}).string.strip()
    except Exception:
        spirit_info['reviewer'] = None

    #flavor-profile
    try:
        spirit_info['flavor_profile'] = _parse_flavor_profile(bsObj)
    except Exception:
        spirit_info['flavor_profile'] = None

    return spirit_info


def main(start=0, end=-1, max_retries=None):
    """Scrape ``url_list[start:end]`` and append one record per page to ``data``.

    Intended to run in several threads at once: updates to the module-level
    ``data`` list and ``exec_count`` counter are guarded by a shared lock.

    Parameters
    ----------
    start, end : int or None
        Slice bounds into the module-level ``url_list``. With the default
        ``end=-1`` the slice leaves out the final URL; pass ``end=None`` or
        ``len(url_list)`` to include it.
    max_retries : int or None
        Number of retries allowed for a failing URL before it is skipped.
        ``None`` (default) retries indefinitely.

    Any exception raised while processing a URL is logged and printed, and the
    URL is retried after a 30-second pause.
    """
    global exec_count

    for url in tqdm(url_list[start:end]):

        failures = 0

        while True:
            try:
                spirit_info = _scrape_spirit(url)

                # Pause after each page before recording it.
                time.sleep(3)

                with _data_lock:
                    data.append(spirit_info)
                    exec_count += 1
                    # Snapshot under the lock so the progress output below
                    # reports this thread's own increment.
                    processed = exec_count

                if processed % 10 == 0:
                    clear_output()

                logging.info('Parsed {} of {} links'.format(processed, len(url_list)))

                break

            except Exception as e:
                logging.exception(e)
                print(e)

                failures += 1
                if max_retries is not None and failures > max_retries:
                    logging.error('Giving up on {} after {} failed attempts'.format(url, failures))
                    break

                time.sleep(30)
            

## Multi-threading

In [23]:
# Split the link list into `seg` contiguous slices

seg = 10

# Start offset of each slice, followed by the overall end offset.
seg_list = [int(len(url_list) / seg * i) for i in range(seg)]
seg_list.append(len(url_list))

print(seg_list)
print(len(seg_list))

[0, 827, 1654, 2481, 3308, 4135, 4962, 5789, 6616, 7443, 8271]
11


In [12]:
# #test
# data = []
# exec_count = 0

# main(0, 8118)

In [24]:
# Launch one worker thread per consecutive pair of segment boundaries.
threads = []
num = len(seg_list)

for lower, upper in zip(seg_list[:num-1], seg_list[1:]):
    worker = threading.Thread(target = main, args = (lower, upper))
    threads.append(worker)
    worker.start()

# Wait for all worker threads to finish
for worker in threads:
    worker.join()

logging.info("Done")
print("Done")

100%|██████████| 827/827 [1:16:19<00:00,  5.54s/it]

Done





In [26]:
# Report how many records were collected and the container type.
print(len(data), type(data))

8271 <class 'list'>


In [27]:
# Show every field of the most recently appended record.
last_record = data[-1]
for key, value in last_record.items():
    print('{}: \n {} \n'.format(key, value))

name: 
 Old Bones Bourbon 10 Year Reserve 

type: 
 Bourbon 

brand_name: 
 Backbone Bourbon 

origin: 
 Indiana (Bottled in Kentucky), USA 

cost_level: 
 4 

age: 
 None 

abv: 
 55.0 

expert_rating: 
 None 

average_user_rating: 
 72.2 

user_comments: 
 20 

description: 
 This limited release, high-rye bourbon from Backbone Bourbon Company was distilled in Lawrenceburg, Indiana (MGP-sourced) from a mash bill of 55% corn, 40% rye, and 5% barley. Old Bones Bourbon 10 Year Reserve was bottled in Bardstown, Kentucky at a high proof of 55% ABV, after aging a decade in new, charred American oak barrels. 

tasting_notes: 
 None 

reviewer: 
 None 

flavor_profile: 
 None 



### Exporting as CSV

In [47]:
# CSV column names, taken from the keys of the first record.
fieldnames = [*data[0]]
print(fieldnames)

['name', 'type', 'brand_name', 'origin', 'cost_level', 'age', 'abv', 'expert_rating', 'average_user_rating', 'user_comments', 'description', 'tasting_notes', 'reviewer', 'flavor_profile']


In [72]:
#Output as csv

# Build the YYYYMMDD stamp in this cell so the file name is complete and the
# cell does not rely on a variable assigned in another cell.
csv_date_stamp = time.strftime("%Y%m%d", time.localtime())

# utf-8 is set explicitly: the platform default encoding may not be able to
# represent every character in the scraped text.
with open(f'distiller_{csv_date_stamp}.csv', 'w', newline='', encoding='utf-8') as file:

    writer = csv.DictWriter(file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(data)

### Exporting as JSON

In [29]:
# Serialise the records as indented JSON text (keys sorted, non-ASCII characters kept as-is)
json_data = json.dumps(
    data,
    indent=4,
    sort_keys=True,
    ensure_ascii=False,
)

In [109]:
# Today's local date as a compact YYYYMMDD string: take the date half of the
# timestamp and drop its dashes.
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
day_part = timestamp.split(' ')[0]
date = day_part.replace('-', '')
print(date)

20200330


In [70]:
#Output as JSON
# json_data is built with ensure_ascii=False, so it can contain non-ASCII
# characters. Writing it as utf-8 avoids depending on the platform's default
# text encoding.
with open(f'distiller_{date}.json', 'w', encoding='utf-8') as file:
    file.write(json_data)