## Import Modules

In [1]:
import requests
import queue
import threading
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
import pickle
import pandas as pd
import time

## Build a Job Queue

In [2]:
# Maps each product-category key to the URL of that category's first listing
# page (the trailing "P=1" query value is the page number).
base_url_dict = dict(
    japan_refrigerator="https://www.dcity.com.tw/Large/1904020047/?P=1",
    refrigerator="https://www.dcity.com.tw/Large/1904020048/?P=1",
    freezer="https://www.dcity.com.tw/Large/1904020049/?P=1",
    upright_washing_machine="https://www.dcity.com.tw/Large/1904020050/?P=1",
    drum_washing_machine="https://www.dcity.com.tw/Large/1904020051/?P=1",
    cloth_dryer="https://www.dcity.com.tw/Large/1904020052/?P=1",
    vacuum="https://www.dcity.com.tw/Life/1904020020/?P=1",
)

In [3]:
# Shared FIFO of (category key, first-page URL) jobs; maxsize=0 means unbounded.
job_queue = queue.Queue(maxsize=0)

In [4]:
# Enqueue one (category key, first-page URL) tuple per category.
for category_key, first_page_url in base_url_dict.items():
    job_queue.put((category_key, first_page_url))

## Build the Worker

In [5]:
class Worker(threading.Thread):
    """Thread that takes (key, url) category jobs from a shared queue and scrapes them.

    For each job the worker starts its own Chrome session, walks the paginated
    category listing to collect product URLs, visits every product page, and
    stores the result both as a pickle file named ``<key>`` and as a
    spreadsheet named ``<key>.xlsx``.

    The listing URL of a job is expected to end with its page number
    (e.g. ``...?P=1``); pagination rewrites that trailing number.
    """

    # Path to the chromedriver executable. Raw string so the backslashes are
    # literal instead of relying on "\c" being an unrecognised escape.
    CHROMEDRIVER_PATH = r"C:\chromedriver_win32\chromedriver"
    # Prefix prepended to the site-relative links/images found in the pages.
    SITE_ROOT = "https://www.dcity.com.tw"
    # Seconds to sleep after each page load before reading the page source.
    PAGE_LOAD_WAIT = 3

    def __init__(self, job_queue):
        """Create a worker bound to ``job_queue`` (a queue of ``(key, url)`` items)."""
        super().__init__()

        self.key = None
        self.url = None
        self.webdriver = None

        self.job_queue = job_queue

        # Make the result containers exist even before the first job is taken.
        self.init_list()

    def init_list(self):
        """Reset all per-job result containers to empty."""
        self.final_dict = {}
        self.product_url_list = []
        self.product_name_list = []
        self.product_price_list = []
        self.product_info_list = []
        self.product_id_list = []
        self.product_img_list = []

    def run(self):
        """Process jobs until the queue is empty.

        Jobs are taken without blocking: checking ``empty()`` and then calling
        a blocking ``get()`` is not atomic, so with several workers one of
        them could wait forever on a queue another worker just drained.
        A failure inside one job is reported and the worker moves on to the
        next job; the browser is always shut down afterwards.
        """
        while True:
            try:
                self.key, self.url = self.get_job(block=False)
            except queue.Empty:
                return

            self.init_list()

            try:
                self.activate_webdriver()
                self.scrape_data()
            except Exception as error:
                print("{} failed: {!r}".format(self.key, error))
            finally:
                self.deactivate_webdriver()

    def get_job(self, block=True):
        """Take the next job from the queue and return it as ``(key, url)``.

        Args:
            block: passed to ``Queue.get``. With the default ``True`` the call
                waits for a job; with ``False`` it raises ``queue.Empty`` when
                no job is available.
        """
        job = self.job_queue.get(block=block)
        return job[0], job[1]

    def activate_webdriver(self):
        """Start a Chrome session and keep it in ``self.webdriver``."""
        self.webdriver = webdriver.Chrome(self.CHROMEDRIVER_PATH)

    def deactivate_webdriver(self):
        """Shut down the current browser session, if there is one.

        ``quit()`` is used rather than ``close()`` so the whole driver session
        is ended, not only the current window.
        """
        if self.webdriver is None:
            return
        try:
            self.webdriver.quit()
        finally:
            self.webdriver = None

    def scrape_data(self):
        """Run the whole pipeline for the current job.

        Reads the advertised product count, collects product URLs page by page
        until that many were found, scrapes each product and saves the result.
        Returns early (saving nothing) when the product count cannot be read
        or a listing page cannot be parsed.
        """
        total_product_num = self.get_total_product_num()

        if total_product_num is None:
            return

        # Strip the trailing page number once, so later pages are built from a
        # stable prefix. Cutting a single character on every iteration breaks
        # as soon as the page number has two digits ("P=10" -> "P=111").
        listing_prefix = self.url.rstrip("0123456789")
        current_page_num = 0

        while len(self.product_url_list) < total_product_num:

            current_page_num += 1
            self.url = listing_prefix + str(current_page_num)

            collected_before = len(self.product_url_list)
            success = self.get_all_product_url()

            if success is False:
                return

            if len(self.product_url_list) == collected_before:
                # A page without new products would otherwise loop forever.
                print("{} found no products on page {}; stopping at {} of {}".format(
                    self.key, current_page_num, collected_before, total_product_num))
                break

        self.scrape_each_product()

        self.build_final_dict()

        self.save_dict_to_pickle()

        self.convert_dict_to_df_to_excel()

    def _load_soup(self, url):
        """Open ``url`` in the browser, wait, and return the parsed page source."""
        self.webdriver.get(url)
        time.sleep(self.PAGE_LOAD_WAIT)
        return BeautifulSoup(self.webdriver.page_source, "html.parser")

    @staticmethod
    def _extract_or_fail(extractor, product_url):
        """Return ``extractor()``, or the ``"Fail: <url>"`` marker if it raises."""
        try:
            return extractor()
        except Exception:
            return "Fail: {}".format(product_url)

    @staticmethod
    def _bottom_info(soup):
        """Return the words of the ``proList-1`` block from "規格說明" onwards.

        Each word is prefixed with a newline. Raises if the block or the
        marker word is missing.
        """
        words = soup.find("div", id="proList-1").text.strip().split()
        start = words.index("規格說明")
        return "".join("\n" + word for word in words[start:])

    def get_total_product_num(self):
        """Load the current listing URL and return the product count shown there.

        Returns:
            The count as ``int``, or ``None`` (after printing a message) when
            it cannot be located or parsed.
        """
        print(self.url)
        soup = self._load_soup(self.url)

        try:
            result = int(soup.find("span", class_="btn-group page").find_all("b")[1].text)
        except Exception:
            print("{} cannot get total product num".format(self.key))
            result = None

        return result

    def get_all_product_url(self):
        """Append the product URLs found on the current listing page.

        Returns:
            ``True`` on success, ``False`` (after printing a message) when a
            product tile could not be parsed.
        """
        soup = self._load_soup(self.url)

        try:
            for product in soup.find_all("div", class_="col-xs-6 col-md-4 col-lg-3"):
                self.product_url_list.append(self.SITE_ROOT + product.find("a")["href"])
            return True
        except Exception:
            print("{} cannot get all product url".format(self.key))
            return False

    def scrape_each_product(self):
        """Visit every collected product URL and record its fields.

        Each field that cannot be extracted is recorded as ``"Fail: <url>"``
        so the result lists stay the same length.
        """
        print("Worker's Key: {}".format(self.key))
        for product_url in tqdm(self.product_url_list):

            soup = self._load_soup(product_url)

            # name
            name = self._extract_or_fail(
                lambda: soup.find("h1", itemprop="name").text, product_url)
            self.product_name_list.append(name)

            # price: the text is split on newlines, so a successful value is a
            # list of strings while the failure marker is a single string.
            price = self._extract_or_fail(
                lambda: soup.find("span", class_="price").text.split("\n"), product_url)
            self.product_price_list.append(price)

            # id
            _id = self._extract_or_fail(
                lambda: soup.find("div", class_="pro-code").find("span").text, product_url)
            self.product_id_list.append(_id)

            # img
            img = self._extract_or_fail(
                lambda: self.SITE_ROOT + soup.find("img", itemprop="image")["src"], product_url)
            self.product_img_list.append(img)

            # info: short description on top, specification words below.
            top_info = self._extract_or_fail(
                lambda: soup.find("h2", class_="description", itemprop="description").text.strip(),
                product_url)
            bottom_info = self._extract_or_fail(
                lambda: self._bottom_info(soup), product_url)

            info = top_info + "\n\n" + bottom_info
            self.product_info_list.append(info)

    def build_final_dict(self):
        """Assemble the per-field lists into ``self.final_dict``.

        Keys (kept in Chinese because they become the spreadsheet headers):
        品名 = name, 價格 = price, 詳情 = details, 貨號 = item number, 圖片 = image.
        """
        self.final_dict["品名"] = self.product_name_list
        self.final_dict["價格"] = self.product_price_list
        self.final_dict["詳情"] = self.product_info_list
        self.final_dict["貨號"] = self.product_id_list
        self.final_dict["圖片"] = self.product_img_list

    def save_dict_to_pickle(self):
        """Pickle ``self.final_dict`` into a file named after the job key."""
        with open(self.key, "wb") as file:
            pickle.dump(self.final_dict, file)

    def convert_dict_to_df_to_excel(self):
        """Write ``self.final_dict`` as a DataFrame to ``<key>.xlsx``."""
        df = pd.DataFrame.from_dict(self.final_dict)
        df.to_excel("{}.xlsx".format(self.key))

## Main

In [6]:
# Three workers share the same job queue; start them all, then wait for all.
worker1, worker2, worker3 = (Worker(job_queue) for _ in range(3))

for worker in (worker1, worker2, worker3):
    worker.start()

for worker in (worker1, worker2, worker3):
    worker.join()

print("Done")

https://www.dcity.com.tw/Large/1904020047/?P=1
https://www.dcity.com.tw/Large/1904020049/?P=1
https://www.dcity.com.tw/Large/1904020048/?P=1
Worker's Key: freezer


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))

Worker's Key: japan_refrigerator


HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))

Worker's Key: refrigerator


HBox(children=(FloatProgress(value=0.0, max=145.0), HTML(value='')))


https://www.dcity.com.tw/Large/1904020050/?P=1
Worker's Key: upright_washing_machine


HBox(children=(FloatProgress(value=0.0, max=88.0), HTML(value='')))


https://www.dcity.com.tw/Large/1904020051/?P=1
Worker's Key: drum_washing_machine


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


https://www.dcity.com.tw/Large/1904020052/?P=1
Worker's Key: cloth_dryer


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


https://www.dcity.com.tw/Life/1904020020/?P=1
Worker's Key: vacuum


HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




Done
