## Import Modules

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import threading
import queue
import time
import re

## Necessary Data

In [2]:
# City/county names that get typed, one at a time, into the store-locator
# search box. Order is preserved because it is the order jobs are queued in.
# NOTE(review): Taipei City and New Taipei City are not in this list --
# confirm that the omission is intentional.
city_name_list = [
    "桃園市", "基隆市", "新竹市", "新竹縣", "宜蘭縣",
    "臺中市", "苗栗縣", "彰化縣", "南投縣", "雲林縣",
    "臺南市", "高雄市", "嘉義市", "嘉義縣", "屏東縣",
    "臺東縣", "花蓮縣", "澎湖縣", "金門縣", "連江縣",
]

In [3]:
# Shared FIFO work queue: every Worker thread pulls its next city from here.
# Re-run this cell before re-launching the workers, since they drain the queue.
job_queue = queue.Queue()

# The queue is unbounded, so a non-blocking put can never fail here.
for city in city_name_list:
    job_queue.put_nowait(city)

## Worker Class

In [4]:
class Worker(threading.Thread):
    """Thread that scrapes store listings from the where-to-buy page, city by city.

    Each worker owns one Chrome WebDriver session. It repeatedly takes a city
    name from the shared ``job_queue``, submits it through the page's search
    form, parses every ``div.shop`` element of the result and writes the
    collected rows to ``<city_name>.xlsx`` in the current working directory.

    Parameters
    ----------
    job_queue : queue.Queue
        Queue of city names shared by all workers.
    base_url : str
        URL of the store-locator page that is loaded for every city.
    driver_path : str, optional
        Path of the chromedriver executable. Defaults to the location the
        notebook was originally written for.
    """

    # Raw strings: "\c" and "\d" are invalid escape sequences in normal literals.
    DEFAULT_DRIVER_PATH = r"C:\chromedriver_win32\chromedriver"
    PHONE_PATTERN = re.compile(r"\d+-\d+")

    # Fixed pause (seconds) after loading the page and after submitting the form.
    PAGE_WAIT_SECONDS = 3.5

    def __init__(self, job_queue, base_url, driver_path=DEFAULT_DRIVER_PATH):
        super().__init__()

        self.job_queue = job_queue
        self.base_url = base_url
        self.driver_path = driver_path
        self.city_name = None
        self.driver = None

        # Deliberately no ``self.name = ...`` here: ``name`` is the thread-name
        # property of threading.Thread, and assigning None renamed every
        # worker thread to the string "None".
        self.init_store_space()

    def init_store_space(self):
        """Reset the per-city result containers."""
        self.final_dict = {}
        self.name_list = []
        self.phone_list = []
        self.address_list = []

    def run(self):
        """Process cities from the queue until it is empty, then quit the browser."""
        self.init_webdriver()
        try:
            while True:
                # get_nowait() instead of "qsize() then get()": with several
                # workers another thread can take the last item between the
                # two calls, which left this thread blocked in get() forever.
                try:
                    self.city_name = self.job_queue.get_nowait()
                except queue.Empty:
                    break

                self.init_store_space()

                # "is False" keeps overrides that return None working.
                if self.process_page() is False:
                    continue
                if self.scrape_data() is False:
                    continue

                self.convert_df()
        finally:
            # Release the browser even if a city raised unexpectedly.
            self.destruct_web_driver()

    def init_webdriver(self):
        """Start the Chrome WebDriver session used by this worker."""
        self.driver = webdriver.Chrome(self.driver_path)

    def destruct_web_driver(self):
        """Shut down the WebDriver session, if one was started."""
        if self.driver is not None:
            # quit() ends the whole session (all windows and the chromedriver
            # process); close() only closed the current window.
            self.driver.quit()
            self.driver = None

    def process_page(self):
        """Load the locator page and submit a search for ``self.city_name``.

        Returns
        -------
        bool
            True when the form was submitted, False when any step failed.
        """
        self.driver.get(self.base_url)
        time.sleep(self.PAGE_WAIT_SECONDS)

        try:
            # find_element("xpath", ...) is what find_element_by_xpath() calls
            # internally, and it also exists in Selenium releases that dropped
            # the find_element_by_* helpers.
            self.driver.find_element("xpath", '//*[@id="category1"]/option[1]').click()
            self.driver.find_element("xpath", '//*[@id="a-search"]').send_keys(self.city_name)
            self.driver.find_element("xpath", '//*[@id="searchsubmit"]').click()
            time.sleep(self.PAGE_WAIT_SECONDS)
            return True
        except Exception as exc:
            # Best effort: report the city and the reason, then let run() move on.
            print(self.city_name + " " + "failed in process_page.", repr(exc))
            return False

    @staticmethod
    def _parse_shop(shop):
        """Extract ``(name, address, phone)`` from one ``div.shop`` element.

        Any field that cannot be found is returned as None.
        """
        header = shop.find("h3", class_="subheader3")
        name = header.text if header is not None else None

        # Looked up once; the address is read from the 2nd paragraph and the
        # phone number from the 4th.
        paragraphs = shop.find_all("p", class_="bodycopy1")
        address = paragraphs[1].text if len(paragraphs) > 1 else None

        phone = None
        if len(paragraphs) > 3:
            match = Worker.PHONE_PATTERN.search(paragraphs[3].text)
            if match is not None:
                phone = match.group(0)

        return name, address, phone

    def scrape_data(self):
        """Parse the current result page into the per-city lists.

        Returns
        -------
        bool
            False when the page contains no ``div.shop`` element, True otherwise.
        """
        html = self.driver.page_source
        shops = BeautifulSoup(html, "html.parser").find_all("div", class_="shop")

        if len(shops) == 0:
            print(self.city_name + " " + "failed in scrape_data.")
            return False

        print(self.city_name + " " + "Scraped !")
        for shop in tqdm(shops):
            name, address, phone = self._parse_shop(shop)
            self.name_list.append(name)
            self.address_list.append(address)
            self.phone_list.append(phone)

        self.final_dict["公司名稱"] = self.name_list
        self.final_dict["地址"] = self.address_list
        self.final_dict["電話"] = self.phone_list
        return True

    def convert_df(self):
        """Write the collected rows for the current city to ``<city_name>.xlsx``."""
        df = pd.DataFrame.from_dict(self.final_dict)
        df.to_excel(self.city_name + ".xlsx")
        

## Main Driver

In [5]:
url = "https://www.panasonic.com/tw/consumer/where-to-buy.html"


def _spawn_worker():
    """Create one Worker bound to the shared queue and start it right away."""
    worker = Worker(job_queue, url)
    worker.start()
    return worker


# Five workers, each created and started in turn. This cell returns as soon
# as the threads are running; it does not wait for them to finish.
worker1, worker2, worker3, worker4, worker5 = (_spawn_worker() for _ in range(5))

金門 Scraped !


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


