# In [1]:
import pandas as pd
import re, requests
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import timeit

class spider104:
    """Selenium-driven scraper for job postings on www.104.com.tw.

    Each instance owns one headless Chrome session.  The methods are meant to
    be called in this order: ``openBrowser`` -> ``scrollDown`` ->
    ``clickNextPage`` -> ``getAllLinks``, then ``goToPage`` + ``scrapeData``
    once per job link, and finally ``closeBrowser``.
    """

    # Supported area names mapped to the area codes used in the search URL.
    AREA_CODES = {'台北市': '6001001000', '新北市': '6001002000', '新竹縣市': '6001006000'}
    SEARCH_URL = 'https://www.104.com.tw/jobs/search/?'

    def __init__(self, keyword, area):
        """Start a headless Chrome session and remember the search criteria.

        Args:
            keyword: Search keyword sent to the job search page.
            area: One of the keys of ``AREA_CODES``.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        # NOTE(review): passing the driver path positionally is the Selenium 3
        # call form; newer Selenium releases expect a Service object instead.
        # Confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
        self.keyword = keyword
        self.area = area

    def openBrowser(self):
        """Load the search-result page for the configured keyword and area.

        Only 新竹縣市, 台北市 and 新北市 are currently supported.

        Raises:
            KeyError: If ``self.area`` is not a key of ``AREA_CODES``.
        """
        my_params = {
            'ro': '1',  # full-time jobs only; use '0' for no restriction
            'keyword': self.keyword,  # search keyword
            'area': self.AREA_CODES[self.area],
            'isnew': '30',  # only postings updated within the last month
            'mode': 'l',  # list view
        }

        # Only the encoded URL is needed, so prepare the request locally
        # instead of sending a throwaway HTTP request before the browser
        # fetches the same page.
        url = requests.Request('GET', self.SEARCH_URL, params=my_params).prepare().url
        self.driver.get(url)

    def scrollDown(self, times=20, pause=0.6):
        """Scroll to the bottom of the page repeatedly.

        The original author's note says this scrolls down to page 15 of the
        results (presumably the page appends results as it is scrolled --
        TODO confirm).

        Args:
            times: Number of scroll-to-bottom steps.
            pause: Seconds to wait after each step.
        """
        sleep(0.5)
        for _ in range(times):
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(pause)

    def clickNextPage(self):
        """Click the "more pages" buttons one after another until that fails.

        The n-th iteration clicks the n-th ``js-more-page`` element.  The loop
        ends on the first error, which covers both running out of buttons
        (IndexError) and any WebDriver failure while clicking or scrolling.
        """
        driver = self.driver
        count = 0
        while True:
            try:
                # 'class name' is the locator string behind By.CLASS_NAME;
                # find_elements(by, value) exists in Selenium 3 and 4, while
                # find_elements_by_class_name was removed in Selenium 4.3.
                driver.find_elements('class name', 'js-more-page')[count].click()
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            except Exception:
                # Best effort: stop paging, but let KeyboardInterrupt and
                # SystemExit propagate (a bare except would swallow them).
                break
            sleep(0.6)
            count += 1

    def getAllLinks(self):
        """Return every ``<a class="js-job-link">`` tag on the current page.

        Returns:
            The BeautifulSoup result set of matching anchor tags; each tag's
            ``href`` attribute holds the link to one job posting.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        return soup.find_all('a', {'class': 'js-job-link'})

    def scrapeData(self, jobDataFrame):
        """Parse the job page currently loaded and append it as one row.

        Fields are picked by position from the ``job-description-table__data``
        and ``job-requirement-table__data`` blocks, so a page with a different
        layout raises (IndexError / AttributeError / KeyError).

        Args:
            jobDataFrame: DataFrame holding the rows collected so far.

        Returns:
            A new DataFrame: ``jobDataFrame`` plus one row for this page.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        jobDetails = soup.find_all('div', {'class': 'job-description-table__data'})
        requirements = soup.find_all('div', {'class': 'job-requirement-table__data'})

        row = {
            'title': soup.h1.attrs['title'],
            'company': soup.find('a', {'class': 'mr-6'}).attrs['title'],
            'jobDscrp': soup.find('p', {'class': 'text-break'}).text.replace('\r', '').replace('\n', ' '),
            'jobType': jobDetails[0].text.strip(),
            'salary': jobDetails[1].text.strip(),
            'location': jobDetails[3].text.strip(),
            'startTime': jobDetails[-2].text.strip(),
            'workExp': requirements[1].text.strip(),
            'edu': requirements[2].text.strip(),
            'fieldofStudy': requirements[3].text.strip(),
            'language': requirements[4].text.strip(),
            'tool': requirements[5].text.strip(),
            'skills': requirements[6].text.strip(),
            'others': requirements[-1].text.strip(),
        }
        df = pd.DataFrame(
            data=[row],
            columns=['title', 'company', 'jobDscrp', 'jobType', 'salary', 'location', 'startTime',
                     'workExp', 'edu', 'fieldofStudy', 'language', 'tool', 'skills', 'others'],
        )

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([jobDataFrame, df], ignore_index=True)

    def closeBrowser(self):
        """Shut down the browser session.

        Uses ``quit()`` rather than ``close()`` so the whole WebDriver session
        ends, not just the current window.
        """
        self.driver.quit()

    def goToPage(self, link):
        """Navigate the browser to ``link``."""
        self.driver.get(link)


# In [3]:
def _progress_message(done, total):
    """Return the progress line to print after ``done`` of ``total`` links, or None."""
    if done == total:
        return '任務完成！ 已抓取' + str(done) + '筆資料'
    for pct in (75, 50, 25):
        # Integer ceiling of total * pct / 100: the first count that reaches
        # the milestone.  Comparing integers makes each milestone fire exactly
        # once, which float equality on a rounded ratio does not guarantee.
        if done == (total * pct + 99) // 100:
            return '目前進度至 ' + str(pct) + '% 已抓取' + str(done) + '筆資料'
    return None


def spider(keyword, area):
    """Scrape every job posting matching ``keyword`` in ``area`` into a DataFrame.

    Opens the search page, loads as many result pages as possible, then visits
    each job link and parses it.  A page that fails to parse is retried once;
    if it fails again the error is printed and the page is skipped.  Progress
    is printed at 25%, 50%, 75% and on completion, counted in links processed.

    Args:
        keyword: Search keyword.
        area: Area name accepted by ``spider104``.

    Returns:
        A DataFrame with one row per successfully parsed job posting.
    """
    from urllib.parse import urljoin

    start = timeit.default_timer()
    crawler = spider104(keyword, area)
    jobsDF = pd.DataFrame()

    try:
        crawler.openBrowser()
        crawler.scrollDown()
        crawler.clickNextPage()
        links = crawler.getAllLinks()
        total = len(links)
        print('共有' + str(total) + '筆資料')

        for done, anchor in enumerate(links, start=1):
            # urljoin resolves protocol-relative ('//host/path'), relative and
            # already-absolute hrefs alike.
            crawler.goToPage(urljoin('https://www.104.com.tw', anchor.attrs['href']))
            sleep(0.35)

            for attempt in range(2):
                try:
                    jobsDF = crawler.scrapeData(jobsDF)
                    break
                except Exception as e:
                    # First failure: retry once.  Second failure: report and
                    # move on to the next link.
                    if attempt == 1:
                        print(e)

            message = _progress_message(done, total)
            if message is not None:
                print(message)
    finally:
        # Always release the browser, even if the crawl is interrupted.
        crawler.closeBrowser()

    stop = timeit.default_timer()
    print('Time: ', stop - start)

    return jobsDF