# Web Crawler - Selenium

## Selenium
### 主要2大應用
1. 自動化測試
    - 用於網頁測試，**可以直接驅動瀏覽器，模擬使用者實際操作網站**
    
2. 網頁抓取 (Web Scraping)
    - 有些網站的 HTML 內容是由 JavaScript 在執行時動態產生或修改的，因此無法單純地利用 Requests 與 BeautifulSoup 套件取得完整內容。
    - 這時就可以利用 Selenium 模擬使用者操作、讓瀏覽器實際執行 JavaScript 後再擷取資料；取得頁面原始碼後，也可以再搭配 BeautifulSoup 套件來解析網頁內容。
    
***

### WebDriver
可以用來控制網頁的行為，Selenium支援的瀏覽器都有各自對應的驅動程式。利用驅動程式(driver)才可使瀏覽器自動化。
- Google Chrome: [chromedriver](https://sites.google.com/chromium.org/driver/)
- Mozilla Firefox: [geckodriver](https://firefox-source-docs.mozilla.org/testing/geckodriver/Support.html)
- Microsoft Edge: [Microsoft Edge Driver/Microsoft WebDriver](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/)
- Internet Explorer 11: [IEDriverServer](https://www.selenium.dev/documentation/ie_driver_server/)
- Safari: [safaridriver](https://developer.apple.com/documentation/webkit/about_webdriver_for_safari)
- Opera: [operachromiumdriver](https://github.com/operasoftware/operachromiumdriver/releases)


## 實作3: Selenium

### 爬取的網站: http://autceshap1.corpnet.auo.com/ISO50001/WebForm/Report/reportviewer.aspx?report_id=d800670f-0066-4831-92b6-bebeb579dd05

### Step1. import 套件

In [None]:
import os
import time
import datetime
import numpy as np
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Step2. 設定參數

In [None]:
# === __init__ ===
# Excerpt of Robot.__init__ (see "Main File"): declares every attribute with an
# empty placeholder. The credentials and paths are filled in by the readConfig step.
self.username = ''
self.password = ''

# Folder where the Excel file is stored
# => '\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\3.0智慧製造評估專案\2022年\2022-01 淨零碳排專案'
# self.download_path = '\\\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\\3.0智慧製造評估專案\\2022年\\2022-01 淨零碳排專案'
self.download_path = ''

# Log file path
# self.log_path = './log_file.txt'
self.log_path = ''

# WebDriver handle; assigned by the setDriver step.
self.driver = None

# `filename` is the name the stored export is kept under; `latestFilename` is
# the name the renameFile step looks for after a new download.
self.filename = 'Action Plans (Form 3).xls'
self.latestFilename = 'Action Plans (Form 3) (1).xls'

### Step3. 讀取參數

In [None]:
# === readConfig ===
# Excerpt of Robot.readConfig: loads credentials and paths from ./config.ini.

# Create the ConfigParser
config = configparser.ConfigParser()

# Read the INI settings file
config.read('./config.ini', encoding='UTF-8')

# Fetch the settings; every value comes back as str
# print(config['user']['account'])
# print(config['user']['pwd'])

self.username = config['user']['account']
self.password = config['user']['pwd']

# Download folder, e.g. '\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\3.0智慧製造評估專案\2022年\2022-01 淨零碳排專案'
self.download_path = config['path']['download_path']
# Log file path
self.log_path = config['path']['log_path']

### Step4. 設定瀏覽器參數 (chromedriver.exe 要與 py 檔放在同一個資料夾)

In [None]:
# === setDriver ===
# Excerpt of Robot.setDriver: configures headless Chrome and opens the report page.

# Configure the browser to use
chromeOptions = webdriver.ChromeOptions()
# NOTE(review): the download directory is hard-coded here instead of using
# self.download_path, which the renameFile step relies on — confirm they match.
prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': '\\\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\\3.0智慧製造評估專案\\2022年\\2022-01 淨零碳排專案'}
chromeOptions.add_experimental_option('prefs', prefs)
chromeOptions.add_experimental_option('useAutomationExtension', False)

# Run the browser in the background instead of the foreground (no visible window)
chromeOptions.add_argument('--headless')

# chromedriver.exe must be placed in the same folder as the .py file
chrome_path = ".\chromedriver.exe"

# Open the site to crawl
# NOTE(review): `executable_path` is deprecated in Selenium 4 — confirm the installed version.
self.driver = webdriver.Chrome(executable_path = chrome_path, options=chromeOptions)
self.driver.get("http://autceshap1.corpnet.auo.com/ISO50001/WebForm/Report/reportviewer.aspx?report_id=d800670f-0066-4831-92b6-bebeb579dd05")



### Step5. 設定登入

In [None]:
# === auo_login ===
# Excerpt of Robot.auo_login(user_id, pwd_id, click_id): the three arguments are
# the HTML element ids of the account field, the password field and the login button.
# The find_element_by_* helpers were removed in Selenium 4.3, so the elements are
# located with find_element(By.ID, ...), which works on older versions as well.

driver = self.driver
# Type the account name
driver.find_element(By.ID, user_id).send_keys(self.username)
# Type the password
driver.find_element(By.ID, pwd_id).send_keys(self.password)

# Click the login button
driver.find_element(By.ID, click_id).click()

### Step6. 刪除舊檔與重新命名

In [None]:
# === deleteOld ===
# Excerpt of Robot.deleteOld: removes the previously stored export if it exists.
try:
    if os.path.isfile(self.download_path + '\\' + self.filename):
        os.remove(self.download_path  + '\\' + self.filename)
        print("Delete old file success!")

except Exception as e:
    # Record the failure in the log file as well as on the console.
    with open(self.log_path, 'a') as f:
        f.write(str(datetime.datetime.now()) + ', Delete old file error: ' + str(e.args) + '\n')
    print("Delete old file error: ", str(e))

In [None]:
# === renameFile ===

# Only act when the freshly downloaded file "Action Plans (Form 3) (1).xls" exists,
# then move it over "Action Plans (Form 3).xls".
# Checking for the new file first avoids ending up with no file at all when the
# download fails.
# os.replace overwrites an existing target and also works when the old file is
# missing; the previous remove-then-rename sequence raised in that case and left
# the new file un-renamed.

try:
    if os.path.isfile(self.download_path + '\\' + self.latestFilename):
        os.replace((self.download_path + '\\' + self.latestFilename), (self.download_path + '\\' + self.filename))
        print("Rename file success!")

except Exception as e:
    # Record the failure in the log file as well as on the console.
    with open(self.log_path, 'a') as f:
        f.write(str(datetime.datetime.now()) + ', Rename file error: ' + str(e.args) + '\n')
    print("Rename file error: ", str(e))

### Step7. 下載excel

In [None]:
# Excerpt of Robot.getExcel: log in, click the export button, then close the tab.
driver = self.driver
self.auo_login('txtAccount_E', 'txtPwd_E', 'btnLogin_E')

try:
    start = time.time()

    # Wait (up to 10 s) until the export button is present in the page
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "btnExportExcel")))

    # Click "Export All Data to Excel" for the Action Plans (energy-saving data)
    # NOTE(review): the find_elements_by_* helpers were removed in Selenium 4.3 —
    # confirm the installed version.
    exportExcel = driver.find_elements_by_xpath("//input[@name='btnExportExcel' and @value='Export All Data to Excel']")[0]
    exportExcel.click()
    # Fixed pause after the click before the tab is closed
    time.sleep(5)

    end = time.time()

    # Close the tab opened by the driver
    driver.close()

    # Close every page of the driver
    # driver.quit()

    print("======== exportExcel Success ========")

    with open(self.log_path, 'a') as f:
        f.write(str(datetime.datetime.now()) + ', Export excelData success, take time: ' + str(end-start) + '\n')

except Exception as e:
    print("=-=-=-=- exportExcel Failed -=-=-=-=")

    with open(self.log_path, 'a') as f:
        f.write(str(datetime.datetime.now()) + ', Export excelData failed: ' + str(e.args) + '\n')

    # Retry by calling getExcel again; note there is no limit on the number of retries.
    self.getExcel()

***

## Main File

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 14 16:35:09 2022

@author: ErnieWu
"""
import os
import time
import datetime
import numpy as np
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class Robot():
    """Selenium robot that logs in to the report page and exports the Action Plans Excel file.

    Typical use: ``readConfig()`` -> ``setDriver()`` -> ``getExcel()`` ->
    ``renameFile()``.
    """

    # Page that hosts the "Export All Data to Excel" button.
    REPORT_URL = "http://autceshap1.corpnet.auo.com/ISO50001/WebForm/Report/reportviewer.aspx?report_id=d800670f-0066-4831-92b6-bebeb579dd05"

    # Download folder used when config.ini does not provide one (the value that
    # used to be hard-coded in setDriver). Raw string: backslashes are literal.
    DEFAULT_DOWNLOAD_PATH = r'\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\3.0智慧製造評估專案\2022年\2022-01 淨零碳排專案'

    def __init__(self):
        # Login credentials, filled in by readConfig().
        self.username = ''
        self.password = ''

        # Folder where the Excel file is stored, filled in by readConfig().
        self.download_path = ''

        # Log file path (e.g. './log_file.txt'), filled in by readConfig().
        self.log_path = ''

        # WebDriver handle, created by setDriver().
        self.driver = None

        # Name the export is kept under, and the name renameFile() looks for
        # after a new download.
        self.filename = 'Action Plans (Form 3).xls'
        self.latestFilename = 'Action Plans (Form 3) (1).xls'

    def _log(self, message):
        """Append a timestamped line to the log file, falling back to stdout.

        Never raises: logging is called from error handlers, and log_path may
        still be empty when readConfig() itself is what failed.
        """
        line = str(datetime.datetime.now()) + ', ' + message + '\n'
        if self.log_path:
            try:
                with open(self.log_path, 'a') as f:
                    f.write(line)
                return
            except (OSError, UnicodeError) as e:
                print('Failed to write log file: ' + str(e))
        print(line, end='')

    def _download_dir(self):
        """Return the configured download folder, or the default share if unset."""
        return self.download_path or self.DEFAULT_DOWNLOAD_PATH

    def _path(self, name):
        """Return the full path of `name` inside the download folder."""
        return os.path.join(self._download_dir(), name)

    def readConfig(self):
        """Load credentials and paths from ./config.ini.

        Failures (missing file, missing section/key, undecodable file) are
        logged and printed; the attributes then keep their previous values.
        """
        try:
            config = configparser.ConfigParser()

            # config.read() silently skips files it cannot open and returns
            # the list of files it did read, so an empty list means "missing".
            if not config.read('./config.ini', encoding='UTF-8'):
                raise FileNotFoundError('./config.ini not found')

            # Every value read by ConfigParser is a str.
            username = config['user']['account']
            password = config['user']['pwd']
            download_path = config['path']['download_path']
            log_path = config['path']['log_path']

        except (OSError, KeyError, UnicodeError, configparser.Error) as e:
            self._log('Failed to read Config.ini: ' + str(e.args))
            print('Failed to read Config.ini: ' + str(e))
            return

        # Assign only after everything was read, so a partial config does not
        # leave the object half-updated.
        self.username = username
        self.password = password
        self.download_path = download_path
        self.log_path = log_path

    def setDriver(self):
        """Start headless Chrome configured to download into the download folder and open the report page."""
        chromeOptions = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_settings.popups': 0,
            # Same folder renameFile()/deleteOld() operate on.
            'download.default_directory': self._download_dir(),
        }
        chromeOptions.add_experimental_option('prefs', prefs)
        chromeOptions.add_experimental_option('useAutomationExtension', False)

        # Run the browser in the background (no visible window).
        chromeOptions.add_argument('--headless')

        # chromedriver.exe must sit in the same folder as this .py file.
        chrome_path = r".\chromedriver.exe"

        # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour
        # of a Service object — confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(executable_path=chrome_path, options=chromeOptions)
        self.driver.get(self.REPORT_URL)

    # UAC login
    def auo_login(self, user_id, pwd_id, click_id):
        """Fill in the login form and submit it.

        Args:
            user_id: HTML id of the account input.
            pwd_id: HTML id of the password input.
            click_id: HTML id of the login button.
        """
        driver = self.driver

        # Clear first so a retry does not append to text typed earlier.
        account_field = driver.find_element(By.ID, user_id)
        account_field.clear()
        account_field.send_keys(self.username)

        password_field = driver.find_element(By.ID, pwd_id)
        password_field.clear()
        password_field.send_keys(self.password)

        # Click the login button.
        driver.find_element(By.ID, click_id).click()

    # Delete the previous file before downloading (superseded by renameFile; kept as a backup)
    def deleteOld(self):
        """Remove the stored export if it exists; failures are logged, not raised."""
        target = self._path(self.filename)
        try:
            if os.path.isfile(target):
                os.remove(target)
                print("Delete old file success!")

        except OSError as e:
            self._log('Delete old file error: ' + str(e.args))
            print("Delete old file error: ", str(e))

    def renameFile(self):
        """Move the newly downloaded '(1)' file over the stored export.

        Nothing happens unless the new file exists, so a failed download never
        leaves the folder without a file. os.replace overwrites an existing
        target and also works when the old file is absent.
        """
        latest = self._path(self.latestFilename)
        target = self._path(self.filename)
        try:
            if os.path.isfile(latest):
                os.replace(latest, target)
                print("Rename file success!")

        except OSError as e:
            self._log('Rename file error: ' + str(e.args))
            print("Rename file error: ", str(e))

    def getExcel(self, max_retries=3):
        """Log in, click "Export All Data to Excel" and close the tab.

        Args:
            max_retries: maximum number of attempts (at least one is made).

        Returns:
            True when an attempt succeeded, False when every attempt failed.
        """
        driver = self.driver

        for attempt in range(1, max(1, max_retries) + 1):
            try:
                if attempt > 1:
                    # Start the retry from a freshly loaded page.
                    driver.get(self.REPORT_URL)

                # Log in only when the login form is shown.
                # NOTE(review): assumes the form is absent once a session is
                # already logged in — confirm against the site.
                if driver.find_elements(By.ID, 'txtAccount_E'):
                    self.auo_login('txtAccount_E', 'txtPwd_E', 'btnLogin_E')

                start = time.time()

                # Wait (up to 10 s) for the export button to appear.
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "btnExportExcel")))

                # Click the Action Plans export button.
                exportExcel = driver.find_element(By.XPATH, "//input[@name='btnExportExcel' and @value='Export All Data to Excel']")
                exportExcel.click()
                # Fixed pause after the click before the tab is closed.
                time.sleep(5)

                end = time.time()

                # Close the tab opened by the driver.
                driver.close()

            except Exception as e:
                # Boundary handler: any Selenium failure counts as a failed attempt.
                print("=-=-=-=- exportExcel Failed -=-=-=-=")
                self._log('Export excelData failed: ' + str(e.args))
                continue

            print("======== exportExcel Success ========")
            self._log('Export excelData success, take time: ' + str(end - start))
            return True

        return False
            

if __name__ =='__main__':
    robot = Robot()
    robot.readConfig()
    robot.setDriver()
    try:
        #robot.deleteOld()
        robot.getExcel()

        # Wait for the download to finish
        time.sleep(5)
        robot.renameFile()
    finally:
        # Shut the driver down so no chromedriver process is left running.
        # quit() can raise if the session already ended when the tab was
        # closed, so a failure here is reported rather than propagated.
        try:
            robot.driver.quit()
        except Exception as e:
            print("Driver quit error: ", str(e))

### Bat檔撰寫

In [None]:
REM Folder that contains the .py file (/d also switches the drive when needed)
cd /d C:\Users\XXXXXX\Documents\Action_Plans
REM Run the .py file
python Robot.py

REM Without the next line chromedriver keeps running and the cmd window does not close when the script finishes
taskkill /F /FI "imagename eq chromedriver.exe"

## (補) 設定檔撰寫

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 11 09:16:22 2022

@author: ErnieWu
"""

import configparser

class generateConfig():
    """Write (and optionally print) the config.ini template used by the robot."""

    def __init__(self):
        # Location of the INI file, relative to the current working directory.
        self.file_path = './config.ini'

    def writeConfig(self):
        """Create the INI file with a [user] and a [path] section."""
        config = configparser.ConfigParser()

        # Credentials section (placeholders to be replaced by the user).
        config['user'] = {'account': 'NT帳號',
                          'pwd': 'NT密碼'}

        # Paths section. The UNC path must be a raw string: in a normal
        # literal '\\' collapses to one backslash and '\3' / '\202' are read
        # as octal escapes, which corrupted the written download_path.
        config['path'] = {'download_path': r'\\auo\gfs\DEF000\MMFE00\MMFE00\LCD1智慧製造\3.0智慧製造評估專案\2022年\2022-01 淨零碳排專案',
                          'log_path': './log_file.txt'}

        # Write the INI file as UTF-8 (the values contain non-ASCII text).
        with open(self.file_path, 'w', encoding='UTF-8') as configfile:
            config.write(configfile)

    def readConfig(self):
        """Print the contents of the INI file (demonstration of the read API)."""
        config = configparser.ConfigParser()

        # Read with the same encoding writeConfig() used.
        config.read(self.file_path, encoding='UTF-8')

        # Fetch a single value
        print(config['user']['account'])

        # List all sections
        print(config.sections())

        # List every setting in the 'user' section
        for k in config['user']:
            print("{}: {}".format(k, config['user'][k]))

        # Everything ConfigParser reads from an INI file is a str. Numeric or
        # boolean values must be converted before use, or read with
        # getint / getfloat / getboolean.
        # Convert to int
        #port = int(config['database']['port'])

        # Read and convert to int
        #port = config['database'].getint('port')

if __name__ == '__main__':
    # Build the generator and write the config file in one step.
    generateConfig().writeConfig()