In [1]:
!pip install -q selenium==4.3.0

In [2]:
!pip install -q lxml==4.9.1

In [3]:
!pip install -q beautifulsoup4==4.11.1

In [4]:
# !pip install -q mysql-connector-python==8.0.30

In [5]:
# !pip install -q sqlalchemy==1.4.39

In [6]:
!pip install -q backoff==2.1.2
import backoff
backoff.__version__

'2.1.2'

In [7]:
!pip install -q pandas

In [8]:
!pip install python-dotenv



In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium
import time
import random
import re
import sys
from bs4 import BeautifulSoup
import pandas as pd
import backoff
from datetime import datetime
import logging
from logging import handlers

In [10]:
#- Configure the notebook-wide logger
logger = logging.getLogger('family_main')
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same object on every call, so re-running this
# cell would otherwise stack a second file/stream handler pair and every message
# would be emitted twice. Detach and close whatever an earlier run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

log_format = logging.Formatter('%(asctime)s [%(module)s] %(levelname)s [%(lineno)d] %(message)s', '%Y-%m-%d %H:%M:%S %Z')

# File handler: app.log, rotated daily, keeping 7 backups.
th = handlers.TimedRotatingFileHandler(filename='app.log', when='D', backupCount=7, encoding='utf-8')
th.setFormatter(log_format)
th.setLevel(logging.INFO)
logger.addHandler(th)

# Stream handler: same format, shown in the cell output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(log_format)
logger.addHandler(ch)

logger.info('family_main logger ready...')

2024-10-02 18:57:01 Asia [627094747] INFO [17] family_main logger ready...


In [11]:
def set_ua():
    """Return the fixed desktop Chrome User-Agent string used for the browser session."""
    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'

In [12]:
def little_sleep(min=1, max=12):
    """Sleep for a random whole number of seconds in the inclusive range [min, max].

    The chosen duration is logged before sleeping. The parameter names shadow
    the ``min``/``max`` builtins inside this function only.
    """
    # randrange's upper bound is exclusive, so max + 1 keeps max itself reachable.
    pause_seconds = random.randrange(min, max + 1)
    logger.info(f'sleep time: {pause_seconds}')
    time.sleep(pause_seconds)

In [13]:
@backoff.on_exception(backoff.expo,
                      Exception,
                      max_time=10)
def chrome_init(command_executor='http://selenium-hub:4444/wd/hub'):
    """Open a headless Chrome session on a remote Selenium hub.

    The call is retried with exponential backoff on any exception for up to
    10 seconds in total (see the decorator); once that budget is spent the
    last exception propagates to the caller.

    Args:
        command_executor: URL of the remote WebDriver endpoint. Defaults to
            the hub address this notebook has always used.

    Returns:
        The connected ``webdriver.Remote`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        Whatever ``webdriver.Remote`` raised, with its original type and
        traceback intact.
    """
    chrome_opt = webdriver.ChromeOptions()
    chrome_opt.add_argument('--headless')
    chrome_opt.add_argument('--no-sandbox')
    chrome_opt.add_argument('--ignore-ssl-errors=yes')
    chrome_opt.add_argument('--ignore-certificate-errors')
    chrome_opt.add_argument(f'user-agent={set_ua()}')
    # chrome_opt.add_argument("--incognito")  # Incognito mode. A Selenium-launched browser is already clean, but enable this if you want to be extra careful.
    try:
        return webdriver.Remote(
            command_executor=command_executor,
            options=chrome_opt
        )
    except Exception as e:
        logger.error(f'chrome init error: {e}')
        # Bare raise keeps the original exception class and traceback;
        # wrapping it in Exception(e) would hide both from the caller.
        raise

### 先到 https://foodsafety.family.com.tw 找一個商品

In [14]:
# Product id picked manually from the site in the step above.
product_id = '0612008'
# Product detail page for that id.
url = f'https://foodsafety.family.com.tw/Web_FFD_2022/product/{product_id}'
# url = 'https://www.google.com'
logger.info(f'url: {url}')



2024-10-02 18:57:01 Asia [2404257831] INFO [4] url: https://foodsafety.family.com.tw/Web_FFD_2022/product/0612008


In [15]:
# Open the remote Chrome session; it stays open until driver.quit() is called in a later cell.
driver = chrome_init()


In [16]:
# Load the product page and show its title.
driver.get(url)
print(f'title: {driver.title}')
html = driver.page_source
#- print html
print(html)
#- The rendered HTML still does not show up at this point.

title: 全家食在購安心
<html><head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>全家食在購安心</title>
    <meta name="author" content="Family Co.,Ltd.">
    <meta name="copyright" content="Copyright(c) ©Family Co.,Ltd.">
    <meta name="keywords" content="全家,Family,Mart,食安,食在購安心">
    <meta name="description" content="全家食在購安心 新鮮│健康│安心">
    <meta property="og:title" content="全家食在購安心">
    <link href="https://fonts.googleapis.com/css2?family=Noto+Sans+TC:wght@400;500;700&amp;display=swap" rel="stylesheet">
    <link href="/Web_FFD_2022/css/index.css?202309111" rel="stylesheet">
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-C8BC6KQ4PN&amp;l=dataLayer&amp;cx=c"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=G-MFRFWEK6Z7"></script>


In [17]:
#- find element by css selector
item_css_selector = '#app > div > div.container > div.resume-wrap > div:nth-child(1) > div.resume__info'
#- wait for element to be loaded
wait = WebDriverWait(driver, 10)
elements = wait.until(lambda driver: driver.find_element(By.CSS_SELECTOR , item_css_selector))
print(f"elements html: {elements.get_attribute('innerHTML')}")
#- for all p elements in the div
p_elements = elements.find_elements(By.TAG_NAME , 'p')
for p in p_elements:
    print(f"p text: {p.text}")

elements html: <p>碳水化合物 <span>32</span> 公克</p><p>蛋白質 <span>5</span> 公克</p><p>脂肪 <span>4.3</span> 公克</p>
p text: 碳水化合物
32
公克
p text: 蛋白質
5
公克
p text: 脂肪
4.3
公克


In [18]:
# Close the current browser window, then end the WebDriver session.
driver.close()
driver.quit()

In [19]:
#- 合併成一個程式
try:
    driver = chrome_init()
    driver.get(url)
    print(f'title: {driver.title}')
    html = driver.page_source
    print(html)
    #- find element by css selector
    item_css_selector = '#app > div > div.container > div.resume-wrap > div:nth-child(1) > div.resume__info'
    #- wait for element
    wait = WebDriverWait(driver, 10)
    elements = wait.until(lambda driver: driver.find_element(By.CSS_SELECTOR , item_css_selector))
    print(f"elements html: {elements.get_attribute('innerHTML')}")
    #- for all p elements in the div
    p_elements = elements.find_elements(By.TAG_NAME , 'p')
    for p in p_elements:
        print(f"p text: {p.text}")

except Exception as e:
    logger.error(e)
finally:
    if driver is not None:
        driver.close()
        driver.quit()
        logger.info(f'driver: {driver}')

title: 全家食在購安心
<html><head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>全家食在購安心</title>
    <meta name="author" content="Family Co.,Ltd.">
    <meta name="copyright" content="Copyright(c) ©Family Co.,Ltd.">
    <meta name="keywords" content="全家,Family,Mart,食安,食在購安心">
    <meta name="description" content="全家食在購安心 新鮮│健康│安心">
    <meta property="og:title" content="全家食在購安心">
    <link href="https://fonts.googleapis.com/css2?family=Noto+Sans+TC:wght@400;500;700&amp;display=swap" rel="stylesheet">
    <link href="/Web_FFD_2022/css/index.css?202309111" rel="stylesheet">
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-C8BC6KQ4PN&amp;l=dataLayer&amp;cx=c"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=G-MFRFWEK6Z7"></script>


2024-10-02 18:57:06 Asia [2415837859] INFO [25] driver: <selenium.webdriver.remote.webdriver.WebDriver (session="6519b520f848d8c4885bbf04e3311d93")>


elements html: <p>碳水化合物 <span>32</span> 公克</p><p>蛋白質 <span>5</span> 公克</p><p>脂肪 <span>4.3</span> 公克</p>
p text: 碳水化合物
32
公克
p text: 蛋白質
5
公克
p text: 脂肪
4.3
公克
