In [3]:
import requests, shutil, time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import json
import time

# Endpoints of the TWSE broker buy/sell daily-report site.
BASE_URL = "https://bsr.twse.com.tw/bshtm/"
MENU_URL = BASE_URL + "bsMenu.aspx"
CONTENT_URL = BASE_URL + "bsContent.aspx"

CAPTCHA_PATH = "tmp.png"   # downloaded captcha image
LETTER_DIR = "tmp"         # scratch directory for the segmented characters
REFERENCE_DIR = "DELTA"    # labelled reference images, one "<answer>.png" per glyph
RETRY_DELAY_SECONDS = 10   # pause between failed attempts


class _AttemptFailed(Exception):
    """Raised when a single download attempt fails and should be retried."""


def mse(imageA, imageB):
    """Return the mean squared error between two images of equal shape.

    The squared differences are summed over every element and divided by
    ``height * width`` of ``imageA``. Lower values mean more similar images.
    """
    err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
    # The original computed this quotient but threw the result away (and
    # multiplied instead of dividing by the width), returning the raw sum.
    return err / float(imageA.shape[0] * imageA.shape[1])


def getNumber(pic):
    """Find the reference image in ``DELTA`` that is most similar to ``pic``.

    Returns:
        ``(file_name, score)`` of the reference with the lowest ``mse``;
        ``(None, inf)`` when the directory holds no comparable image.
    """
    min_a = float("inf")
    min_png = None
    # Sorted so that ties are resolved the same way on every run.
    for png in sorted(os.listdir(REFERENCE_DIR)):
        ref = cv2.imread(os.path.join(REFERENCE_DIR, png))
        # imread returns None for unreadable files; differently sized
        # references cannot be compared element-wise.
        if ref is None or ref.shape != pic.shape:
            continue
        score = mse(ref, pic)
        if score < min_a:
            min_a = score
            min_png = png
    return min_png, min_a


def _collect_form_params(soup):
    """Build the POST payload from every ``<input>`` inside the page's forms."""
    params = {}
    for node in soup.select("form input"):
        name = node.attrs.get("name")
        # Nameless inputs are skipped; the block-trade radio button and the
        # reset button are deliberately left out.
        if name is None or name in ("RadioButton_Excd", "Button_Reset"):
            continue
        params[name] = node.attrs.get("value", "")
    return params


def _split_captcha(image_path, letter_dir):
    """Cut the captcha into character images; return their paths left to right."""
    image = cv2.imread(image_path)
    if image is None:
        return []

    kernel = np.ones((4, 4), np.uint8)
    erosion = cv2.erode(image, kernel, iterations=1)
    blurred = cv2.GaussianBlur(erosion, (5, 5), 0)
    edged = cv2.Canny(blurred, 50, 150)
    dilation = cv2.dilate(edged, kernel, iterations=1)

    # [-2] picks the contour list under both the 2-tuple (OpenCV 2/4) and the
    # 3-tuple (OpenCV 3) return conventions.
    contours = cv2.findContours(
        dilation.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    boxes = sorted(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda box: box[0],
    )
    # Keep only boxes wider and taller than 20 px.
    boxes = [(x, y, w, h) for (x, y, w, h) in boxes if w > 20 and h > 20]

    os.makedirs(letter_dir, exist_ok=True)
    paths = []
    for index, (x, y, w, h) in enumerate(boxes):
        roi = dilation[y : y + h, x : x + w]
        path = os.path.join(letter_dir, "%d.png" % index)
        cv2.imwrite(path, cv2.resize(roi, (50, 50)))
        paths.append(path)
    return paths


def _read_captcha_answer(letter_paths):
    """Recognise each character image in order and return the joined answer."""
    letters = []
    # Iterate the paths written for this captcha rather than os.listdir(),
    # whose order is arbitrary and which would also pick up stale files.
    for path in letter_paths:
        letter = cv2.imread(path)
        os.remove(path)
        match, _ = getNumber(letter)
        if match is None:
            raise RuntimeError(
                "no usable reference images found in %r" % REFERENCE_DIR
            )
        letters.append(match.split(".png")[0])
    return "".join(letters)


def _parse_report(text):
    """Split the downloaded CSV text into ``(columns, rows)``.

    The third line supplies the header (its first five fields). Every later
    line is read as two records side by side: fields 0-4 and fields 6 onward.
    Records without any non-blank field are dropped.

    Raises:
        ValueError: if ``text`` has fewer than three lines.
    """
    lines = [line.rstrip("\r") for line in text.split("\n")]
    if len(lines) < 3:
        raise ValueError("report has no header line")
    columns = lines[2].split(",")[:5]
    data = []
    for line in lines[3:]:
        fields = line.split(",")
        for record in (fields[:5], fields[6:]):
            if any(field.strip() for field in record):
                data.append(record)
    return columns, data


def _download_report(stock_no):
    """Run one attempt: solve the captcha, submit the form, fetch the CSV.

    Returns:
        A ``pandas.DataFrame`` with the parsed report.

    Raises:
        _AttemptFailed: on any recoverable failure; the message is the text
            to show to the user.
    """
    # Fetch the target page and its captcha image.
    resp = requests.get(MENU_URL)
    if resp.status_code != 200:
        raise _AttemptFailed("任務失敗: %d" % resp.status_code)
    soup = BeautifulSoup(resp.text, "html.parser")
    params = _collect_form_params(soup)

    img_tags = soup.select("#Panel_bshtm img")
    src = img_tags[0].get("src") if img_tags else None
    if not src:
        raise _AttemptFailed("error")
    resp = requests.get(BASE_URL + src)
    if resp.status_code != 200:
        # Do not fall through and process a stale tmp.png from an earlier run.
        raise _AttemptFailed("error")
    with open(CAPTCHA_PATH, "wb") as f:
        f.write(resp.content)

    # Segment the captcha with OpenCV and recognise each character.
    letter_paths = _split_captcha(CAPTCHA_PATH, LETTER_DIR)
    if not letter_paths:
        raise _AttemptFailed("error")
    params["CaptchaControl1"] = _read_captcha_answer(letter_paths)
    params["TextBox_Stkno"] = stock_no

    # Submit the form, then download the CSV within the same session.
    with requests.Session() as session:
        resp = session.post(MENU_URL, data=params)
        if resp.status_code != 200:
            raise _AttemptFailed("任務失敗: %d" % resp.status_code)
        soup = BeautifulSoup(resp.text, "html.parser")
        if not soup.select("#HyperLink_DownloadCSV"):
            raise _AttemptFailed("任務失敗，沒有下載連結")
        resp = session.get(CONTENT_URL)

    # An empty body counts as a failure; previously it looped forever
    # without consuming a retry.
    if resp.status_code != 200 or not resp.text:
        raise _AttemptFailed("任務失敗，無法下載分點進出 CSV")
    try:
        columns, data = _parse_report(resp.text)
    except ValueError:
        raise _AttemptFailed("任務失敗，無法下載分點進出 CSV") from None
    return pd.DataFrame(data, columns=columns)


retry = 5
success = False

if __name__ == "__main__":
    # Stop once an attempt succeeds or all five retries are used up.
    while not success and retry > 0:
        try:
            df = _download_report("2330")
        except _AttemptFailed as exc:
            # On failure, wait and reload a fresh captcha instead of exiting.
            print(exc)
            retry -= 1
            if retry > 0:
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            success = True
            df.to_csv("買賣日報表.csv", encoding="big5", errors="ignore", index=False)
            print("成功下載資料")

    # Report giving up only after the loop has actually run out of retries.
    if not success:
        print('下載失敗以達5次，停止程式')


任務失敗，沒有下載連結
任務失敗，沒有下載連結
成功下載資料
