In [1]:
import pandas as pd
import numpy as np
import json
import requests
import time

In [2]:
def daterange(start , end):
    d0 = pd.to_datetime(start)
    d1 = pd.to_datetime(end)
    return [d.strftime("%Y%m%d") for d in pd.date_range(d0, d1, freq = "D")]

In [3]:
COLS = ["日期","代號","名稱","成交股數","成交筆數","成交金額","開盤價","最高價","最低價","收盤價"]
NUM_COLS = ["成交股數","成交筆數","成交金額","開盤價","最高價","最低價","收盤價"]

def data_clean(data_list):
    out = []
    for date_raw, rows in data_list:
        date = f"{date_raw[:4]}-{date_raw[4:6]}-{date_raw[6:8]}"
        rows9 = [r[:9] for r in rows if isinstance(r, list) and len(r) >= 9 and isinstance(r[0], str)]
        start = next((i for i, r in enumerate(rows9) if r[0].strip() == "1101"), None)
        out.extend([[date] + r for r in rows9[start:]])

    df = pd.DataFrame(out, columns=COLS)
    
    for c in NUM_COLS:
        df[c] = pd.to_numeric(df[c].astype(str).str.replace(",", "", regex=False), errors="coerce")
        
    return df.sort_values(["日期","代號"], ignore_index=True)

In [5]:
start = ['20200101','20230101']
end = ['20221231','20241231']

df = pd.DataFrame()
for i in range(len(start)):
    data_list = []
    for date in daterange(start[i] , end[i]):
        url = f"https://www.twse.com.tw/rwd/zh/afterTrading/MI_INDEX?response=json&type=ALLBUT0999&date={date}"
        data = requests.get(url).text
        data = json.loads(data)
        if data == {'stat': '很抱歉，沒有符合條件的資料!'}:
            time.sleep(1 + np.random.uniform(0, 1))
            continue
        else:
            need = [row[:9] for row in (data["tables"][8]['data'])]
            data_list.append([date,need])
            time.sleep(2 + np.random.uniform(0, 1))
    df = pd.concat([df,data_clean(data_list)])

In [6]:
df

Unnamed: 0,日期,代號,名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價
0,2020-01-02,1101,台泥,18470566,6251,813465904,43.80,44.15,43.80,44.10
1,2020-01-02,1101B,台泥乙特,12000,10,643200,53.50,53.80,53.50,53.80
2,2020-01-02,1102,亞泥,8890485,4391,433140140,48.10,49.00,48.05,48.90
3,2020-01-02,1103,嘉泥,2194046,883,49255964,22.40,22.70,22.35,22.35
4,2020-01-02,1104,環泥,867516,384,17026458,19.60,19.70,19.55,19.65
...,...,...,...,...,...,...,...,...,...,...
496953,2024-12-31,9944,新麗,115881,134,2329626,20.05,20.30,19.95,20.30
496954,2024-12-31,9945,潤泰新,4357585,3703,186985362,43.30,43.35,42.65,42.90
496955,2024-12-31,9946,三發地產,156969,161,3551688,22.95,22.95,22.50,22.60
496956,2024-12-31,9955,佳龍,123625,178,3521198,28.70,28.70,28.30,28.50


In [11]:
mask_stock = df["代號"].astype(str).str.strip().str.fullmatch(r"\d{4}")
df = df[mask_stock]

In [12]:
df.to_csv("WebData.csv", index = False)