Data source example (MOPS monthly revenue page for ROC year 112, month 11): https://mops.twse.com.tw/nas/t21/sii/t21sc03_112_11_0.html

In [None]:
import pandas as pd
import requests
from io import StringIO
import time

def monthly_report(year, month, delay=5, timeout=30):
    """Download one month of revenue tables from the MOPS site as a DataFrame.

    Args:
        year: ROC (Minguo) year, or a Western year; any value above 1990 is
            treated as Western and converted by subtracting 1911.
        month: Month number, inserted into the URL as-is (no zero padding).
        delay: Seconds to pause after the request, successful or not, so that
            consecutive calls do not hammer the server. Falsy or non-positive
            values disable the pause.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame holding the concatenated tables with 'year' (Western) and
        'month' columns appended. On any failure the error is printed and an
        empty DataFrame is returned, so callers can keep looping over months.
    """
    # Accept Western years too: convert them to the ROC year used in the URL.
    if year > 1990:
        year -= 1911

    # ROC years above 98 use the "_0" page variant; earlier years do not.
    if year > 98:
        url = f'https://mops.twse.com.tw/nas/t21/sii/t21sc03_{year}_{month}_0.html'
    else:
        url = f'https://mops.twse.com.tw/nas/t21/sii/t21sc03_{year}_{month}.html'

    # Present a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    try:
        # Fetch the page; the timeout keeps a stalled connection from hanging
        # the whole scrape, and raise_for_status surfaces HTTP errors directly.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'big5'
        dfs = pd.read_html(StringIO(r.text), encoding='big-5')

        # Keep only tables with 6 to 11 columns.
        # NOTE(review): presumably these are the per-industry revenue tables — confirm.
        df = pd.concat([table for table in dfs if 5 < table.shape[1] <= 11], ignore_index=True)

        # Use the second level of the two-level header as the column names.
        df.columns = df.columns.get_level_values(1)
        df['year'] = year + 1911
        df['month'] = month

        return df
    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # an empty DataFrame signals failure to the caller
    finally:
        # Throttle after every attempt, including failed ones.
        if delay and delay > 0:
            time.sleep(delay)

In [None]:
# Final column names, in positional order: the scraped table's columns followed
# by the 'year' and 'month' columns that monthly_report appends.
column_names = ['stock_id', 'company_name', 'current_month_revenue', 'last_month_revenue',
                'last_year_current_month_revenue', 'mth_to_mth_change_percent', 'yr_to_yr_change_percent',
                'current_month_accumulated_revenue', 'last_year_accumulated_revenue', 'ytd_change_percent',
                'remarks', 'year', 'month']

# Download every month of ROC years 111-112; a failed month comes back as an empty DataFrame.
data_frames = [monthly_report(year, month) for year in range(111, 113) for month in range(1, 13)]

# Drop the empty placeholders of failed months before concatenating, and stop
# with a clear message if nothing at all was downloaded.
downloaded_frames = [frame for frame in data_frames if not frame.empty]
if not downloaded_frames:
    raise RuntimeError('No monthly revenue data could be downloaded; see the errors printed above.')

# Combine all months into one frame.
result_df = pd.concat(downloaded_frames, ignore_index=True)

# Rename the columns positionally, then discard the free-text remarks column.
result_df.columns = column_names
result_df = result_df.drop(columns='remarks')

# Remove the subtotal / grand-total summary rows.
result_df = result_df[~result_df['stock_id'].isin(['合計', '全部國內上市公司合計'])]

# Show the resulting dtypes.
result_df.dtypes

stock_id                              object
company_name                          object
current_month_revenue                  int64
last_month_revenue                     int64
last_year_current_month_revenue        int64
mth_to_mth_change_percent            float64
yr_to_yr_change_percent              float64
current_month_accumulated_revenue      int64
last_year_accumulated_revenue          int64
ytd_change_percent                   float64
year                                   int64
month                                  int64
dtype: object

In [None]:
# result_df['stock_id'].unique()

In [None]:
# Preview the last three rows of the cleaned frame.
# NOTE(review): the saved output under this cell shows five rows, so it looks
# stale relative to tail(3) — re-run the cell to refresh it.
result_df.tail(3)

Unnamed: 0,stock_id,company_name,current_month_revenue,last_month_revenue,last_year_current_month_revenue,mth_to_mth_change_percent,yr_to_yr_change_percent,current_month_accumulated_revenue,last_year_accumulated_revenue,ytd_change_percent,year,month
21976,2884,玉山金,5922452,6440468,5167780,-8.04,14.6,66680180,54800105,21.67,2023,12
21979,1437,勤益控,58003,63172,51382,-8.18,12.88,863375,801842,7.67,2023,12
21980,2348,海悅,3040452,552178,696323,450.62,336.64,7754466,5092277,52.27,2023,12
21981,2496,卓越,69414,68481,64168,1.36,8.17,764815,702257,8.9,2023,12
21982,9938,百和,1016891,1076575,838949,-5.54,21.21,12439710,16270386,-23.54,2023,12


In [None]:
# (rows, columns) of the cleaned frame; this row count is the figure the
# BigQuery read-back query should report after the upload.
result_df.shape

(21178, 12)

In [None]:
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import to_gbq, read_gbq
from google.colab import drive
from google.oauth2 import service_account

# Mount Google Drive so the service-account key file stored there is readable.
drive.mount('/content/gdrive')

# Path to the Google Cloud service-account key file on the mounted Drive.
credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# BigQuery project ID.
project_id = 'tw-stock-410406'

# Fully qualified destination table, named once so the write and the
# read-back below cannot drift apart.
table_id = f'{project_id}.financial_data.monthly_revenue'

# Load credentials using google.oauth2.service_account.
credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)

# Write the DataFrame to BigQuery. if_exists='replace' overwrites any existing
# table of the same name, so every run replaces the previous upload.
to_gbq(result_df, destination_table=table_id, project_id=project_id, if_exists='replace', credentials=credentials_obj, location='US')

# Read the row count back and check it against the local frame, so a partial
# or failed upload is caught here rather than going unnoticed.
read_df = read_gbq(f'SELECT COUNT(*) as row_count FROM {table_id}', project_id=project_id, credentials=credentials_obj, location='US')
uploaded_rows = int(read_df['row_count'].iloc[0])
assert uploaded_rows == len(result_df), f'BigQuery holds {uploaded_rows} rows but result_df has {len(result_df)}'


Mounted at /content/gdrive


100%|██████████| 1/1 [00:00<00:00, 7869.24it/s]


In [None]:
# Query BigQuery for the number of rows now stored in the uploaded table and print it.
count_query = f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue'
read_df = read_gbq(
    count_query,
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)
print(read_df)

Downloading: 100%|[32m██████████[0m|
   row_count
0      21178
