https://mops.twse.com.tw/nas/t21/otc/t21sc03_112_11_0.html

In [1]:
import pandas as pd
import requests
from io import StringIO
import time

def monthly_report(year, month):
    # 假如是西元，轉成民國
    if year > 1990:
        year -= 1911

    # 構建網址
    url = f'https://mops.twse.com.tw/nas/t21/otc/t21sc03_{year}_{month}_0.html' if year > 98 else f'https://mops.twse.com.tw/nas/t21/otc/t21sc03_{year}_{month}.html'

    # 偽瀏覽器
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    try:
        # 下載該年月的網站，並用pandas轉換成 dataframe
        r = requests.get(url, headers=headers)
        r.encoding = 'big5'
        dfs = pd.read_html(StringIO(r.text), encoding='big-5')

        # 合併符合條件的 dataframe
        df = pd.concat([df for df in dfs if 5 < df.shape[1] <= 11], ignore_index=True)

        # 設定欄位名稱
        df.columns = df.columns.get_level_values(1)
        df['year'] = year + 1911
        df['month'] = month

        # 偽停頓
        time.sleep(5)

        return df
    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # 回傳空的 DataFrame 以示錯誤

In [2]:
# 使用列表生成式簡化迴圈
data_frames = [monthly_report(year, month) for year in range(111, 113) for month in range(1, 13)]

# 合併所有資料
result_df = pd.concat(data_frames, ignore_index=True)

# 設定欄位名稱
result_df.columns = ['stock_id', 'company_name', 'current_month_revenue', 'last_month_revenue',
                     'last_year_current_month_revenue', 'mth_to_mth_change_percent', 'yr_to_yr_change_percent',
                     'current_month_accumulated_revenue', 'last_year_accumulated_revenue', 'ytd_change_percent',
                     'remarks', 'year', 'month']

# 避免多次重新指定結果 DataFrame
result_df = result_df[['stock_id', 'company_name', 'current_month_revenue', 'last_month_revenue',
                       'last_year_current_month_revenue', 'mth_to_mth_change_percent', 'yr_to_yr_change_percent',
                       'current_month_accumulated_revenue', 'last_year_accumulated_revenue', 'ytd_change_percent',
                       'year', 'month']]

# 在最後進行篩選
result_df = result_df[~result_df['stock_id'].isin(['合計', '全部國內上櫃公司合計'])]

# 印出結果
result_df.dtypes

stock_id                              object
company_name                          object
current_month_revenue                  int64
last_month_revenue                     int64
last_year_current_month_revenue        int64
mth_to_mth_change_percent            float64
yr_to_yr_change_percent              float64
current_month_accumulated_revenue      int64
last_year_accumulated_revenue          int64
ytd_change_percent                   float64
year                                   int64
month                                  int64
dtype: object

In [10]:
result_df['stock_id'].unique()

array(['德麥', '金穎生技', '中華食', '環泰', '福格創新', '綠茵', '信立', '勝昱', '世坤', '萬國通',
       '東隆興', '福大', '新昕纖', '飛寶企業', '金洲', '元勝', '光明', '銘旺實', '興采', '雙邦',
       '力肯', '新麥', '和勤', '宏佳騰', '世德', '嘉鋼', '精湛', '泰茂', '謚源', '精確', '至寶電',
       '彬台', '台興', '元創精密', '健信', '金雨', '崇友', '高鋒', '福裕', '永彰', '方方土霖',
       '江興鍛', '協易機', '慶騰', '至興', '大詠城', '萬在', '桓達', '長佳', '寶緯', '健椿',
       '百德', '科際精密', '捷流閥業', '君帆', '擎邦', '易發', '富強鑫', '瀧澤科', '千附精密', '進典',
       '鈦昇', '瑞穎', '大億金茂', '朋程', '皇田', '風青', '台蠟', '奇鈦科', '大恭', '磐亞',
       '永純', '永捷', '大立', '美琪瑪', '泓瀚', '國碳科', '誠泰科技', '晶呈科技', '聚和', '精華',
       '生泰', '合世', '訊聯', '杏昌', '易威', '寶利徠', '進階', '基亞', '佰研', '大學光', '永日',
       '東洋', '邦特', '加捷生醫', '濟生', '健喬', '明基醫', '友華', '優盛', '晟德', '太醫',
       '天良', '中天', '聯合', '健亞', '浩泰', '曜亞', '中裕', '鈺緯', '訊聯基因', '聿新科',
       '智擎', '鐿鈦', '松瑞藥', '醣聯', '久裕', '浩鼎', '杏一', '福永生技', '安克', '杏國',
       '欣大健康', '永昕', '雙美', '豪展', '合一', '皇將', '強生製藥', '德英', '寶島科', '上亞科技',
       '立康', '藥華藥', '益得', '大樹', '生華科', '科懋', '益安', '達爾膚', '明達醫

In [4]:
result_df.tail(3)

Unnamed: 0,stock_id,company_name,current_month_revenue,last_month_revenue,last_year_current_month_revenue,mth_to_mth_change_percent,yr_to_yr_change_percent,current_month_accumulated_revenue,last_year_accumulated_revenue,ytd_change_percent,year,month
18882,6811,宏�硌穈T,886976,324550,860189,173.29,3.11,7545564,7201012,4.78,2023,12
18884,6804,明係,276383,240809,304364,14.77,-9.19,3363294,3322572,1.22,2023,12
18886,全部國內上櫃公司合計,全部國內上櫃公司合計,37971672,40797741,37362846,-6.92,1.62,434399438,476056365,-8.75,2023,12


In [8]:
result_df = result_df[result_df['stock_id'] != '全部國內上櫃公司合計']

In [11]:
result_df.shape

(18197, 12)

In [12]:
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import to_gbq, read_gbq
from google.colab import drive
from google.oauth2 import service_account

drive.mount('/content/gdrive')

# 設定你的Google Cloud帳戶認證檔案路徑
credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# 設定你的BigQuery專案ID
project_id = 'tw-stock-410406'

# Load credentials using google.oauth2.service_account
credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)

# 從BigQuery中讀取資料
read_df = read_gbq(f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue', project_id=project_id, credentials=credentials_obj, location='US')
read_df['row_count'][0]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Downloading: 100%|[32m██████████[0m|


21178

In [13]:
# 讀取目前表格的資料
current_data = read_gbq(f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue', project_id=project_id, credentials=credentials_obj, location='US')
current_data_count = current_data['row_count'][0]

# 將DataFrame寫入BigQuery
to_gbq(result_df, destination_table=f'{project_id}.financial_data.monthly_revenue', project_id=project_id, if_exists='append', credentials=credentials_obj, location='US')

# 驗證資料筆數
new_data = read_gbq(f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue', project_id=project_id, credentials=credentials_obj, location='US')
new_data_count = new_data['row_count'][0]

# 判斷是否寫入
if current_data_count + len(result_df) == new_data_count:
    print("資料寫入成功")
    print('寫入前資料數：', current_data_count)
    print('寫後資料數：', len(result_df))
    print('寫入後資料數：', new_data_count)
else:
    print("資料寫入失敗，筆數不符合預期")
    print('寫入前資料數：', current_data_count)
    print('寫後資料數：', len(result_df))
    print('寫入後資料數：', new_data_count)

Downloading: 100%|[32m██████████[0m|


100%|██████████| 1/1 [00:00<00:00, 1620.67it/s]


Downloading: 100%|[32m██████████[0m|
資料寫入成功
寫入前資料數： 21178
寫後資料數： 18197
寫入後資料數： 39375


In [None]:
# import pandas as pd
# from google.cloud import bigquery
# from pandas_gbq import to_gbq, read_gbq
# from google.colab import drive
# from google.oauth2 import service_account

# drive.mount('/content/gdrive')

# # 設定你的Google Cloud帳戶認證檔案路徑
# credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# # 設定你的BigQuery專案ID
# project_id = 'tw-stock-410406'

# # Load credentials using google.oauth2.service_account
# credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)
# # 將DataFrame寫入BigQuery
# to_gbq(result_df, destination_table=f'{project_id}.financial_data.monthly_revenue', project_id=project_id, if_exists='replace', credentials=credentials_obj, location='US')

# # 從BigQuery中讀取資料
# read_df = read_gbq(f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue', project_id=project_id, credentials=credentials_obj, location='US')


Mounted at /content/gdrive


100%|██████████| 1/1 [00:00<00:00, 7869.24it/s]


In [None]:
# 從BigQuery中讀取資料
read_df = read_gbq(f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.monthly_revenue', project_id=project_id, credentials=credentials_obj, location='US')
print(read_df)

Downloading: 100%|[32m██████████[0m|
   row_count
0      21178
