# 資料寫入 BigQuery

## 本益比資料

In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import warnings

# Silence all Python warnings for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Inclusive date range to scrape (a single day when both ends are equal).
start_date = datetime(2024, 1, 8)
end_date = datetime(2024, 1, 8)
current_date = start_date

# Seconds to wait for the TWSE server before giving up on a request;
# without a timeout, requests.get can block the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Column names, adjusted to be valid BigQuery field names.
columns = ["date", "stock_code", "stock_name", "dividend_yield", "dividend_year", "p_e_ratio", "p_b_ratio", "financial_report"]

# Rows are accumulated in a plain list and converted to a DataFrame once,
# after the loop. DataFrame.append was removed in pandas 2.0, and appending
# row by row copies the whole frame on every iteration.
records = []

while current_date <= end_date:
    formatted_date = current_date.strftime('%Y%m%d')
    url = f'https://www.twse.com.tw/exchangeReport/BWIBBU_d?response=html&date={formatted_date}&selectType=ALL'

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Network-level failure (timeout, DNS, connection reset, ...):
        # report it and move on to the next date instead of aborting the run.
        print(f'Failed to fetch data for {formatted_date}: {exc}')
    else:
        if response.status_code == 200:
            # Parse the HTML and locate the first table on the page.
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table')
            # No table means there is nothing to extract for this date.
            if table:
                # The first two <tr> elements are header rows.
                for row in table.find_all('tr')[2:]:
                    cells = [td.text.strip() for td in row.find_all('td')]
                    # Every data row must supply one value per column after "date".
                    if len(cells) != len(columns) - 1:
                        raise ValueError(
                            f'Unexpected number of cells for {formatted_date}: '
                            f'expected {len(columns) - 1}, got {len(cells)}'
                        )
                    records.append([current_date] + cells)

            # Progress message so it is visible that the loop is advancing.
            print(f'Processing data for {formatted_date}')
        else:
            print(f'Failed to fetch data for {formatted_date}')

    current_date += timedelta(days=1)

# Build the result frame in one step; with no records this is an empty
# frame that still carries the expected columns.
df = pd.DataFrame(records, columns=columns)

Processing data for 20240108


In [3]:
# Distinct values of the date column, to check which dates ended up in df.
df['date'].unique()

array(['2024-01-08T00:00:00.000000000'], dtype='datetime64[ns]')

In [4]:
# (row count, column count) of the scraped frame.
df.shape

(997, 8)

In [5]:
# Preview the first rows of df.
df.head()

Unnamed: 0,date,stock_code,stock_name,dividend_yield,dividend_year,p_e_ratio,p_b_ratio,financial_report
0,2024-01-08,1101,台泥,1.46,111,29.03,1.13,112/3
1,2024-01-08,1102,亞泥,5.57,111,14.39,0.87,112/3
2,2024-01-08,1103,嘉泥,2.66,111,72.31,0.58,112/3
3,2024-01-08,1104,環泥,6.14,111,8.75,0.9,112/3
4,2024-01-08,1108,幸福,5.03,111,8.64,1.29,112/3


# 寫入資料庫

In [6]:
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import to_gbq, read_gbq
from google.colab import drive
from google.oauth2 import service_account

# Mount Google Drive so the service-account key file is reachable from the runtime.
drive.mount('/content/gdrive')

# BigQuery project ID, and the Drive path of the Google Cloud service-account key file.
project_id = 'tw-stock-410406'
credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# Build the credentials object from the key file.
credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)

# Query BigQuery for the number of rows currently in the stock_PE table.
read_df = read_gbq(
    f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.stock_PE',
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)

# Show the count as the cell output.
read_df.loc[0, 'row_count']

Mounted at /content/gdrive
Downloading: 100%|[32m██████████[0m|


1388350

In [7]:
def read_stock_pe_row_count():
    """Query BigQuery for the size of the stock_PE table.

    Returns the DataFrame produced by read_gbq for a COUNT(*) query; the
    count is in its ``row_count`` column.
    """
    return read_gbq(
        f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.stock_PE',
        project_id=project_id,
        credentials=credentials_obj,
        location='US',
    )


# Row count of the table before the upload.
current_data = read_stock_pe_row_count()
current_data_count = current_data['row_count'][0]

# Append the scraped DataFrame to the BigQuery table.
# NOTE(review): if_exists='append' means re-running this cell uploads the same
# rows again; the count check below cannot detect such duplicates.
to_gbq(
    df,
    destination_table=f'{project_id}.financial_data.stock_PE',
    project_id=project_id,
    if_exists='append',
    credentials=credentials_obj,
    location='US',
)

# Row count of the table after the upload.
new_data = read_stock_pe_row_count()
new_data_count = new_data['row_count'][0]

# The upload is considered successful when the table grew by exactly len(df) rows.
if current_data_count + len(df) == new_data_count:
    print("資料寫入成功")
else:
    print("資料寫入失敗，筆數不符合預期")

# The same three figures are reported in both cases: count before the write,
# number of rows written, and count after the write.
print('寫入前資料數：', current_data_count)
print('寫入資料數：', len(df))
print('寫入後資料數：', new_data_count)

Downloading: 100%|[32m██████████[0m|


100%|██████████| 1/1 [00:00<00:00, 6141.00it/s]


Downloading: 100%|[32m██████████[0m|
資料寫入成功
寫入前資料數： 1388350
寫後資料數： 997
寫入後資料數： 1389347


# 初次寫入

In [None]:
# One-time initial load, intentionally left commented out. It differs from the
# regular write cell only in passing if_exists='replace' to to_gbq instead of
# if_exists='append'.
# NOTE(review): presumably 'replace' overwrites the existing stock_PE table —
# confirm before re-enabling this cell.
# import pandas as pd
# from google.cloud import bigquery
# from pandas_gbq import to_gbq, read_gbq
# from google.colab import drive
# from google.oauth2 import service_account

# drive.mount('/content/gdrive')

# # Path to the Google Cloud service-account credentials file
# credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# # BigQuery project ID
# project_id = 'tw-stock-410406'

# # Load credentials using google.oauth2.service_account
# credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)
# # Write the DataFrame to BigQuery
# to_gbq(df, destination_table=f'{project_id}.financial_data.stock_PE', project_id=project_id, if_exists='replace', credentials=credentials_obj, location='US')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


100%|██████████| 1/1 [00:00<00:00, 8542.37it/s]
