# 資料寫入 BigQuery

## 三大法人買賣超資料

In [None]:
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
from datetime import datetime, timedelta

# Date range to fetch (both ends inclusive).
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 8)

# Source column name -> output column name, in output order.
# Keeping selection and renaming in one mapping prevents the two lists
# from drifting out of sync.
COLUMN_MAP = {
    'Date': 'date',
    '證券代號': 'stock_id',
    '證券名稱': 'security_name',
    '外陸資買進股數(不含外資自營商)': 'foreign_investment_buy_no_self',
    '外陸資賣出股數(不含外資自營商)': 'foreign_investment_sell_no_self',
    '外陸資買賣超股數(不含外資自營商)': 'foreign_investment_net_no_self',
    '外資自營商買進股數': 'foreign_dealer_buy',
    '外資自營商賣出股數': 'foreign_dealer_sell',
    '外資自營商買賣超股數': 'foreign_dealer_net',
    '投信買進股數': 'investment_trust_buy',
    '投信賣出股數': 'investment_trust_sell',
    '投信買賣超股數': 'investment_trust_net',
    '自營商買賣超股數': 'proprietary_trader_net',
    '自營商買進股數(自行買賣)': 'proprietary_trader_buy_self',
    '自營商賣出股數(自行買賣)': 'proprietary_trader_sell_self',
    '自營商買賣超股數(自行買賣)': 'proprietary_trader_net_self',
    '自營商買進股數(避險)': 'proprietary_trader_buy_hedge',
    '自營商賣出股數(避險)': 'proprietary_trader_sell_hedge',
    '自營商買賣超股數(避險)': 'proprietary_trader_net_hedge',
    '三大法人買賣超股數': 'three_major_institution_net',
}

# Every calendar day in the range; days without a table are skipped below.
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

# Collect one frame per day and concatenate once at the end; concatenating
# inside the loop re-copies all previously fetched rows on every iteration.
daily_frames = []
for date in date_range:
    formatted_date = date.strftime('%Y%m%d')
    url = f'https://www.twse.com.tw/rwd/zh/fund/T86?date={formatted_date}&selectType=ALL&response=html'

    # A timeout keeps an unresponsive server from hanging the cell forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f'Skipping {formatted_date}: HTTP {response.status_code}')
        continue

    # Locate the first <table> in the returned HTML page.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        # No table in the response for this date; nothing to parse.
        continue

    # read_html expects a file-like object (literal HTML strings are deprecated).
    # Drop columns that are entirely empty, then rows with any missing value.
    df_daily = (
        pd.read_html(StringIO(str(table)), header=1)[0]
        .dropna(how='all', axis=1)
        .dropna(how='any')
    )
    df_daily['Date'] = date  # tag every row with the date it was fetched for
    daily_frames.append(df_daily)
    print(f'Processing data for {formatted_date}')

if not daily_frames:
    # Fail with a clear message instead of a KeyError on the column selection.
    raise RuntimeError(
        f'No T86 table found between {start_date:%Y%m%d} and {end_date:%Y%m%d}'
    )

# Combine all days, keep only the mapped columns (in mapping order) and
# rename them to the English output names.
df = pd.concat(daily_frames, ignore_index=True)
df = df[list(COLUMN_MAP)].rename(columns=COLUMN_MAP)


Processing data for 20240102
Processing data for 20240103
Processing data for 20240104
Processing data for 20240105
Processing data for 20240108


In [None]:
# Distinct values in the date column of the combined frame.
df['date'].unique()

array(['2024-01-02T00:00:00.000000000', '2024-01-03T00:00:00.000000000',
       '2024-01-04T00:00:00.000000000', '2024-01-05T00:00:00.000000000',
       '2024-01-08T00:00:00.000000000'], dtype='datetime64[ns]')

In [None]:
# (rows, columns) of the combined frame.
df.shape

(71883, 20)

# 篩選特定公司

In [None]:
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import to_gbq, read_gbq
from google.colab import drive
from google.oauth2 import service_account

# Mount Google Drive so the service-account key file below is readable.
drive.mount('/content/gdrive')

# Path to the Google Cloud service-account key file.
credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# BigQuery project ID used by every query in this notebook.
project_id = 'tw-stock-410406'

# Load credentials using google.oauth2.service_account
credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the full company_info table; its stock_id values drive the filter below.
company_info_df = read_gbq(
    f'SELECT * FROM {project_id}.financial_data.company_info',
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Keep only the rows whose stock_id also appears in company_info.
company_list = company_info_df['stock_id'].unique()
is_listed_company = df['stock_id'].isin(company_list)
filtered_data = df[is_listed_company]

In [None]:
# Show the last three rows of the filtered frame.
filtered_data.tail(3)

Unnamed: 0,date,stock_id,security_name,foreign_investment_buy_no_self,foreign_investment_sell_no_self,foreign_investment_net_no_self,foreign_dealer_buy,foreign_dealer_sell,foreign_dealer_net,investment_trust_buy,investment_trust_sell,investment_trust_net,proprietary_trader_net,proprietary_trader_buy_self,proprietary_trader_sell_self,proprietary_trader_net_self,proprietary_trader_buy_hedge,proprietary_trader_sell_hedge,proprietary_trader_net_hedge,three_major_institution_net
71873,2024-01-08,6770,力積電,12814075,26527840,-13713765,0,0,0,31000,3133,27867,-354806,751000,771037,-20037,44000,378769.0,-334769.0,-14040704.0
71874,2024-01-08,2317,鴻海,6824092,21598200,-14774108,0,0,0,105000,23629,81371,38179,20000,84000,-64000,403000,300821.0,102179.0,-14654558.0
71876,2024-01-08,2353,宏碁,20775883,53790120,-33014237,0,0,0,2129000,118031,2010969,2501779,2828360,1795017,1033343,1837632,369196.0,1468436.0,-28501489.0


In [None]:
# (rows, columns) of the filtered frame.
filtered_data.shape

(4550, 20)

# 寫入資料庫

In [None]:
# Ask BigQuery how many rows the destination table currently holds.
read_df = read_gbq(
    f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.institutional_investors_2024_TSE',
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)
read_df['row_count'][0]

Downloading: 100%|[32m██████████[0m|


879

In [None]:
# Append filtered_data to the BigQuery table and verify the write by comparing
# row counts before and after.
# NOTE: if_exists='append' always adds rows, so re-running this cell with the
# same filtered_data writes the same rows again.
destination_table = f'{project_id}.financial_data.institutional_investors_2024_TSE'
count_query = f'SELECT COUNT(*) as row_count FROM {destination_table}'

# Row count before the write.
current_data = read_gbq(count_query, project_id=project_id, credentials=credentials_obj, location='US')
current_data_count = current_data['row_count'][0]

# Write the DataFrame to BigQuery.
to_gbq(filtered_data, destination_table=destination_table, project_id=project_id, if_exists='append', credentials=credentials_obj, location='US')

# Row count after the write.
new_data = read_gbq(count_query, project_id=project_id, credentials=credentials_obj, location='US')
new_data_count = new_data['row_count'][0]

# The write succeeded if the table grew by exactly the number of rows sent.
written_count = len(filtered_data)
if current_data_count + written_count == new_data_count:
    print("資料寫入成功")
else:
    print("資料寫入失敗，筆數不符合預期")

# Same summary for both outcomes. The middle line reports the rows written,
# so it is labelled as such rather than as a post-write total.
print('寫入前資料數：', current_data_count)
print('寫入資料數：', written_count)
print('寫入後資料數：', new_data_count)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Downloading: 100%|[32m██████████[0m|


100%|██████████| 1/1 [00:00<00:00, 5769.33it/s]


Downloading: 100%|[32m██████████[0m|
資料寫入成功
寫入前資料數： 8334
寫後資料數： 2778
寫入後資料數： 11112


# 初次寫入（僅在首次建立資料表時執行；會覆蓋既有資料表）

In [None]:
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import to_gbq, read_gbq
from google.colab import drive
from google.oauth2 import service_account

# Mount Google Drive so the service-account key file below is readable.
drive.mount('/content/gdrive')

# Path to the Google Cloud service-account key file.
credentials_path = '/content/gdrive/My Drive/tw-stock.json'

# BigQuery project ID used by every query in this notebook.
project_id = 'tw-stock-410406'

# Load credentials using google.oauth2.service_account
credentials_obj = service_account.Credentials.from_service_account_file(credentials_path)
# Write the DataFrame to BigQuery. if_exists='replace' overwrites an existing
# table, so this is meant for the first load only (appends use the cell above).
to_gbq(filtered_data, destination_table=f'{project_id}.financial_data.institutional_investors_2024_TSE', project_id=project_id, if_exists='replace', credentials=credentials_obj, location='US')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


100%|██████████| 1/1 [00:00<00:00, 1007.52it/s]


In [None]:
# Ask BigQuery how many rows the destination table currently holds.
read_df = read_gbq(
    f'SELECT COUNT(*) as row_count FROM {project_id}.financial_data.institutional_investors_2024_TSE',
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)
read_df['row_count'][0]

Downloading: 100%|[32m██████████[0m|


4550

In [None]:
# Read the whole destination table back and show its last three rows.
read_df = read_gbq(
    f'SELECT *  FROM {project_id}.financial_data.institutional_investors_2024_TSE',
    project_id=project_id,
    credentials=credentials_obj,
    location='US',
)
read_df.tail(3)

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,date,stock_id,security_name,foreign_investment_buy_no_self,foreign_investment_sell_no_self,foreign_investment_net_no_self,foreign_dealer_buy,foreign_dealer_sell,foreign_dealer_net,investment_trust_buy,investment_trust_sell,investment_trust_net,proprietary_trader_net,proprietary_trader_buy_self,proprietary_trader_sell_self,proprietary_trader_net_self,proprietary_trader_buy_hedge,proprietary_trader_sell_hedge,proprietary_trader_net_hedge,three_major_institution_net
4547,2024-01-02 00:00:00+00:00,3042,晶技,198000,198000,0,0,0,0,199000,0,199000,-6000,4000,9000,-5000,7000,8000.0,-1000.0,193000.0
4548,2024-01-02 00:00:00+00:00,2707,晶華,81000,140000,-59000,0,0,0,59231,0,59231,-7000,0,6000,-6000,0,1000.0,-1000.0,-6769.0
4549,2024-01-05 00:00:00+00:00,4164,承業醫,84000,54000,30000,0,0,0,0,0,0,-7000,0,6000,-6000,0,1000.0,-1000.0,23000.0
