In [None]:
!pip install pandas yfinance pyarrow google-cloud-bigquery google-cloud-storage
import os
import pandas as pd
import yfinance as yf
from google.cloud import bigquery
from google.cloud import storage
from modules.gcp_class import Gcs_client, Bigquery_client
import datetime as dt

# Column order written to each Parquet file and loaded into BigQuery.
_OUTPUT_COLUMNS = ['Date', 'Stock_Code', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']


def _prepare_price_frame(raw_df: pd.DataFrame, stock_code: str) -> pd.DataFrame:
    """Return *raw_df* reshaped into the flat schema listed in ``_OUTPUT_COLUMNS``.

    Args:
        raw_df: Non-empty frame returned by ``yf.download`` for one ticker,
            indexed by date.
        stock_code: Code stored in the ``Stock_Code`` column of every row.

    Returns:
        A new frame with exactly ``_OUTPUT_COLUMNS``; ``Date`` is formatted as
        a ``YYYY-MM-DD`` string.
    """
    df = raw_df.copy()
    # Newer yfinance releases return (field, ticker) MultiIndex columns even
    # for a single ticker; keep only the field level so the names are plain
    # strings such as 'Close'.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = list(df.columns.get_level_values(0))

    df = df.reset_index().rename(columns={'Adj Close': 'Adj_Close'})
    df['Stock_Code'] = stock_code

    # Explicit copy so the Date assignment below does not write to a slice.
    df = df[_OUTPUT_COLUMNS].copy()
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    return df


def main(start_index: int = 3970) -> None:
    """Download daily prices per stock code, stage them in GCS, load into BigQuery.

    For every selected row of the mapping CSV the ticker ``<code>.T`` is
    downloaded with yfinance from 2024-01-01 up to today's date, written to
    ``./output/<code>.parquet``, uploaded to the bucket unless an object with
    that name is already listed there, and then appended to the BigQuery table
    from the ``gs://`` URI.

    Args:
        start_index: Only rows of the mapping CSV whose index label is
            ``>= start_index`` are processed. Defaults to 3970, the value that
            was previously hard-coded.
    """
    # Point the Google client libraries at the service-account key file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./dbt-analytics-engineer-435907-75a25995915e.json"

    # BigQuery and GCS settings.
    PROJECT_ID = 'dbt-analytics-engineer-435907'
    DATASET_NAME = 'stock_dataset'
    BUCKET_NAME = 'stock-data-bucket_hopop'
    TABLE_NAME = 'stock_data'

    # Path of the CSV that maps stock codes to names.
    STOCK_MAPPING_CSV = '../stock_code_name_mapping.csv'

    # Download period.
    START_DATE = dt.date(2024, 1, 1)
    END_DATE = dt.date.today()

    # Initialise the project's GCS and BigQuery wrappers.
    gcs_client = Gcs_client()
    bq_client = Bigquery_client()

    stock_names_df = pd.read_csv(STOCK_MAPPING_CSV, usecols=['code', 'name'])

    # Names already in the bucket, used to avoid re-uploading.
    # presumably list_all_objects yields object names - verify against gcp_class.
    existing_objects = set(gcs_client.list_all_objects(BUCKET_NAME))

    # Loop-invariant setup: output directory, destination table, load config.
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    table_id = f"{PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}"
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,  # append to existing rows
    )

    selected_codes = stock_names_df.loc[stock_names_df.index >= start_index, 'code']
    for raw_code in selected_codes:
        stock_code = str(raw_code).strip()
        ticker = f"{stock_code}.T"  # Tokyo Stock Exchange tickers carry a ".T" suffix

        # auto_adjust=False keeps the separate 'Adj Close' column, which newer
        # yfinance versions drop by default.
        raw_prices = yf.download(ticker, start=START_DATE, end=END_DATE, auto_adjust=False)
        if raw_prices.empty:
            print(f"No data found for {ticker}. Skipping...")
            continue

        prices = _prepare_price_frame(raw_prices, stock_code)

        # Write the Parquet file locally.
        local_file_name = f"{stock_code}.parquet"
        local_file_path = f"{output_dir}/{local_file_name}"
        prices.to_parquet(local_file_path, engine='pyarrow', index=False)

        # Upload to GCS unless an object with this name is already there.
        if local_file_name in existing_objects:
            print(f"File {local_file_name} already exists in GCS. Skipping upload.")
        else:
            gcs_client.upload_gcs(BUCKET_NAME, local_file_path, local_file_name)

        # NOTE(review): the load below runs even when the upload was skipped,
        # so the existing object is appended again; if it was already loaded
        # in an earlier run this duplicates rows - confirm the intended
        # behaviour.
        source_uri = f"gs://{BUCKET_NAME}/{local_file_name}"
        load_job = bq_client.client.load_table_from_uri(
            source_uri,
            table_id,
            job_config=job_config,
        )
        load_job.result()  # block until the load job finishes
        print(f"Loaded data into BigQuery table {table_id} from {source_uri}.")

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
# Ad-hoc check: show the mapping row(s) whose code equals 8129.
STOCK_MAPPING_CSV = '../stock_code_name_mapping.csv'
stock_names_df = pd.read_csv(
    STOCK_MAPPING_CSV,
    usecols=['code', 'name'],
)

stock_names_df.loc[stock_names_df['code'].eq(8129)]