In [1]:
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2.service_account import Credentials
import geopy.distance
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [7]:
BIGQUERY_CREDENTIALS_FILE_PATH = r"D:\data_engineer\dev_TIR_group2\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
# BIGQUERY_CREDENTIALS_FILE_PATH = r"C:\dev_TIR101\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = BIGQUERY_CREDENTIALS_FILE_PATH
BQ_CLIENT = bigquery.Client()

In [8]:
start_date = '2024-04-16'
end_date = '2024-04-30'

In [20]:
def query_bq_to_df(client: bigquery.Client,sql_query:str) -> pd.DataFrame:
    try:
        query_job = client.query(sql_query)
        return query_job.to_dataframe()  # Convert result to DataFrame
    except Exception as e:
        raise Exception(f"Failed to query bigquery table, reason: {e}")

In [18]:
def MART_bike_realtime_select_date_create(start_date :str,
                                            end_date :str,
                                            create_dataset_name: str,
                                            create_table_name: str,
                                            source_dataset_name:str,
                                            source_table_name: str, 
                                            client: bigquery.Client):
    """create MART table for bike_realtime station information"""
    query_job = client.query(
        f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT 
        `bike_station_id`,
        EXTRACT(DATE FROM `source_time`) AS date,
        EXTRACT(HOUR FROM `source_time`) AS hour,
        AVG(`aval_bike`) AS aval_bike_mean,
        AVG(`aval_bike`) AS aval_space_mean
    FROM `{source_dataset_name}.{source_table_name}`
    WHERE `source_time` BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}') 
    GROUP BY bike_station_id , date , hour
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [36]:
def MART_bike_station_info_create(create_dataset_name: str,
                            create_table_name: str,
                            source_dataset_name_left:str,
                              source_table_name_left: str,
                              source_dataset_name_right:str,
                              source_table_name_right: str,  
                                 client: bigquery.Client):
    """create MART table for bike_realtime station information"""
    query_job = client.query(
        f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT 
      t1.* , 
      t2.station_name,
      t2.total_space,
      t2.lat,
      t2.lng
    FROM
      (SELECT 
        *
      FROM `{source_dataset_name_left}.{source_table_name_left}`) AS t1
    LEFT JOIN
      (SELECT
         *
      FROM `{source_dataset_name_right}.{source_table_name_right}`) AS t2
      ON t1.bike_station_id = t2.bike_station_id ;
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [19]:
MART_bike_realtime_select_date_create(start_date ='2024-04-16',
                             end_date = '2024-04-30',
                             create_dataset_name="ANDY_ETL_MART",
                              create_table_name="MART_youbike_bike_groupby_date_hour",
                              source_dataset_name = "ANDY_ETL_FACT",
                              source_table_name="FACT_bike_realtime", 
                                 client=BQ_CLIENT)

ANDY_ETL_MART.MART_youbike_bike_groupby_date_hour has been created


In [37]:
MART_bike_station_info_create(create_dataset_name="ANDY_ETL_MART",
                             create_table_name="MART_youbike_bike_realtime_and_info",
                             source_dataset_name_left="ANDY_ETL_MART",
                             source_table_name_left="MART_youbike_bike_groupby_date_hour",
                             source_dataset_name_right="ANDY_ETL_DIM",
                             source_table_name_right="DIM_bike_station",
                             client=BQ_CLIENT)

ANDY_ETL_MART.MART_youbike_bike_realtime_and_info has been created


In [21]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_youbike_bike_groupby_date_hour`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df

Unnamed: 0,bike_station_id,date,hour,aval_bike_mean,aval_space_mean
0,500101233,2024-04-16,22,0.0,0.0
1,500104098,2024-04-16,22,0.0,0.0
2,500112013,2024-04-16,22,0.0,0.0
3,500106020,2024-04-16,22,0.0,0.0
4,500119081,2024-04-16,22,0.0,0.0
...,...,...,...,...,...
438546,500113035,2024-04-29,20,43.0,43.0
438547,500119048,2024-04-29,20,45.0,45.0
438548,500104031,2024-04-29,20,47.0,47.0
438549,500104004,2024-04-29,20,49.0,49.0


In [35]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_youbike_bike_realtime_and_info`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df

Unnamed: 0,bike_station_id,date,hour,aval_bike_mean,aval_space_mean,station_name,total_space,lat,lng
0,500107008,2024-04-16,22,0.333333,0.333333,YouBike2.0_捷運劍南路站(2號出口),86,25.08401,121.55535
1,500107008,2024-04-16,23,0.666667,0.666667,YouBike2.0_捷運劍南路站(2號出口),86,25.08401,121.55535
2,500108032,2024-04-16,23,6.000000,6.000000,YouBike2.0_瑞陽公園,12,25.07777,121.57961
3,500108032,2024-04-16,22,3.666667,3.666667,YouBike2.0_瑞陽公園,12,25.07777,121.57961
4,500109056,2024-04-16,22,5.833333,5.833333,YouBike2.0_中和街455巷11弄,13,25.14319,121.49621
...,...,...,...,...,...,...,...,...,...
438546,500101119,2024-04-29,2,10.333333,10.333333,YouBike2.0_金杭公園,20,25.03325,121.52436
438547,500101119,2024-04-29,19,7.500000,7.500000,YouBike2.0_金杭公園,20,25.03325,121.52436
438548,500101119,2024-04-29,13,8.000000,8.000000,YouBike2.0_金杭公園,20,25.03325,121.52436
438549,500101119,2024-04-29,15,11.833333,11.833333,YouBike2.0_金杭公園,20,25.03325,121.52436
