In [1]:
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2.service_account import Credentials
import geopy.distance
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [2]:
# BIGQUERY_CREDENTIALS_FILE_PATH = r"D:\data_engineer\dev_TIR_group2\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
BIGQUERY_CREDENTIALS_FILE_PATH = r"C:\dev_TIR101\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = BIGQUERY_CREDENTIALS_FILE_PATH
BQ_CLIENT = bigquery.Client()

In [3]:
start_date = '2024-04-16'
end_date = '2024-04-30'

In [4]:
def query_bq_to_df(client: bigquery.Client,sql_query:str) -> pd.DataFrame:
    try:
        query_job = client.query(sql_query)
        return query_job.to_dataframe()  # Convert result to DataFrame
    except Exception as e:
        raise Exception(f"Failed to query bigquery table, reason: {e}")

In [5]:
def MART_bike_realtime_select_date_create(start_date :str,
                                            end_date :str,
                                            create_dataset_name: str,
                                            create_table_name: str,
                                            source_dataset_name:str,
                                            source_table_name: str, 
                                            client: bigquery.Client):
    """create MART table for bike_realtime station information"""
    query_job = client.query(
        f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT 
        `bike_station_id`,
        EXTRACT(DATE FROM `source_time`) AS date,
        EXTRACT(HOUR FROM `source_time`) AS hour,
        AVG(`aval_bike`) AS aval_bike_mean,
        AVG(`aval_bike`) AS aval_space_mean
    FROM `{source_dataset_name}.{source_table_name}`
    WHERE `source_time` BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}') 
    GROUP BY bike_station_id , date , hour
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [19]:
MART_bike_realtime_select_date_create(start_date ='2024-04-16',
                             end_date = '2024-04-30',
                             create_dataset_name="ANDY_ETL_MART",
                              create_table_name="MART_youbike_bike_groupby_date_hour",
                              source_dataset_name = "ANDY_ETL_FACT",
                              source_table_name="FACT_bike_realtime", 
                                 client=BQ_CLIENT)

ANDY_ETL_MART.MART_youbike_bike_groupby_date_hour has been created


In [37]:
def MART_bike_station_info_create(
      create_dataset_name: str,
      create_table_name: str,
      source_dataset_name_left:str,
      source_table_name_left: str,
      source_dataset_name_right:str,
      source_table_name_right: str,
      time_table_dataset:str,
      time_table_table:str,
      client: bigquery.Client):
    """create MART table for bike_realtime with station information"""
    query_job = client.query(
        f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT 
      t1.* , 
      t2.station_name,
      t2.total_space,
      t2.lat,
      t2.lng,
      t3.day_of_week,
      t3.day_of_week_name
    FROM
      (SELECT 
        *
      FROM `{source_dataset_name_left}.{source_table_name_left}`) AS t1
    LEFT JOIN
      (SELECT
         *
      FROM `{source_dataset_name_right}.{source_table_name_right}`) AS t2
    ON t1.bike_station_id = t2.bike_station_id 
    LEFT JOIN
      (SELECT
         *
      FROM `{time_table_dataset}.{time_table_table}`) AS t3
    ON t1.date = t3.date 
      ;
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [38]:
MART_bike_station_info_create(create_dataset_name="ANDY_ETL_MART",
                             create_table_name="MART_youbike_bike_realtime_and_info",
                             source_dataset_name_left="ANDY_ETL_MART",
                             source_table_name_left="MART_youbike_bike_groupby_date_hour",
                             source_dataset_name_right="ANDY_ETL_DIM",
                             source_table_name_right="DIM_bike_station",
                             time_table_dataset="ANDY_ETL_DIM",
                             time_table_table="DIM_time_table",
                             client=BQ_CLIENT)

ANDY_ETL_MART.MART_youbike_bike_realtime_and_info has been created


In [13]:
def MART_youbike_min_distance_create(create_dataset_name:str,
                             create_table_name:str,
                             source_dataset_name:str,
                             source_table_name:str,
                             mrt_info_dataset_name:str,
                             mrt_info_table_name:str,
                             client:bigquery.Client):
    query_job = client.query(
    f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT t1.* ,t2.mrt_station_name
    FROM
        (SELECT
            `bike_station_id`,
            `mrt_station_id`,
            `distance`
        FROM
            (SELECT 
                `bike_station_id`,
                `mrt_station_id`,
                `distance`,
                ROW_NUMBER() OVER (PARTITION BY `bike_station_id` ORDER BY `distance`) AS `row_num`
            FROM `{source_dataset_name}.{source_table_name}`) 
        WHERE row_num=1) AS t1
    LEFT JOIN
        (SELECT 
            `mrt_station_id`,
            `station_name` AS `mrt_station_name`
        FROM `{mrt_info_dataset_name}.{mrt_info_table_name}`) AS t2
    ON t1.mrt_station_id = t2.mrt_station_id
        
    ;
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [17]:
MART_youbike_min_distance_create(create_dataset_name="ANDY_ETL_MART",
                                 create_table_name="MART_youbike_mrt_distance",
                                 source_dataset_name="ETL_DIM",
                                 source_table_name="DIM_youbike_mrt_distance",
                                mrt_info_dataset_name="MRT_GCS_to_BQ_SRC_ODS_DIM",
                                mrt_info_table_name="DIM_MRT_static_data",
                                 client=BQ_CLIENT)

ANDY_ETL_MART.MART_youbike_mrt_distance has been created


In [30]:
def MART_MRT_history_groupby_create(
                            start_date :str,
                            end_date :str,
                            create_dataset_name:str,
                             create_table_name:str,
                             source_dataset_name:str,
                             source_table_name:str,
                             client:bigquery.Client):
    query_job = client.query(
    f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT t_enter.date,t_enter.hour,t_enter.mrt_station_name,t_enter.enter_num,t_exit.exit_num
    FROM
        (SELECT
            date,
            hour,
            mrt_station_name_enter AS mrt_station_name,
            SUM(visitors_num) AS enter_num
        FROM 
            `{source_dataset_name}.{source_table_name}`
        WHERE DATE(date) >= '{start_date}' AND  DATE(date) <='{end_date}'
        GROUP BY date,hour,mrt_station_name_enter) AS t_enter
    FULL OUTER JOIN
        (SELECT
            date,
            hour,
            mrt_station_name_exit AS mrt_station_name,
            SUM(visitors_num) AS exit_num
        FROM 
            `{source_dataset_name}.{source_table_name}`
        WHERE DATE(date) >= '{start_date}' AND  DATE(date) <='{end_date}'
        GROUP BY date,hour,mrt_station_name_exit) AS t_exit
    ON t_enter.date = t_exit.date AND t_enter.hour = t_exit.hour AND t_enter.mrt_station_name = t_exit.mrt_station_name
    ORDER BY t_enter.date,t_enter.hour,t_enter.mrt_station_name DESC;
    """)
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [31]:
MART_MRT_history_groupby_create(
    start_date="2024-04-01",
    end_date="2024-04-30",
    create_dataset_name="ANDY_ETL_MART",
    create_table_name="MART_mrt_history_2024_04_grby",
    source_dataset_name="MRT_history",
    source_table_name="ODS_MRT_history_usage",
    client=BQ_CLIENT)

ANDY_ETL_MART.MART_mrt_history_2024_04_grby has been created


In [35]:
def MART_bike_station_info_create(
        client: bigquery.Client,
        create_dataset_name: str,
        create_table_name: str,
        MART_dataset_name:str,
        MART_youbike_bike_realtime_and_info="MART_youbike_bike_realtime_and_info",
        MART_youbike_mrt_distance="MART_youbike_mrt_distance",
        MART_mrt_history_2024_04_grby ="MART_mrt_history_2024_04_grby"):
    """create MART table for bike_realtime with station information"""
    query_job = client.query(
        f"""
    CREATE OR REPLACE TABLE `{create_dataset_name}.{create_table_name}` AS
    SELECT 
      t1.* , 
      t2.mrt_station_id,
      t2.mrt_station_name,
      t2.distance,
      t3.enter_num,
      t3.exit_num
    FROM
      (SELECT 
        *
      FROM `{MART_dataset_name}.{MART_youbike_bike_realtime_and_info}`) AS t1
    LEFT JOIN
      (SELECT
         *
      FROM `{MART_dataset_name}.{MART_youbike_mrt_distance}`) AS t2
      ON t1.bike_station_id = t2.bike_station_id 
    LEFT JOIN
      (SELECT
         *
      FROM `{MART_dataset_name}.{MART_mrt_history_2024_04_grby}`) AS t3
      ON t2.mrt_station_name = t3.mrt_station_name
      AND t1.date = t3.date
      AND t1.hour = t1.hour
      ;
    """
    )
    query_job.result()
    print(f"{create_dataset_name}.{create_table_name} has been created")

In [36]:
MART_bike_station_info_create(
        client=BQ_CLIENT,
        create_dataset_name="ANDY_ETL_MART",
        create_table_name="MART_youbike_correlation_pipeline",
        MART_dataset_name="ANDY_ETL_MART",
        MART_youbike_bike_realtime_and_info="MART_youbike_bike_realtime_and_info",
        MART_youbike_mrt_distance="MART_youbike_mrt_distance",
        MART_mrt_history_2024_04_grby ="MART_mrt_history_2024_04_grby")

ANDY_ETL_MART.MART_youbike_correlation_pipeline has been created


In [21]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_youbike_bike_groupby_date_hour`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df

Unnamed: 0,bike_station_id,date,hour,aval_bike_mean,aval_space_mean
0,500101233,2024-04-16,22,0.0,0.0
1,500104098,2024-04-16,22,0.0,0.0
2,500112013,2024-04-16,22,0.0,0.0
3,500106020,2024-04-16,22,0.0,0.0
4,500119081,2024-04-16,22,0.0,0.0
...,...,...,...,...,...
438546,500113035,2024-04-29,20,43.0,43.0
438547,500119048,2024-04-29,20,45.0,45.0
438548,500104031,2024-04-29,20,47.0,47.0
438549,500104004,2024-04-29,20,49.0,49.0


In [39]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_youbike_bike_realtime_and_info`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df.head()

Unnamed: 0,bike_station_id,date,hour,aval_bike_mean,aval_space_mean,station_name,total_space,lat,lng,day_of_week,day_of_week_name
0,500105029,2024-04-16,23,25.333333,25.333333,YouBike2.0_一壽橋,28,24.97837,121.55548,3,Tuesday
1,500105029,2024-04-16,22,26.833333,26.833333,YouBike2.0_一壽橋,28,24.97837,121.55548,3,Tuesday
2,500112022,2024-04-16,23,13.333333,13.333333,YouBike2.0_三張犁,56,25.03452,121.5576,3,Tuesday
3,500112022,2024-04-16,22,14.666667,14.666667,YouBike2.0_三張犁,56,25.03452,121.5576,3,Tuesday
4,500106059,2024-04-16,22,0.833333,0.833333,YouBike2.0_中山堂,43,25.04412,121.51025,3,Tuesday


In [18]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_youbike_mrt_distance`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df.head()

Unnamed: 0,bike_station_id,mrt_station_id,distance,mrt_station_name
0,500105030,G03,1.459308,七張
1,500105034,G03,1.537141,七張
2,500105029,G03,1.300606,七張
3,500105088,G03,1.201386,七張
4,500103053,R11,0.106156,中山


In [24]:
sql_query = """SELECT * FROM `ANDY_ETL_MART.MART_mrt_history_2024_04_grby`"""

df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query)
df.head()

Unnamed: 0,date,hour,mrt_station,enter_num,exit_num
0,2024-04-01,0,龍山寺,144,153
1,2024-04-01,0,麟光,15,21
2,2024-04-01,0,頭前庄,10,35
3,2024-04-01,0,頂溪,85,219
4,2024-04-01,0,頂埔,13,100
