In [1]:
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2.service_account import Credentials
import geopy.distance
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [3]:
BIGQUERY_CREDENTIALS_FILE_PATH = r"D:\data_engineer\dev_TIR_group2\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = BIGQUERY_CREDENTIALS_FILE_PATH
BQ_CLIENT = bigquery.Client()


In [2]:
def query_bq_to_df(client: bigquery.Client,sql_query:str) -> pd.DataFrame:
    try:
        query_job = client.query(sql_query)
        return query_job.to_dataframe()  # Convert result to DataFrame
    except Exception as e:
        raise Exception(f"Failed to query bigquery table, reason: {e}")

In [7]:
sql_query_realtime ="""  
    SELECT * FROM `ETL_FACT.FACT_bike_realtime`
"""
df = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_realtime)

In [15]:
sql_query_bike_station ="""  
    SELECT * FROM `ETL_DIM.DIM_bike_station`
"""
df_bike_station = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_bike_station)

In [16]:
sql_query_youbike_mrt_distance ="""  
    SELECT * FROM `ETL_FACT.FACT_youbike_mrt_distance`
"""
df_youbike_mrt_distance = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_youbike_mrt_distance)

In [35]:
sql_query_youbike_mrt_groupby ="""  
    SELECT * FROM
        (SELECT  
            bike_station_id,
            mrt_station_id,
            distance,
            ROW_NUMBER() OVER (PARTITION BY bike_station_id ORDER BY distance ) AS dis_rank 
        FROM `ETL_FACT.FACT_youbike_mrt_distance`) AS t
    WHERE t.dis_rank<=3
    ORDER BY t.bike_station_id,t.dis_rank;
"""
df_youbike_mrt_groupby = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_youbike_mrt_groupby)

In [36]:
df_youbike_mrt_groupby

Unnamed: 0,bike_station_id,mrt_station_id,distance,dis_rank
0,500101001,BR08,0.015941,1
1,500101001,R05,0.813848,2
2,500101001,BR09,0.813848,3
3,500101002,BR08,0.053192,1
4,500101002,R05,0.857233,2
...,...,...,...,...
4240,500119090,G08,0.785118,2
4241,500119090,BR08,0.942211,3
4242,500119091,BR08,0.890423,1
4243,500119091,BR07,1.048145,2


In [60]:
sql_query_mrt_history ="""  
SELECT t_enter.date,t_enter.hour,t_enter.mrt_station,t_enter.enter_num,t_exit.exit_num
FROM
    (SELECT
        date,
        hour,
        mrt_station_name_enter AS mrt_station,
        SUM(visitors_num) AS enter_num
    FROM 
        `MRT_history.ODS_MRT_history_usage`
    WHERE DATE(date) >= '2024-01-01'
    GROUP BY date,hour,mrt_station_name_enter) AS t_enter
FULL OUTER JOIN
    (SELECT
        date,
        hour,
        mrt_station_name_exit AS mrt_station,
        SUM(visitors_num) AS exit_num
    FROM 
        `MRT_history.ODS_MRT_history_usage`
    WHERE DATE(date) >= '2024-01-01'
    GROUP BY date,hour,mrt_station_name_exit) AS t_exit
ON t_enter.date = t_exit.date AND t_enter.hour = t_exit.hour AND t_enter.mrt_station = t_exit.mrt_station
ORDER BY t_enter.date,t_enter.hour,t_enter.mrt_station DESC;
"""
df_mrt_his_groupby = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_mrt_history)

In [61]:
df_mrt_his_groupby

Unnamed: 0,date,hour,mrt_station,enter_num,exit_num
0,2024-01-01,0,龍山寺,148,197
1,2024-01-01,0,麟光,10,38
2,2024-01-01,0,頭前庄,19,22
3,2024-01-01,0,頂溪,82,230
4,2024-01-01,0,頂埔,10,104
...,...,...,...,...,...
225847,2024-03-31,23,三重國小,73,307
225848,2024-03-31,23,三重,84,101
225849,2024-03-31,23,三民高中,65,258
225850,2024-03-31,23,三和國中,60,258


In [53]:
################################update APR data
import requests
from io import StringIO
import re
def E_mrt_usage_history_one_month(url: str):
    response = requests.get(url=url)
    StringIO_df = StringIO(response.content.decode("utf-8-sig"))
    df = pd.read_csv(StringIO_df)
    pattern = re.compile(r"[A-Za-z]+")
    df["進站"] = df["進站"].str.replace(pattern, "", regex=True)
    df["出站"] = df["出站"].str.replace(pattern, "", regex=True)
    df.rename(columns={
        "日期": "date",
        "時段": "hour",
        "進站": "mrt_station_name_enter",
        "出站": "mrt_station_name_exit",
        "人次": "visitors_num"
    }, inplace=True)
    print(f"T_mrt_usage_history finished")
    return (df)

url="http://tcgmetro.blob.core.windows.net/stationod/%E8%87%BA%E5%8C%97%E6%8D%B7%E9%81%8B%E6%AF%8F%E6%97%A5%E5%88%86%E6%99%82%E5%90%84%E7%AB%99OD%E6%B5%81%E9%87%8F%E7%B5%B1%E8%A8%88%E8%B3%87%E6%96%99_202404.csv"
df_mrt_his_202404 = E_mrt_usage_history_one_month(url=url)

T_mrt_usage_history finished


In [58]:
df_mrt_his_202404

Unnamed: 0,date,hour,mrt_station_name_enter,mrt_station_name_exit,visitors_num
0,2024-04-01,0,松山機場,松山機場,0
1,2024-04-01,0,松山機場,中山國中,0
2,2024-04-01,0,松山機場,南京復興,0
3,2024-04-01,0,松山機場,忠孝復興,0
4,2024-04-01,0,松山機場,大安,0
...,...,...,...,...,...
8096755,2024-04-30,23,新北產業園區,徐匯中學,0
8096756,2024-04-30,23,新北產業園區,三和國中,0
8096757,2024-04-30,23,新北產業園區,三重國小,0
8096758,2024-04-30,23,新北產業園區,迴龍,3


In [39]:
sql_query_youbike_mrt_groupby ="""  
SELECT
    bike_station_id,
    mrt_station_id,
    distance,
    ROW_NUMBER() OVER (PARTITION BY bike_station_id ORDER BY distance) AS dis_rank
FROM
    `ETL_FACT.FACT_youbike_mrt_distance`
QUALIFY
    dis_rank <= 3
ORDER BY 
    bike_station_id,dis_rank;
"""
df_youbike_mrt_groupby = query_bq_to_df(client=BQ_CLIENT,sql_query=sql_query_youbike_mrt_groupby)

In [44]:
df_youbike_mrt_groupby

Unnamed: 0,bike_station_id,mrt_station_id,distance,dis_rank
0,500101001,BR08,0.015941,1
1,500101001,R05,0.813848,2
2,500101001,BR09,0.813848,3
3,500101002,BR08,0.053192,1
4,500101002,R05,0.857233,2
...,...,...,...,...
4240,500119090,G08,0.785118,2
4241,500119090,BR08,0.942211,3
4242,500119091,BR08,0.890423,1
4243,500119091,BR07,1.048145,2


In [43]:
df_youbike_mrt_groupby

Unnamed: 0,bike_station_id,mrt_station_id,distance,dis_rank
0,500101001,BR08,0.015941,1
1,500101001,R05,0.813848,2
2,500101001,BR09,0.813848,3
3,500101002,BR08,0.053192,1
4,500101002,R05,0.857233,2
...,...,...,...,...
4240,500119090,G08,0.785118,2
4241,500119090,BR08,0.942211,3
4242,500119091,BR08,0.890423,1
4243,500119091,BR07,1.048145,2


In [17]:
df_youbike_mrt_distance

Unnamed: 0,bike_station_id,mrt_station_id,distance
0,500107008,O02,11.193488
1,500107008,BR15,0.093802
2,500107008,BR12,2.800712
3,500107008,BR14,1.001567
4,500107008,R10,5.664833
...,...,...,...
171210,500101119,R26,13.212745
171211,500101119,BL10,2.436063
171212,500101119,O53,7.738212
171213,500101119,O52,6.889340
