In [1]:
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2.service_account import Credentials
import geopy.distance
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [4]:
# BIGQUERY_CREDENTIALS_FILE_PATH = r"D:\data_engineer\dev_TIR_group2\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
BIGQUERY_CREDENTIALS_FILE_PATH = r"C:\dev_TIR101\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = BIGQUERY_CREDENTIALS_FILE_PATH
BQ_CLIENT = bigquery.Client()

In [5]:
mrt_sql_query="""  
    SELECT mrt_station_id,lat,lng
    FROM `MRT_GCS_to_BQ_SRC_ODS_DIM.DIM_MRT_static_data`
"""
bus_sql_query="""  
    SELECT  bus_station_id,lat,lng
    FROM `BUS_GCS_to_BQ_SRC_ODS_DIM.DIM_Bus_static_data`
"""
bike_sql_query="""  
    SELECT  bike_station_id,lat,lng 
    FROM `ETL_DIM.DIM_bike_station`
"""

def query_bq_to_df(client: bigquery.Client,sql_query:str) -> pd.DataFrame:
    try:
        query_job = client.query(sql_query)
        return query_job.to_dataframe()  # Convert result to DataFrame
    except Exception as e:
        raise Exception(f"Failed to query bigquery table, reason: {e}")
mrt_df = query_bq_to_df(client=BQ_CLIENT,sql_query=mrt_sql_query)
bus_df = query_bq_to_df(client=BQ_CLIENT,sql_query=bus_sql_query)
youbike_df = query_bq_to_df(client=BQ_CLIENT,sql_query=bike_sql_query)


In [6]:
def calculate_dis(df,location2):
    location1 = (df["lat"],df["lng"])
    return(geopy.distance.geodesic(location1, location2).kilometers)

youbike_df.apply(lambda df :calculate_dis(df,location2=(25.136900,121.459550)),axis=1)

0       11.879406
1       11.560791
2       12.168026
3       11.792979
4       11.522385
          ...    
1410    15.126657
1411    15.483675
1412    15.424026
1413    15.177171
1414    15.289883
Length: 1415, dtype: float64

In [8]:
youbike_df['district'] = youbike_df['district'].replace('臺大公館校區','大安區')
youbike_df['city_code']='TPE'

In [7]:
for i in tqdm(range(len(mrt_df))):
    mrt_station_id = mrt_df.loc[i,"mrt_station_id"]
    location_mrt = (mrt_df.loc[i,"lat"] , mrt_df.loc[i,"lng"])
    youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)

  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= youbike_df.apply(lambda df :calculate_dis(df,location_mrt),axis=1)
  youbike_df[mrt_station_id]= 

In [25]:
fact_youbike = youbike_df.melt(id_vars=["bike_station_id",	"lat","lng"],var_name="mrt_station_id",value_name="distance" )
fact_youbike = fact_youbike.loc[:,["bike_station_id","mrt_station_id","distance"]]

In [26]:
fact_youbike

Unnamed: 0,bike_station_id,mrt_station_id,distance
0,500107102,O02,6.986596
1,500107099,O02,7.208812
2,500107065,O02,6.809924
3,500107086,O02,11.338723
4,500107045,O02,6.757624
...,...,...,...
171210,500119045,O54,10.703496
171211,500119070,O54,11.054859
171212,500119048,O54,10.971054
171213,500119077,O54,10.791606


In [30]:
location1 = (25.0330, 121.5654)  # 台北 101 的經緯度
location2 = (25.0478, 121.5319)  # 台北車站的經緯度

# 計算地理距離
distance = geopy.distance.geodesic(location1, location2).kilometers
distance

3.7572683270119502

In [10]:
from bike_mrt_distance_pipeline import *

In [9]:
# BIGQUERY_CREDENTIALS_FILE_PATH = r"D:\data_engineer\dev_TIR_group2\Taipei-transit-data_hub\airflow\dags\harry_GCS_BigQuery_write_cred.json"
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = BIGQUERY_CREDENTIALS_FILE_PATH
# BQ_CLIENT = bigquery.Client()
from bike_mrt_distance_pipeline import *
mrt_df = get_mrt_df(client=BQ_CLIENT)
youbike_df = get_youbike_df(client=BQ_CLIENT)
youbike_mrt_distance = create_bike_mrt_distance(
    mrt_df=mrt_df, youbike_df=youbike_df)
youbike_mrt_distance.to_csv(
    "youbike_mrt_distance.csv", index=False, encoding="utf-8-sig")

# bus_df = get_bus_df(client=BQ_CLIENT)
# youbike_bus_distance = create_bike_bus_distance(
#     youbike_df=youbike_df, bus_df=bus_df)
# youbike_bus_distance.to_csv(
#     "youbike_bus_distance.csv", index=False, encoding="utf-8-sig")

  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_station_id] = youbike_df.apply(
  youbike_df[mrt_sta

In [13]:
def upload_df_to_bq(
    client: bigquery.Client,
    df: pd.DataFrame,
    dataset_name: str,
    table_name: str,
    schema,
    filetype: str = "parquet",
) -> bool:
    """
    Upload a pandas dataframe to bigquery.

    Args:
        client (bigquery.Client): The client to use to upload to bigquery.
        df (pd.DataFrame): The dataframe to upload.
        dataset_name (str): The name of the dataset to upload to.
        table_name (str): The name of the table to upload to.
        schema (List[bigquery.SchemaField], optional): The schema of the table to upload to. Default is None.
                                                        If None, use the default schema (automatic-detect).
        filetype (str): The type of the file to download. Default is "parquet". Can be "parquet" or "csv" or "jsonl".

    Returns:
        bool: True if the upload was successful, False otherwise.
    """
    dataset_id = client.dataset(dataset_name)
    table_id = dataset_id.table(table_name)

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    if filetype == "parquet":
        job_config.source_format = bigquery.SourceFormat.PARQUET
    elif filetype == "csv":
        job_config.source_format = bigquery.SourceFormat.CSV
    elif filetype == "jsonl":
        job_config.source_format = bigquery.SourceFormat.JSONL
    else:
        raise ValueError(
            f"Invalid filetype: {filetype}. Please specify 'parquet' or 'csv' or 'jsonl'."
        )
    if schema:
        job_config.schema = schema

    try:
        job = client.load_table_from_dataframe(
            df, table_id, job_config=job_config)
        job.result()  # Wait for the job to complete
        table = client.get_table(table_id)
        print(f"Table {table.table_id} created with {table.num_rows} rows.")
        return True
    except Exception as e:
        raise Exception(f"Failed to upload df to bigquery, reason: {e}")

In [19]:
youbike_mrt_distance.dtypes

bike_station_id      Int64
mrt_station_id      object
distance           float64
dtype: object

In [11]:
youbike_mrt_distance_schema = [
    bigquery.SchemaField("bike_station_id", "INT64"),
    bigquery.SchemaField("mrt_station_id", "STRING"),
    bigquery.SchemaField("distance", "FLOAT")
]
upload_df_to_bq(client=BQ_CLIENT,
    df=youbike_mrt_distance,
    dataset_name = "ETL_DIM",
    table_name = "DIM_youbike_mrt_distance",
    schema =youbike_mrt_distance_schema ,
    filetype= "csv",)


Table DIM_youbike_mrt_distance created with 171215 rows.


True

In [None]:
youbike_bus_distance_schema = [
    bigquery.SchemaField("bike_station_id", "INT64"),
    bigquery.SchemaField("bus_station_id", "STRING"),
    bigquery.SchemaField("distance", "FLOAT")
]
upload_df_to_bq(client=BQ_CLIENT,
    df=youbike_mrt_distance,
    dataset_name = "ETL_FACT",
    table_name = "FACT_youbike_bus_distance",
    schema =youbike_bus_distance_schema ,
    filetype= "csv",)



In [3]:
def get_data_from_BQ(client:bigquery.Client):
    query = client.query(
        """
        SELECT * FROM `ETL_FACT.FACT_bike_realtime`
        ORDER BY `source_time` DESC
        LIMIT 1000
        """
    )
    df = query.to_dataframe()
    return(df)
df = get_data_from_BQ(client=BQ_CLIENT)

In [4]:
df

Unnamed: 0,bike_station_id,aval_bike,aval_space,create_time,source_time
0,500111023,40,8,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
1,500110040,22,8,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
2,500113087,12,8,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
3,500108006,10,1,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
4,500104023,23,9,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
...,...,...,...,...,...
995,500108030,9,4,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
996,500106008,4,11,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
997,500107020,25,7,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
998,500103054,10,5,2024-05-17 00:05:54.701103+00:00,2024-05-16 23:58:23+00:00
