In [12]:
import pandas as pd
from google.cloud import bigquery
from IPython.display import display

In [13]:
# Project and dataset configuration
PROJECT_ID = "taxi-bi-project"
DATASET_ID = "staging"
BQ_LOCATION = "EU"  # adjust if the dataset was created in a different location

# Fully qualified dataset prefix, e.g. "taxi-bi-project.staging"
FULL_DATASET_PREFIX = ".".join((PROJECT_ID, DATASET_ID))

# BigQuery client shared by the query helpers in this notebook
bq_client = bigquery.Client(location=BQ_LOCATION, project=PROJECT_ID)

In [14]:
# The three central BigQuery views, keyed by a short alias.
# Each entry is a dict so further per-view settings can sit next to "table".
VIEWS = {
    "YELLOW": {"table": "yellow_staging_unified"},
    "GREEN": {"table": "green_staging_unified"},
    "FHV": {"table": "fhv_staging_unified"},
}

In [15]:
# Funktion für Metadaten (Zeilen- & Spaltenanzahl)
def get_view_metadata(table_name: str) -> dict:
    """
    Collect basic size information for one table or view of the dataset.

    Args:
        table_name: Name of the table/view inside ``PROJECT_ID.DATASET_ID``.

    Returns:
        Dict with the keys ``"Table"`` (the given name), ``"Rows"`` (result
        of a ``COUNT(*)`` query) and ``"Columns"`` (length of the schema
        reported by ``get_table``).
    """
    qualified_name = f"{PROJECT_ID}.{DATASET_ID}.{table_name}"

    # Column count: taken from the schema metadata, no query needed.
    n_columns = len(bq_client.get_table(qualified_name).schema)

    # Row count: obtained by running COUNT(*) against the table itself.
    count_sql = f"SELECT COUNT(*) AS row_count FROM `{qualified_name}`"
    count_rows = list(bq_client.query(count_sql).result())
    n_rows = count_rows[0].row_count

    return {"Table": table_name, "Rows": n_rows, "Columns": n_columns}

In [16]:
# Compute the metadata for every configured view; the alias becomes the index
metadata_rows = [
    {**get_view_metadata(cfg["table"]), "View": alias}
    for alias, cfg in VIEWS.items()
]

df_metadata = pd.DataFrame(metadata_rows).set_index("View")

df_metadata

Unnamed: 0_level_0,Table,Rows,Columns
View,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YELLOW,yellow_staging_unified,178439591,21
GREEN,green_staging_unified,68045597,22
FHV,fhv_staging_unified,783688849,9


In [17]:
# Schema je View anzeigen
def get_schema(table_name: str) -> pd.DataFrame:
    """
    Return the column schema of a table or view as a DataFrame.

    The schema is read from the dataset's ``INFORMATION_SCHEMA.COLUMNS``
    view and ordered by ``ordinal_position``.

    Args:
        table_name: Name of the table/view inside ``PROJECT_ID.DATASET_ID``.

    Returns:
        DataFrame with the columns ``column_name``, ``data_type`` and
        ``is_nullable``, one row per column of the table. The frame is
        empty when no row in ``INFORMATION_SCHEMA.COLUMNS`` matches
        ``table_name``.
    """
    query = f"""
    SELECT
      column_name,
      data_type,
      is_nullable
    FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.COLUMNS`
    WHERE table_name = @table_name
    ORDER BY ordinal_position
    """
    # Bind the table name as a named query parameter instead of splicing it
    # into a quoted SQL literal, so a name containing quote characters
    # cannot break (or alter) the statement.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("table_name", "STRING", table_name),
        ]
    )
    return bq_client.query(query, job_config=job_config).to_dataframe()

# Show the schema of each view, separated by a horizontal rule
separator = "-" * 80
for alias, cfg in VIEWS.items():
    view_table = cfg["table"]
    print(f"Schema für {alias} ({view_table}):")
    display(get_schema(view_table))
    print(separator)

Schema für YELLOW (yellow_staging_unified):




Unnamed: 0,column_name,data_type,is_nullable
0,VendorID,INT64,YES
1,tpep_pickup_datetime,TIMESTAMP,YES
2,tpep_dropoff_datetime,TIMESTAMP,YES
3,passenger_count,INT64,YES
4,trip_distance,FLOAT64,YES
5,RatecodeID,INT64,YES
6,store_and_fwd_flag,STRING,YES
7,PULocationID,INT64,YES
8,DOLocationID,INT64,YES
9,payment_type,INT64,YES


--------------------------------------------------------------------------------
Schema für GREEN (green_staging_unified):




Unnamed: 0,column_name,data_type,is_nullable
0,VendorID,INT64,YES
1,lpep_pickup_datetime,DATETIME,YES
2,lpep_dropoff_datetime,DATETIME,YES
3,store_and_fwd_flag,STRING,YES
4,RatecodeID,INT64,YES
5,PULocationID,INT64,YES
6,DOLocationID,INT64,YES
7,passenger_count,INT64,YES
8,trip_distance,FLOAT64,YES
9,fare_amount,FLOAT64,YES


--------------------------------------------------------------------------------
Schema für FHV (fhv_staging_unified):




Unnamed: 0,column_name,data_type,is_nullable
0,dispatching_base_num,STRING,YES
1,pickup_datetime,DATETIME,YES
2,dropOff_datetime,DATETIME,YES
3,PUlocationID,INT64,YES
4,DOlocationID,INT64,YES
5,SR_Flag,STRING,YES
6,Affiliated_base_number,STRING,YES
7,duplicate_flag,STRING,YES
8,missing_flag,STRING,YES


--------------------------------------------------------------------------------


In [18]:
# Pickup-time column of each view, keyed by the same aliases as VIEWS
DATETIME_COLUMNS = dict(
    YELLOW="tpep_pickup_datetime",
    GREEN="lpep_pickup_datetime",
    FHV="pickup_datetime",
)

def get_temporal_range(table_name: str, datetime_col: str) -> dict:
    """
    Determine the earliest and latest value of a datetime column.

    Args:
        table_name: Name of the table/view inside ``PROJECT_ID.DATASET_ID``.
        datetime_col: Column whose ``MIN`` and ``MAX`` are queried. It is
            inserted into the SQL text as-is.

    Returns:
        Dict with the keys ``"Table"``, ``"Datetime_Column"``,
        ``"Min_Timestamp"`` and ``"Max_Timestamp"``.
    """
    source_table = f"{PROJECT_ID}.{DATASET_ID}.{table_name}"

    # A single aggregate query yields both bounds in one result row.
    range_sql = f"""
    SELECT
      MIN({datetime_col}) AS min_ts,
      MAX({datetime_col}) AS max_ts
    FROM `{source_table}`
    """
    range_rows = list(bq_client.query(range_sql).result())
    bounds = range_rows[0]

    summary = {"Table": table_name, "Datetime_Column": datetime_col}
    summary["Min_Timestamp"] = bounds.min_ts
    summary["Max_Timestamp"] = bounds.max_ts
    return summary


In [19]:
# Gather the temporal coverage of every configured view; alias becomes the index
temporal_rows = [
    {**get_temporal_range(cfg["table"], DATETIME_COLUMNS[alias]), "View": alias}
    for alias, cfg in VIEWS.items()
]

df_temporal = pd.DataFrame(temporal_rows).set_index("View")

df_temporal


Unnamed: 0_level_0,Table,Datetime_Column,Min_Timestamp,Max_Timestamp
View,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YELLOW,yellow_staging_unified,tpep_pickup_datetime,2001-01-01 00:02:08+00:00,2029-05-05 08:37:39+00:00
GREEN,green_staging_unified,lpep_pickup_datetime,2008-10-21 15:52:05,2081-06-24 17:40:37
FHV,fhv_staging_unified,pickup_datetime,2015-01-01 00:00:00,2025-07-31 23:59:59
