In [45]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import matplotlib.pyplot as plt

# sorgt dafür, dass Plots im Notebook angezeigt werden
%matplotlib inline

# Lesbarkeit in der Exploration erhöhen
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 140)

In [46]:
# Zelle 1: Imports & Config
from google.cloud import bigquery
import logging

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# --- KONFIGURATION ---
PROJECT_ID = "taxi-bi-project"
# Trennung: Wo liegen die Daten? Wohin sollen sie?
SOURCE_DATASET = "staging"    # Hier liegen staging_yellow, staging_green, fhv
TARGET_DATASET = "canonical"   # Hier soll die saubere Tabelle hin

BQ_LOCATION = "EU"
TARGET_TABLE = "canonical_unified_taxi"

# Tabellennamen (Basierend auf deiner Exploration)
STAGING_YELLOW = "yellow_staging_unified"
STAGING_GREEN = "green_staging_unified"
STAGING_FHV = "fhv_staging_unified" # <--- Name aus deinem Notebook übernommen

# Client initialisieren
client = bigquery.Client(project=PROJECT_ID)

# WICHTIG: Das Ziel-Dataset 'canonical' muss existieren!
# Wir erstellen es hier automatisch (Location 'EU' basierend auf deinem Fehlerlog)
dataset_id = f"{PROJECT_ID}.{TARGET_DATASET}"
dataset = bigquery.Dataset(dataset_id)
dataset.location = "EU" # Falls deine Daten in US liegen, ändere dies auf "US"

try:
    client.create_dataset(dataset, exists_ok=True)
    print(f"Dataset '{TARGET_DATASET}' ist bereit.")
except Exception as e:
    print(f"Hinweis beim Dataset-Erstellen: {e}")

# Ziel-Referenz aktualisieren
table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{TARGET_TABLE}"
print(f"Ziel-Tabelle wird sein: {table_ref}")

Dataset 'canonical' ist bereit.
Ziel-Tabelle wird sein: taxi-bi-project.canonical.canonical_unified_taxi


In [47]:
# Zelle 3: Reset - Alte Tabelle löschen
from google.api_core.exceptions import NotFound
try:
    client.delete_table(table_ref)
    print(f"Tabelle {table_ref} wurde gelöscht (falls sie existierte).")
except NotFound:
    print("Tabelle existierte noch nicht. Alles okay.")

Tabelle taxi-bi-project.canonical.canonical_unified_taxi wurde gelöscht (falls sie existierte).


In [48]:
# Zelle 4: DDL - Tabelle im Ziel-Dataset erstellen
def create_canonical_table():
    schema = [
        bigquery.SchemaField("unique_row_id", "STRING", description="Generierter Key"),
        bigquery.SchemaField("vendor_id", "STRING"), 
        bigquery.SchemaField("rate_code_id", "INTEGER"),
        bigquery.SchemaField("payment_type", "INTEGER"),
        bigquery.SchemaField("pu_location_id", "INTEGER"),
        bigquery.SchemaField("do_location_id", "INTEGER"),
        bigquery.SchemaField("store_and_fwd_flag", "STRING"),
        bigquery.SchemaField("pickup_datetime", "TIMESTAMP"),
        bigquery.SchemaField("dropoff_datetime", "TIMESTAMP"),
        bigquery.SchemaField("passenger_count", "INTEGER"),
        bigquery.SchemaField("trip_distance", "FLOAT"),
        bigquery.SchemaField("fare_amount", "NUMERIC"),
        bigquery.SchemaField("extra", "NUMERIC"),
        bigquery.SchemaField("mta_tax", "NUMERIC"),
        bigquery.SchemaField("tip_amount", "NUMERIC"),
        bigquery.SchemaField("tolls_amount", "NUMERIC"),
        bigquery.SchemaField("improvement_surcharge", "NUMERIC"),
        bigquery.SchemaField("congestion_surcharge", "NUMERIC"),
        bigquery.SchemaField("airport_fee", "NUMERIC"),
        bigquery.SchemaField("total_amount", "NUMERIC"),
        bigquery.SchemaField("source_type", "STRING"),
        bigquery.SchemaField("valid_from", "TIMESTAMP"),
        bigquery.SchemaField("validation_status", "STRING")
    ]

    table = bigquery.Table(table_ref, schema=schema)
    
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, 
        field="pickup_datetime"
    )
    table.clustering_fields = ["vendor_id", "source_type"]

    try:
        client.create_table(table)
        print(f"Tabelle {table_ref} erfolgreich angelegt.")
    except Exception as e:
        print(f"Konnte Tabelle nicht anlegen (vielleicht existiert sie schon?): {e}")

In [49]:
# Zelle 5: ELT Transformation Query (Source -> Target)
def run_transformation_query():
    print("Bereite Transformation vor...")
    
    START_DATE = '2015-01-01' 
    END_DATE = '2026-01-01'

    # WICHTIG: Hier nutzen wir SOURCE_DATASET für das 'FROM'
    query = f"""
    -- (A) YELLOW
    WITH yellow_clean AS (
        SELECT 'yellow' as source_type, CAST(VendorID AS STRING) as vendor_id,
        CAST(tpep_pickup_datetime AS TIMESTAMP) as pickup_datetime,
        CAST(tpep_dropoff_datetime AS TIMESTAMP) as dropoff_datetime,
        IFNULL(passenger_count, 1) as passenger_count, trip_distance, IFNULL(RatecodeID, 99) as rate_code_id,
        IFNULL(store_and_fwd_flag, 'N') as store_and_fwd_flag, PULocationID as pu_location_id, DOLocationID as do_location_id,
        payment_type,
        GREATEST(IFNULL(fare_amount, 0), 0) as fare_amount, GREATEST(IFNULL(extra, 0), 0) as extra,
        GREATEST(IFNULL(mta_tax, 0), 0) as mta_tax, GREATEST(IFNULL(tip_amount, 0), 0) as tip_amount,
        GREATEST(IFNULL(tolls_amount, 0), 0) as tolls_amount, GREATEST(IFNULL(improvement_surcharge, 0), 0) as improvement_surcharge,
        GREATEST(IFNULL(congestion_surcharge, 0), 0) as congestion_surcharge, GREATEST(IFNULL(airport_fee, 0), 0) as airport_fee
        FROM `{PROJECT_ID}.{SOURCE_DATASET}.{STAGING_YELLOW}`
    ),
    -- (B) GREEN
    green_clean AS (
        SELECT 'green' as source_type, CAST(VendorID AS STRING) as vendor_id,
        CAST(lpep_pickup_datetime AS TIMESTAMP) as pickup_datetime,
        CAST(lpep_dropoff_datetime AS TIMESTAMP) as dropoff_datetime,
        IFNULL(passenger_count, 1) as passenger_count, trip_distance, IFNULL(RatecodeID, 99) as rate_code_id,
        IFNULL(store_and_fwd_flag, 'N') as store_and_fwd_flag, PULocationID as pu_location_id, DOLocationID as do_location_id,
        payment_type,
        GREATEST(IFNULL(fare_amount, 0), 0) as fare_amount, GREATEST(IFNULL(extra, 0), 0) as extra,
        GREATEST(IFNULL(mta_tax, 0), 0) as mta_tax, GREATEST(IFNULL(tip_amount, 0), 0) as tip_amount,
        GREATEST(IFNULL(tolls_amount, 0), 0) as tolls_amount, GREATEST(IFNULL(improvement_surcharge, 0), 0) as improvement_surcharge,
        GREATEST(IFNULL(congestion_surcharge, 0), 0) as congestion_surcharge, 0 as airport_fee
        FROM `{PROJECT_ID}.{SOURCE_DATASET}.{STAGING_GREEN}`
    ),
    -- (C) FHV
    fhv_clean AS (
        SELECT 'fhv' as source_type, dispatching_base_num as vendor_id,
        CAST(pickup_datetime AS TIMESTAMP) as pickup_datetime,
        CAST(dropOff_datetime AS TIMESTAMP) as dropoff_datetime,
        1 as passenger_count, NULL as trip_distance, 99 as rate_code_id, 'N' as store_and_fwd_flag,
        PULocationID as pu_location_id, DOLocationID as do_location_id, NULL as payment_type,
        NULL as fare_amount, NULL as extra, NULL as mta_tax, NULL as tip_amount,
        NULL as tolls_amount, NULL as improvement_surcharge, NULL as congestion_surcharge, NULL as airport_fee
        FROM `{PROJECT_ID}.{SOURCE_DATASET}.{STAGING_FHV}`
    ),
    -- (D) Union
    final_logic AS (
        SELECT * FROM yellow_clean UNION ALL SELECT * FROM green_clean UNION ALL SELECT * FROM fhv_clean
    )

    -- (E) Write to Target
    SELECT 
        FARM_FINGERPRINT(CONCAT(source_type, CAST(pickup_datetime AS STRING), IFNULL(vendor_id, ''))) as unique_row_id,
        *,
        (IFNULL(fare_amount,0) + IFNULL(extra,0) + IFNULL(mta_tax,0) + IFNULL(tolls_amount,0) + 
         IFNULL(improvement_surcharge,0) + IFNULL(congestion_surcharge,0) + IFNULL(airport_fee,0)) as total_amount,
        CURRENT_TIMESTAMP() as valid_from, 
        'VALID' as validation_status
    FROM final_logic
    WHERE 
      pickup_datetime < dropoff_datetime 
      AND (trip_distance IS NULL OR (trip_distance >= 0 AND trip_distance < 1000))
      AND pickup_datetime >= TIMESTAMP('{START_DATE}') 
      AND pickup_datetime < TIMESTAMP('{END_DATE}')

    QUALIFY ROW_NUMBER() OVER (PARTITION BY source_type, pickup_datetime, dropoff_datetime, vendor_id ORDER BY pickup_datetime) = 1
    """

    # HIER nutzen wir table_ref (welches auf TARGET_DATASET zeigt)
    job_config = bigquery.QueryJobConfig(
        destination=table_ref,
        write_disposition="WRITE_TRUNCATE"
    )

    try:
        print("Starte Query auf BigQuery...")
        query_job = client.query(query, job_config=job_config)
        query_job.result()
        print(f"ERFOLG! Daten wurden in {table_ref} gespeichert.")
    except Exception as e:
        print(f"Fehler im Query Job: {e}")

In [50]:
# Zelle 6: Pipeline ausführen
create_canonical_table()
run_transformation_query()

Tabelle taxi-bi-project.canonical.canonical_unified_taxi erfolgreich angelegt.
Bereite Transformation vor...
Starte Query auf BigQuery...
ERFOLG! Daten wurden in taxi-bi-project.canonical.canonical_unified_taxi gespeichert.


In [51]:
# Zelle 7: Ergebnis prüfen
print("--- Check Results ---")
try:
    # Zählen
    count_query = f"SELECT source_type, count(*) as cnt FROM `{table_ref}` GROUP BY 1"
    df_count = client.query(count_query).to_dataframe()
    print("\nZeilen pro Typ:")
    print(df_count)
    
    # Vorschau
    print("\nVorschau (5 Zeilen):")
    df_sample = client.query(f"SELECT * FROM `{table_ref}` LIMIT 5").to_dataframe()
    display(df_sample) # oder print(df_sample)
except Exception as e:
    print(f"Konnte Daten nicht lesen: {e}")

--- Check Results ---





Zeilen pro Typ:
  source_type        cnt
0      yellow  101134534
1       green   67736493
2         fhv  514439669

Vorschau (5 Zeilen):




Unnamed: 0,unique_row_id,source_type,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,airport_fee,total_amount,valid_from,validation_status
0,7646351569098648074,green,2,2015-12-21 16:02:56+00:00,2015-12-21 16:04:14+00:00,1,0.0,1,N,193,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-23 12:15:02.305718+00:00,VALID
1,8321490569331597801,green,2,2015-12-21 20:21:31+00:00,2015-12-21 20:23:40+00:00,1,0.44,1,N,49,61,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-23 12:15:02.305718+00:00,VALID
2,1769977943478882486,green,2,2015-12-21 10:15:16+00:00,2015-12-21 10:19:25+00:00,1,0.81,1,N,244,244,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-23 12:15:02.305718+00:00,VALID
3,-6282569189110701693,green,2,2015-12-21 22:41:35+00:00,2015-12-21 22:44:56+00:00,1,0.7,1,N,33,33,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-23 12:15:02.305718+00:00,VALID
4,7995879999250307598,green,2,2015-12-21 14:47:27+00:00,2015-12-21 14:47:34+00:00,2,0.0,1,N,247,247,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-12-23 12:15:02.305718+00:00,VALID
