In [None]:
# Zelle 1: Setup & Config
from google.cloud import bigquery
from google.api_core.exceptions import NotFound

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt dafür, dass Plots im Notebook angezeigt werden
%matplotlib inline


In [None]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
import logging
import uuid
import datetime

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project"  # project id as shown in the log
SOURCE_DATASET = "staging"      # the error log shows the source dataset is called "staging"
TARGET_DATASET = "canonical"

# Table names
TARGET_TABLE = "canonical_unified_taxi"
ERROR_TABLE = "error_records"
LOG_TABLE = "etl_process_log"

client = bigquery.Client(project=PROJECT_ID)

# 1. Resolve the source dataset; its location is the one the target must match.
try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{SOURCE_DATASET}")
except NotFound:
    # Only a genuinely missing dataset gets the "not found" message. Any other
    # failure propagates unchanged so it is not misreported as a missing dataset.
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{SOURCE_DATASET}' nicht finden!")
    raise
CORRECT_LOCATION = src_ds_ref.location
print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{SOURCE_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")

# 2. Check whether the target dataset exists and whether it is in another location.
target_dataset_id = f"{PROJECT_ID}.{TARGET_DATASET}"
try:
    tgt_ds = client.get_dataset(target_dataset_id)
    if tgt_ds.location != CORRECT_LOCATION:
        print(f"‚ö†Ô∏è KONFLIKT: Ziel '{TARGET_DATASET}' ist in '{tgt_ds.location}', muss aber nach '{CORRECT_LOCATION}'.")
        print("   L√∂sche falsches Dataset...")
        # WARNING: delete_contents=True removes the dataset together with every
        # table it contains; this runs without confirmation on a location mismatch.
        client.delete_dataset(target_dataset_id, delete_contents=True, not_found_ok=True)
        print("   Gel√∂scht. Wird neu erstellt.")
        tgt_ds = None
except NotFound:
    tgt_ds = None

# 3. (Re)create the target dataset in the source dataset's location.
if tgt_ds is None:
    new_ds = bigquery.Dataset(target_dataset_id)
    new_ds.location = CORRECT_LOCATION  # force the same location as the source
    client.create_dataset(new_ds)
    print(f"ZIEL ERSTELLT: Dataset '{TARGET_DATASET}' erfolgreich in Region '{CORRECT_LOCATION}' angelegt.")
else:
    print(f"OK: Ziel-Dataset liegt bereits korrekt in '{CORRECT_LOCATION}'.")

# Fully qualified table ids used by the following cells.
table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{TARGET_TABLE}"
error_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{ERROR_TABLE}"
log_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{LOG_TABLE}"

print("\nReady. Bitte jetzt Zelle 3 ausf√ºhren.")

Python(78274) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


--- DIAGNOSE & REPARATUR ---
‚úÖ QUELLE GEFUNDEN: Dataset 'staging' liegt in Region: 'EU'
OK: Ziel-Dataset liegt bereits korrekt in 'EU'.

Ready. Bitte jetzt Zelle 3 ausf√ºhren.


In [16]:
# Zelle 2: Schema Definition (Update gem√§√ü Schema.csv)
def create_all_tables():
    """Drop and recreate the canonical trip table and the error table.

    Both tables share the same base columns; the error table has one extra
    ``rejection_reason`` column. Each table is deleted if present, then
    created again partitioned on ``pickup_datetime``. A failure for one table
    is printed and the loop moves on to the next table.

    NOTE(review): run_etl_split_logic() rebuilds both tables with
    ``CREATE OR REPLACE TABLE ... AS SELECT``. There the location columns are
    named ``PULocationID`` / ``DOLocationID`` and ``trip_id`` is produced by
    ``FARM_FINGERPRINT``, whereas this schema declares ``pickup_location_id`` /
    ``dropoff_location_id`` and a STRING ``trip_id`` -- confirm which
    definition is the intended one.
    """

    def column(name, field_type, description=None):
        # Only pass a description when one is given, so columns without one
        # are built exactly as a plain two-argument SchemaField.
        if description is None:
            return bigquery.SchemaField(name, field_type)
        return bigquery.SchemaField(name, field_type, description=description)

    base_schema = [
        # Metadata
        column("trip_id", "STRING", "Unique ID"),
        column("source_system", "STRING"),
        column("load_date", "TIMESTAMP"),
        column("vendor_id", "STRING"),
        # Base numbers
        column("Affiliated_base_number", "STRING"),
        column("dispatching_base_nummer", "STRING"),
        # Time and location
        column("pickup_datetime", "TIMESTAMP"),
        column("dropoff_datetime", "TIMESTAMP"),
        column("pickup_location_id", "INTEGER"),
        column("dropoff_location_id", "INTEGER"),
        # Trip details
        column("passenger_count", "INTEGER"),
        column("trip_distance", "FLOAT"),
        column("store_and_fwd_flag", "STRING"),
        # Rate code and trip type
        column("RatecodeID", "INTEGER"),
        column("Trip_type", "INTEGER"),
        # Shared-ride flag (previously named is_shared_ride)
        column("SR_Flag", "BOOLEAN", "1=Shared, NULL=Non-Shared"),
        # Financials (basic)
        column("fare_amount", "FLOAT"),
        column("tip_amount", "FLOAT"),
        column("total_amount", "FLOAT"),
        column("payment_type", "INTEGER"),
        # Financials (extended)
        column("extra", "FLOAT"),
        column("mta_tax", "FLOAT"),
        column("tolls_amount", "FLOAT"),
        column("improvement_surcharge", "FLOAT"),
        column("congestion_surcharge", "FLOAT"),
        column("Airport_fee", "FLOAT"),  # capitalised, following the schema
        column("ehail_fee", "FLOAT"),
        # Validation flag
        column("dq_issue_flag", "BOOLEAN"),
    ]

    # The error table carries every base column plus the rejection reason.
    error_schema = base_schema + [column("rejection_reason", "STRING")]

    for full_table_id, columns in ((table_ref, base_schema), (error_table_ref, error_schema)):
        short_name = full_table_id.split('.')[-1]
        try:
            # Clean slate: remove any previous version before creating.
            client.delete_table(full_table_id, not_found_ok=True)

            new_table = bigquery.Table(full_table_id, schema=columns)
            new_table.time_partitioning = bigquery.TimePartitioning(field="pickup_datetime")
            client.create_table(new_table)
            print(f"‚úÖ Tabelle aktualisiert: {short_name}")
        except Exception as exc:
            print(f"‚ùå Fehler: {exc}")


create_all_tables()

‚úÖ Tabelle aktualisiert: canonical_unified_taxi
‚úÖ Tabelle aktualisiert: error_records


In [None]:
# Zelle 3: ETL Pipeline (Final Update mit allen neuen Spalten)
def run_etl_split_logic():
    """Rebuild the canonical and error tables from the three staging tables.

    Steps:
      1. Drop the existing target and error tables.
      2. Run one BigQuery script that unions the yellow, green and FHV staging
         tables into a temp table, derives ``dq_issue_flag`` and a
         ``row_status`` per row, then writes rows with status 'VALID' to the
         target table and all other rows (with the status as
         ``rejection_reason``) to the error table.
      3. Print the row count of the target table.

    Raises:
        Exception: any error from the query job or the row-count lookup is
            printed and then re-raised. The target tables have already been
            dropped at that point, so continuing silently would leave later
            cells querying tables that do not exist.

    NOTE(review): ``trip_id`` hashes only source, pickup timestamp and vendor
    id, so rows sharing those three values receive the same id -- confirm
    whether uniqueness is required.
    NOTE(review): nothing is written to the ``etl_process_log`` table here.
    """
    print(f"Starte ETL Job (Schema Update) in Region: {CORRECT_LOCATION}...")

    # Clean slate: drop both outputs before the script recreates them.
    client.delete_table(table_ref, not_found_ok=True)
    client.delete_table(error_table_ref, not_found_ok=True)

    # Lower bound for pickup timestamps; older rows are rejected as 'Date too old'.
    # The upper bound in the script is CURRENT_TIMESTAMP().
    START_DATE = '2015-01-01'

    query = f"""
    BEGIN
        CREATE TEMP TABLE temp_all_trips AS
        WITH raw_combined AS (
            -- (A) YELLOW
            SELECT 
                'YELLOW' as source, CAST(VendorID AS STRING) as vid,
                CAST(tpep_pickup_datetime AS TIMESTAMP) as t_pick, CAST(tpep_dropoff_datetime AS TIMESTAMP) as t_drop,
                PULocationID as loc_pu, DOLocationID as loc_do,
                IFNULL(passenger_count, 1) as pax, trip_distance as dist, IFNULL(store_and_fwd_flag, 'N') as flag,
                
                -- Ratecode (Yellow hat ihn)
                IFNULL(RatecodeID, 99) as ratecode,
                NULL as trip_type, -- Yellow hat meist keinen expliziten TripType
                
                -- Base Nummern (Yellow hat keine)
                NULL as aff_base, NULL as disp_base,
                
                -- Finanzen
                IFNULL(payment_type, 0) as pay_type, FALSE as shared,
                GREATEST(IFNULL(fare_amount, 0), 0) as fare, 
                GREATEST(IFNULL(tip_amount, 0), 0) as tip, 
                GREATEST(IFNULL(total_amount, 0), 0) as total,
                GREATEST(IFNULL(extra, 0), 0) as extra,
                GREATEST(IFNULL(mta_tax, 0), 0) as mta,
                GREATEST(IFNULL(tolls_amount, 0), 0) as tolls,
                GREATEST(IFNULL(improvement_surcharge, 0), 0) as imp_sur,
                GREATEST(IFNULL(congestion_surcharge, 0), 0) as cong_sur,
                GREATEST(IFNULL(airport_fee, 0), 0) as air_fee,
                NULL as ehail -- Yellow hat keine ehail fee
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.yellow_staging_unified`
            
            UNION ALL
            
            -- (B) GREEN
            SELECT 
                'GREEN', CAST(VendorID AS STRING),
                CAST(lpep_pickup_datetime AS TIMESTAMP), CAST(lpep_dropoff_datetime AS TIMESTAMP),
                PULocationID, DOLocationID,
                IFNULL(passenger_count, 1), trip_distance, IFNULL(store_and_fwd_flag, 'N'),
                
                IFNULL(RatecodeID, 99),
                CAST(trip_type AS INT64), -- Green hat Trip Type
                
                NULL, NULL, -- Base Nummern
                
                IFNULL(payment_type, 0), FALSE,
                GREATEST(IFNULL(fare_amount, 0), 0), 
                GREATEST(IFNULL(tip_amount, 0), 0), 
                GREATEST(IFNULL(total_amount, 0), 0),
                GREATEST(IFNULL(extra, 0), 0),
                GREATEST(IFNULL(mta_tax, 0), 0),
                GREATEST(IFNULL(tolls_amount, 0), 0),
                GREATEST(IFNULL(improvement_surcharge, 0), 0),
                GREATEST(IFNULL(congestion_surcharge, 0), 0),
                0, -- Airport Fee oft nicht in Source
                GREATEST(IFNULL(ehail_fee, 0), 0) -- Green hat ehail fee
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.green_staging_unified`

            UNION ALL

            -- (C) FHV
            SELECT 
                'FHV', dispatching_base_num, -- Nutzt Base als VendorID
                CAST(pickup_datetime AS TIMESTAMP), CAST(dropOff_datetime AS TIMESTAMP),
                PULocationID, DOLocationID,
                NULL, NULL, 'N', -- Pax & Dist NULL
                
                99 as ratecode, -- FHV hat keinen Ratecode -> Unknown
                NULL as trip_type,
                
                -- NEU: Mapping der Base Nummern
                Affiliated_base_number as aff_base,
                dispatching_base_num as disp_base,
                
                0 as pay_type, -- Unknown
                IF(CAST(SR_Flag AS STRING)='1', TRUE, FALSE), -- SR_Flag
                
                -- Alle Finanzen NULL
                NULL, NULL, NULL, 
                NULL, NULL, NULL, NULL, NULL, NULL, NULL 
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.fhv_staging_unified`
        )
        
        SELECT
            FARM_FINGERPRINT(CONCAT(source, CAST(t_pick AS STRING), IFNULL(vid,''))) as trip_id,
            source as source_system,
            CURRENT_TIMESTAMP() as load_date,
            vid as vendor_id,
            aff_base as Affiliated_base_number,
            disp_base as dispatching_base_nummer,
            t_pick as pickup_datetime,
            t_drop as dropoff_datetime,
            loc_pu as PULocationID,
            loc_do as DOLocationID,
            pax as passenger_count,
            dist as trip_distance,
            flag as store_and_fwd_flag,
            ratecode as RatecodeID,
            trip_type as Trip_type,
            shared as SR_Flag,
            fare as fare_amount,
            tip as tip_amount,
            total as total_amount,
            pay_type as payment_type,
            extra, mta as mta_tax, tolls as tolls_amount,
            imp_sur as improvement_surcharge, cong_sur as congestion_surcharge,
            air_fee as Airport_fee, ehail as ehail_fee,
            
            -- DQ Validation (Flags setzen)
            CASE 
                -- 1. Unm√∂gliche Distanzen
                WHEN dist > 500 THEN TRUE
                
                -- 2. Zu viele Passagiere (au√üer Group Ride)
                WHEN source IN ('YELLOW', 'GREEN') AND pax > 5 THEN TRUE
                
                -- 3. Verd√§chtiges Trinkgeld bei Barzahlung
                WHEN pay_type = 2 AND tip > 10 THEN TRUE
                
                -- 4. Airport Fee an falschen Orten (132=JFK, 138=LGA)
                WHEN air_fee > 0 AND (loc_pu NOT IN (132, 138) AND loc_do NOT IN (132, 138)) THEN TRUE
                
                -- 5. 0-Sekunden-Fahrten (aber mit Umsatz)
                WHEN t_pick = t_drop AND total > 0 THEN TRUE 
                
                -- 6.  Ung√ºltige Zonen-IDs (Range Check 1-265)
                WHEN (loc_pu IS NOT NULL AND (loc_pu < 1 OR loc_pu > 265)) OR 
                     (loc_do IS NOT NULL AND (loc_do < 1 OR loc_do > 265)) THEN TRUE
                     
                ELSE FALSE 
            END as dq_issue_flag,

            -- Status Split (Valid vs Error)
            CASE
                WHEN t_pick IS NULL OR t_drop IS NULL THEN 'Missing Timestamps'
                WHEN t_pick > t_drop THEN 'Negative Duration'
                WHEN t_pick = t_drop AND (total IS NULL OR total = 0) THEN 'Zero Duration/No Revenue'
                WHEN t_pick < TIMESTAMP('{START_DATE}') THEN 'Date too old'
                WHEN t_pick > CURRENT_TIMESTAMP() THEN 'Future Date'
                ELSE 'VALID'
            END as row_status

        FROM raw_combined;

        CREATE OR REPLACE TABLE `{table_ref}`
        PARTITION BY DATE(pickup_datetime)
        CLUSTER BY source_system, vendor_id
        AS SELECT * EXCEPT(row_status) FROM temp_all_trips WHERE row_status = 'VALID';

        CREATE OR REPLACE TABLE `{error_table_ref}`
        PARTITION BY DATE(pickup_datetime)
        AS SELECT * EXCEPT(row_status), row_status as rejection_reason FROM temp_all_trips WHERE row_status != 'VALID';
    END;
    """

    try:
        # Run the script in the datasets' location and block until it finishes.
        query_job = client.query(query, location=CORRECT_LOCATION)
        query_job.result()

        cnt_valid = client.get_table(table_ref).num_rows
        print(f"‚úÖ ETL Fertig! {cnt_valid} Trips geladen (Neues Schema).")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")
        # The outputs were dropped above; surface the failure instead of
        # letting later cells run against missing tables.
        raise


run_etl_split_logic()

Starte ETL Job (Schema Update) in Region: EU...
‚úÖ ETL Fertig! 688622858 Trips geladen (Neues Schema).


In [22]:
# Cell 4: quality check
print("--- CLEAN DATA SAMPLE ---")
# Transposed so the many columns of the three sample rows read top to bottom.
print(client.query(f"SELECT * FROM `{table_ref}` LIMIT 3").to_dataframe().T)

print("\n--- ERROR DATA SAMPLE  ---")
try:
    err_df = client.query(f"SELECT rejection_reason, count(*) as cnt FROM `{error_table_ref}` GROUP BY 1").to_dataframe()
except NotFound:
    # The error table does not exist; this is different from "no rejected rows".
    print(f"Fehlertabelle nicht gefunden: {error_table_ref}")
else:
    # An empty table does not raise -- it yields an empty frame, so the
    # "no errors" message is decided here. Any other query failure propagates.
    if err_df.empty:
        print("Keine Fehler gefunden (Tabelle leer).")
    else:
        print(err_df)

--- CLEAN DATA SAMPLE ---




                                                        0                                 1                                 2
trip_id                              -6475556117126832522               3713405437082011635              -4842616212638726736
source_system                                         FHV                               FHV                               FHV
load_date                2025-12-30 11:01:42.262522+00:00  2025-12-30 11:01:42.262522+00:00  2025-12-30 11:01:42.262522+00:00
vendor_id                                          B00531                            B00531                            B00531
Affiliated_base_number                               None                              None                              None
dispatching_base_nummer                            B00531                            B00531                            B00531
pickup_datetime                 2020-12-19 12:31:20+00:00         2020-12-19 13:10:42+00:00         2020-12-19 16:11:5



In [20]:
# Zelle 5: Data Quality Check (Stichproben pro System)
import pandas as pd

def _show_query_result(sql, use_display=True):
    """Run ``sql`` and show the resulting DataFrame.

    Args:
        sql: query text to execute with the notebook-level ``client``.
        use_display: when True the frame is rendered with the notebook's
            ``display``; when False it is shown with ``print``.

    Any exception is printed instead of raised so the remaining samples are
    still shown.
    """
    try:
        result_df = client.query(sql).to_dataframe()
        if use_display:
            display(result_df)  # use print(result_df) when running outside Jupyter
        else:
            print(result_df)
    except Exception as e:
        print(e)


def check_data_samples():
    """Print a five-row sample per source system and the row count per system.

    The samples come from the canonical table (``table_ref``): yellow rows
    with rate code and fare columns, green rows with trip type and e-hail fee,
    FHV rows with base numbers and fare columns, followed by a
    ``COUNT(*)`` grouped by ``source_system``. A failing query is reported by
    printing its error; it does not stop the remaining checks.
    """
    print(f"--- üîç DATA INSPECTION (Tabelle: {table_ref}) ---\n")

    # 1. YELLOW: are rate code and the financial columns populated?
    print("üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):")
    sql_yellow = f"""
    SELECT 
        source_system, vendor_id, pickup_datetime, 
        RatecodeID, payment_type, 
        fare_amount, tip_amount, total_amount, 
        dq_issue_flag
    FROM `{table_ref}`
    WHERE source_system = 'YELLOW'
    LIMIT 5
    """
    _show_query_result(sql_yellow)

    # 2. GREEN: are the green-specific Trip_type and ehail_fee columns populated?
    print("\nüíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):")
    sql_green = f"""
    SELECT 
        source_system, vendor_id, 
        Trip_type, ehail_fee, 
        fare_amount, payment_type, 
        dq_issue_flag
    FROM `{table_ref}`
    WHERE source_system = 'GREEN'
    LIMIT 5
    """
    _show_query_result(sql_green)

    # 3. FHV: are the base numbers present and the price columns empty?
    print("\nüñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):")
    sql_fhv = f"""
    SELECT 
        source_system, 
        dispatching_base_nummer, 
        Affiliated_base_number, 
        SR_Flag, 
        fare_amount, total_amount, payment_type, -- M√ºssen alle NaN/None sein
        dq_issue_flag
    FROM `{table_ref}`
    WHERE source_system = 'FHV'
    LIMIT 5
    """
    _show_query_result(sql_fhv)

    # 4. Row counts per source system. Errors are printed like in the sections
    # above rather than silently discarded.
    print("\nüìä VERTEILUNG NACH SYSTEM:")
    sql_stats = f"""
    SELECT source_system, COUNT(*) as count 
    FROM `{table_ref}` 
    GROUP BY source_system
    """
    _show_query_result(sql_stats, use_display=False)


check_data_samples()

--- üîç DATA INSPECTION (Tabelle: taxi-bi-project.canonical.canonical_unified_taxi) ---

üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):




Unnamed: 0,source_system,vendor_id,pickup_datetime,RatecodeID,payment_type,fare_amount,tip_amount,total_amount,dq_issue_flag
0,YELLOW,2,2018-07-19 21:50:04+00:00,1,1,25.5,5.36,32.16,False
1,YELLOW,2,2018-07-19 21:29:15+00:00,1,1,4.0,1.06,6.36,False
2,YELLOW,2,2018-07-19 22:17:02+00:00,1,2,18.0,0.0,19.3,False
3,YELLOW,2,2023-12-15 19:59:38+00:00,1,4,0.0,0.0,0.0,False
4,YELLOW,2,2023-12-15 09:03:09+00:00,1,3,0.0,0.0,0.0,False



üíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):




Unnamed: 0,source_system,vendor_id,Trip_type,ehail_fee,fare_amount,payment_type,dq_issue_flag
0,GREEN,1,1.0,0.0,17.0,1,False
1,GREEN,1,1.0,0.0,61.0,1,False
2,GREEN,2,,0.0,36.25,0,False
3,GREEN,2,1.0,0.0,14.5,1,False
4,GREEN,2,1.0,0.0,13.5,1,False



üñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):




Unnamed: 0,source_system,dispatching_base_nummer,Affiliated_base_number,SR_Flag,fare_amount,total_amount,payment_type,dq_issue_flag
0,FHV,B03092,,False,,,0,False
1,FHV,B03081,,False,,,0,False
2,FHV,B03064,,False,,,0,False
3,FHV,B03092,,False,,,0,False
4,FHV,B03092,,False,,,0,False



üìä VERTEILUNG NACH SYSTEM:
  source_system      count
0        YELLOW  105073826
1           FHV  515372815
2         GREEN   67984925


