In [1]:
# Zelle 1: Setup & Config
from google.cloud import bigquery
from google.api_core.exceptions import NotFound

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt daf√ºr, dass Plots im Notebook angezeigt werden
%matplotlib inline


In [2]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project" # Deine ID aus dem Log
SOURCE_DATASET = "staging"     # <--- Laut deinem Fehlerlog hei√üt es "staging"!
TARGET_DATASET = "canonical"   

# Tabellen
TARGET_TABLE = "canonical_unified_taxi"
ERROR_TABLE = "error_records"
LOG_TABLE = "etl_process_log"

client = bigquery.Client(project=PROJECT_ID)

print("--- DIAGNOSE & REPARATUR ---")
try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{SOURCE_DATASET}")
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{SOURCE_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")
except Exception as e:
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{SOURCE_DATASET}' nicht finden!")
    print("Bitte pr√ºfe: Hei√üt dein Dataset in BigQuery wirklich 'staging' oder 'taxi_dwh'?")
    raise e

# 2. Pr√ºfen, ob das Ziel-Dataset 'canonical' falsch liegt
target_dataset_id = f"{PROJECT_ID}.{TARGET_DATASET}"
try:
    tgt_ds = client.get_dataset(target_dataset_id)
    if tgt_ds.location != CORRECT_LOCATION:
        print(f"‚ö†Ô∏è KONFLIKT: Ziel '{TARGET_DATASET}' ist in '{tgt_ds.location}', muss aber nach '{CORRECT_LOCATION}'.")
        print("   L√∂sche falsches Dataset...")
        client.delete_dataset(target_dataset_id, delete_contents=True, not_found_ok=True)
        print("   Gel√∂scht. Wird neu erstellt.")
        tgt_ds = None
except NotFound:
    tgt_ds = None

# 3. Ziel-Dataset korrekt neu erstellen
if not tgt_ds:
    new_ds = bigquery.Dataset(target_dataset_id)
    new_ds.location = CORRECT_LOCATION # <--- Hier zwingen wir die richtige Region!
    client.create_dataset(new_ds)
    print(f"ZIEL ERSTELLT: Dataset '{TARGET_DATASET}' erfolgreich in Region '{CORRECT_LOCATION}' angelegt.")
else:
    print(f"OK: Ziel-Dataset liegt bereits korrekt in '{CORRECT_LOCATION}'.")

# Globale Variablen aktualisieren
table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{TARGET_TABLE}"
error_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{ERROR_TABLE}"
log_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{LOG_TABLE}"

print("\nReady. Bitte jetzt Zelle 3 ausf√ºhren.")

--- DIAGNOSE & REPARATUR ---
‚úÖ QUELLE GEFUNDEN: Dataset 'staging' liegt in Region: 'EU'
OK: Ziel-Dataset liegt bereits korrekt in 'EU'.

Ready. Bitte jetzt Zelle 3 ausf√ºhren.


In [81]:
def check_missing_locations_by_year():
    print("--- üîç DIAGNOSE: Fehlen Location IDs bei alten Yellow Trips? ---")
    
    # Wir gruppieren nach Jahr und z√§hlen, wie viele IDs fehlen
    query = f"""
    SELECT 
        EXTRACT(YEAR FROM pickup_datetime) as year,
        COUNT(*) as total_trips,
        COUNTIF(pickup_location_id IS NULL OR pickup_location_id = 0) as missing_pu_ids,
        COUNTIF(dropoff_location_id IS NULL OR dropoff_location_id = 0) as missing_do_ids,
        
        -- Prozentualer Anteil der Fehler
        ROUND(COUNTIF(pickup_location_id IS NULL OR pickup_location_id = 0) / COUNT(*) * 100, 2) as pct_missing
        
    FROM `{table_ref}`  -- Das ist deine 'canonical_unified_taxi'
    WHERE source_system = 'YELLOW'
    GROUP BY 1
    ORDER BY 1
    """
    
    try:
        df = client.query(query).to_dataframe()
        print(df.to_string(index=False))
        
        # Sofortige Analyse
        if df['pct_missing'].max() > 10:
            print("\n‚ö†Ô∏è ALARM: Wir haben signifikante L√ºcken bei den Location IDs!")
            print("   -> Wahrscheinlich m√ºssen wir Longitude/Latitude mappen.")
        else:
            print("\n‚úÖ Entwarnung: Die Staging-Tabelle scheint das schon bereinigt zu haben.")
            
    except Exception as e:
        print(f"‚ùå Fehler bei der Diagnose: {e}")

check_missing_locations_by_year()

--- üîç DIAGNOSE: Fehlen Location IDs bei alten Yellow Trips? ---
 year  total_trips  missing_pu_ids  missing_do_ids  pct_missing
 2010      4514348               0               0          0.0
 2011      4660462               0               0          0.0
 2012      4669852               0               0          0.0
 2013      4596563               0               0          0.0
 2014      4555520               0               0          0.0
 2015      8659004               0               0          0.0
 2016      8421023               0               0          0.0
 2017      8005526               0               0          0.0
 2018      7655156               0               0          0.0
 2019      6855858               0               0          0.0
 2020       982184               0               0          0.0
 2021      3859939               0               0          0.0
 2022      4530162               0               0          0.0
 2023      4241230               0   



In [36]:
def inspect_staging_columns():
    print("--- üïµÔ∏è‚Äç‚ôÇÔ∏è STAGING INSPECTION (Yellow Taxi) ---")
    
    # Wir schauen uns eine Zeile aus der Staging Tabelle an
    # Stelle sicher, dass der Tabellenname stimmt (yellow_staging_unified)
    table_id = f"{PROJECT_ID}.{SOURCE_DATASET}.yellow_staging_unified"
    
    try:
        # Wir laden nur die Spaltennamen
        df = client.query(f"SELECT * FROM `{table_id}` LIMIT 1").to_dataframe()
        
        print(f"Gefundene Spalten in '{table_id}':")
        cols = sorted(df.columns)
        for c in cols:
            print(f" - {c}")
            
        # Spezifischer Check auf Koordinaten
        print("\nüîç Koordinaten-Check:")
        coord_cols = [c for c in cols if 'lat' in c.lower() or 'lon' in c.lower()]
        if coord_cols:
            print(f"‚úÖ Koordinaten gefunden: {coord_cols}")
        else:
            print("‚ùå Keine direkten Koordinaten-Spalten (Lat/Lon) gefunden.")

    except Exception as e:
        print(f"‚ùå Fehler beim Lesen von Staging: {e}")

inspect_staging_columns()

--- üïµÔ∏è‚Äç‚ôÇÔ∏è STAGING INSPECTION (Yellow Taxi) ---
Gefundene Spalten in 'taxi-bi-project.staging.yellow_staging_unified':
 - Airport_fee
 - DOLocationID
 - PULocationID
 - RatecodeID
 - VendorID
 - congestion_surcharge
 - dropoff_latitude
 - dropoff_longitude
 - duplicate_flag
 - extra
 - fare_amount
 - improvement_surcharge
 - missing_flag
 - mta_tax
 - passenger_count
 - payment_type
 - pickup_latitude
 - pickup_longitude
 - store_and_fwd_flag
 - tip_amount
 - tolls_amount
 - total_amount
 - tpep_dropoff_datetime
 - tpep_pickup_datetime
 - trip_distance

üîç Koordinaten-Check:
‚úÖ Koordinaten gefunden: ['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude', 'pickup_longitude']




In [96]:
# Zelle 2: Schema Definition (Fix: MONTH Partitioning)
def create_all_tables():
    base_schema = [
        bigquery.SchemaField("trip_id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("source_system", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("load_date", "TIMESTAMP", mode="REQUIRED"),
        bigquery.SchemaField("vendor_id", "STRING"), 
        bigquery.SchemaField("Affiliated_base_number", "STRING"),
        bigquery.SchemaField("dispatching_base_nummer", "STRING"),
        bigquery.SchemaField("pickup_datetime", "TIMESTAMP", mode="REQUIRED"),
        bigquery.SchemaField("dropoff_datetime", "TIMESTAMP"),
        bigquery.SchemaField("pickup_location_id", "INT64"), 
        bigquery.SchemaField("dropoff_location_id", "INT64"), 
        bigquery.SchemaField("passenger_count", "INT64"),
        bigquery.SchemaField("trip_distance", "FLOAT64"),
        bigquery.SchemaField("store_and_fwd_flag", "STRING"),
        bigquery.SchemaField("RatecodeID", "INT64"),
        bigquery.SchemaField("Trip_type", "INT64"),
        bigquery.SchemaField("SR_Flag", "BOOLEAN"), 
        bigquery.SchemaField("fare_amount", "FLOAT64"),
        bigquery.SchemaField("tip_amount", "FLOAT64"),
        bigquery.SchemaField("total_amount", "FLOAT64"),
        bigquery.SchemaField("payment_type", "INT64"),
        bigquery.SchemaField("extra", "FLOAT64"),
        bigquery.SchemaField("mta_tax", "FLOAT64"),
        bigquery.SchemaField("tolls_amount", "FLOAT64"),
        bigquery.SchemaField("improvement_surcharge", "FLOAT64"),
        bigquery.SchemaField("congestion_surcharge", "FLOAT64"),
        bigquery.SchemaField("Airport_fee", "FLOAT64"), 
        bigquery.SchemaField("ehail_fee", "FLOAT64"),   
        bigquery.SchemaField("dq_issue_flag", "BOOLEAN") 
    ]

    error_schema = base_schema + [bigquery.SchemaField("rejection_reason", "STRING")]

    # Hier setzen wir explizit MONTH statt DAY
    tables_to_create = [
        (table_ref, base_schema, "pickup_datetime", bigquery.TimePartitioningType.MONTH),
        (error_table_ref, error_schema, "load_date", bigquery.TimePartitioningType.DAY) # Error bleibt DAY, da load_date nur HEUTE ist
    ]

    for t_ref, t_schema, p_field, p_type in tables_to_create:
        try:
            client.delete_table(t_ref, not_found_ok=True)
            t = bigquery.Table(t_ref, schema=t_schema)
            
            t.time_partitioning = bigquery.TimePartitioning(
                field=p_field,
                type_=p_type
            )
            
            t.clustering_fields = ["source_system", "vendor_id"]
            client.create_table(t)
            print(f"‚úÖ Tabelle neu erstellt: {t_ref.split('.')[-1]} (Partition: {p_type})")
        except Exception as e:
            print(f"‚ùå Fehler bei {t_ref}: {e}")

create_all_tables()

‚úÖ Tabelle neu erstellt: canonical_unified_taxi (Partition: MONTH)
‚úÖ Tabelle neu erstellt: error_records (Partition: DAY)


In [98]:
def run_etl_split_logic():
    print(f"üöÄ Starte finale ETL Pipeline f√ºr den Canonical Layer...")
    
    query = f"""
    BEGIN
        -- 1. TRANSFORMATION IN TEMP-SPEICHER
        CREATE OR REPLACE TEMP TABLE temp_trips_processed AS
        WITH combined_raw AS (
            -- (A) YELLOW: 2023 (voll) + Juni (2010-2025)
            SELECT 
                'YELLOW' as src,
                CASE 
                    WHEN UPPER(CAST(VendorID AS STRING)) IN ('CMT', '1') THEN '1'
                    WHEN UPPER(CAST(VendorID AS STRING)) IN ('VTS', '2', 'VERIFONE', 'CURB') THEN '2'
                    WHEN UPPER(CAST(VendorID AS STRING)) = 'DDS' THEN '3' 
                    WHEN CAST(VendorID AS STRING) = '6' THEN '6'
                    WHEN CAST(VendorID AS STRING) = '7' THEN '7'
                    ELSE '99' 
                END as vid,
                COALESCE(SAFE_CAST(tpep_pickup_datetime AS TIMESTAMP), SAFE.PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%S', CAST(tpep_pickup_datetime AS STRING))) as t_pick,
                COALESCE(SAFE_CAST(tpep_dropoff_datetime AS TIMESTAMP), SAFE.PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%S', CAST(tpep_dropoff_datetime AS STRING))) as t_drop,
                COALESCE(CAST(PULocationID AS INT64), (SELECT ANY_VALUE(location_id) FROM `{PROJECT_ID}.{SOURCE_DATASET}.taxi_zones_geo` WHERE ST_WITHIN(SAFE.ST_GEOGPOINT(pickup_longitude, pickup_latitude), zone_geom)), 263) as loc_pu,
                COALESCE(CAST(DOLocationID AS INT64), (SELECT ANY_VALUE(location_id) FROM `{PROJECT_ID}.{SOURCE_DATASET}.taxi_zones_geo` WHERE ST_WITHIN(SAFE.ST_GEOGPOINT(dropoff_longitude, dropoff_latitude), zone_geom)), 263) as loc_do,
                IFNULL(CAST(passenger_count AS INT64), 1) as pax, 
                CAST(trip_distance AS FLOAT64) as dist, 
                IFNULL(CAST(store_and_fwd_flag AS STRING), 'N') as flag,
                COALESCE(SAFE_CAST(RatecodeID AS INT64), 99) as rate,
                1 as t_type, 
                CAST(NULL AS BOOL) as sr_flag,
                GREATEST(IFNULL(CAST(fare_amount AS FLOAT64), 0), 0) as f_amt,
                GREATEST(IFNULL(CAST(tip_amount AS FLOAT64), 0), 0) as t_amt,
                GREATEST(IFNULL(CAST(total_amount AS FLOAT64), 0), 0) as tot_amt,
                CASE 
                    WHEN LOWER(CAST(payment_type AS STRING)) IN ('cre', 'credit', '1') THEN 1 
                    WHEN LOWER(CAST(payment_type AS STRING)) IN ('cas', 'cash', '2') THEN 2 
                    WHEN LOWER(CAST(payment_type AS STRING)) IN ('no', '3') THEN 3
                    ELSE 5 
                END as pay,
                GREATEST(IFNULL(CAST(extra AS FLOAT64), 0), 0) as ex,
                GREATEST(IFNULL(CAST(mta_tax AS FLOAT64), 0), 0) as mt, 
                GREATEST(IFNULL(CAST(tolls_amount AS FLOAT64), 0), 0) as tl,
                GREATEST(IFNULL(CAST(improvement_surcharge AS FLOAT64), 0), 0) as im, 
                GREATEST(IFNULL(CAST(congestion_surcharge AS FLOAT64), 0), 0) as co,
                GREATEST(IFNULL(CAST(Airport_fee AS FLOAT64), 0), 0) as ai, 
                CAST(NULL AS FLOAT64) as eh, 
                CAST(NULL AS STRING) as aff, 
                CAST(NULL AS STRING) as disp
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.yellow_staging_unified`
            WHERE (EXTRACT(YEAR FROM tpep_pickup_datetime) = 2023) 
               OR (EXTRACT(MONTH FROM tpep_pickup_datetime) = 6)

            UNION ALL

           -- (B) GREEN: Mit Rescue-Logik f√ºr 2018 (Hier ist dein angepasster Block!)
            SELECT 
                'GREEN' as src, 
                -- VENDOR MAPPING (Identisch zu Yellow f√ºr volle Konsistenz)
                CASE 
                    WHEN UPPER(CAST(VendorID AS STRING)) IN ('CMT', '1') THEN '1'
                    WHEN UPPER(CAST(VendorID AS STRING)) IN ('VTS', '2', 'VERIFONE', 'CURB') THEN '2'
                    ELSE CAST(VendorID AS STRING) 
                END as vid,
                CAST(lpep_pickup_datetime AS TIMESTAMP) as t_pick, 
                CAST(lpep_dropoff_datetime AS TIMESTAMP) as t_drop,
                CAST(IFNULL(PULocationID, 263) AS INT64) as loc_pu, 
                CAST(IFNULL(DOLocationID, 263) AS INT64) as loc_do,
                CAST(passenger_count AS INT64) as pax, 
                CAST(trip_distance AS FLOAT64) as dist, 
                store_and_fwd_flag as flag,
                CAST(RatecodeID AS INT64) as rate, 
                CAST(trip_type AS INT64) as t_type, 
                -- SR_FLAG FIX: Green ist kein App-Pooler, also FALSE statt NULL
                FALSE as sr_flag,
                fare_amount as f_amt, 
                tip_amount as t_amt, 
                total_amount as tot_amt, 
                -- DEINE RESCUE LOGIK (Bleibt nat√ºrlich drin!)
                CASE 
                    WHEN payment_type IS NOT NULL THEN CAST(ROUND(SAFE_CAST(payment_type AS FLOAT64)) AS INT64)
                    WHEN payment_type IS NULL AND fare_amount > 0 THEN 5 
                    ELSE 0 
                END AS pay,
                extra as ex, 
                mta_tax as mt, 
                tolls_amount as tl, 
                improvement_surcharge as im, 
                congestion_surcharge as co, 
                0.0 as ai, 
                ehail_fee as eh, 
                CAST(NULL AS STRING) as aff, 
                CAST(NULL AS STRING) as disp
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.green_staging_unified`
            WHERE EXTRACT(YEAR FROM lpep_pickup_datetime) >= 2015

            UNION ALL

            -- (C) FHV
            SELECT 
                'FHV', 
                CAST(dispatching_base_num AS STRING) as vid, 
                CAST(pickup_datetime AS TIMESTAMP), CAST(dropOff_datetime AS TIMESTAMP),
                CAST(IFNULL(PULocationID, 263) AS INT64), CAST(IFNULL(DOLocationID, 263) AS INT64),
                NULL, NULL, 'N', 99, 2, 
                CASE 
                    WHEN CAST(SR_Flag AS STRING) = '1' THEN TRUE 
                    ELSE FALSE 
                END as sr_flag,
                NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, Affiliated_base_number, dispatching_base_num
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.fhv_staging_unified`
            WHERE EXTRACT(YEAR FROM pickup_datetime) >= 2015
        )
        SELECT 
            *,
            CASE 
                WHEN t_pick IS NULL OR t_drop IS NULL THEN 'Incorrect: Missing Timestamps'
                WHEN t_pick >= t_drop THEN 'Incorrect: Invalid Duration'
                WHEN t_pick > CURRENT_TIMESTAMP() THEN 'Incorrect: Future Date'
                WHEN src IN ('YELLOW', 'GREEN') AND (tot_amt <= 0 OR f_amt <= 0) THEN 'Incorrect: Financials'
                WHEN pax < 1 OR pax > 6 THEN 'Incorrect: Invalid Pax Count (Rule 1.4)'
                WHEN dist < 0 OR dist >= 1000 THEN 'Incorrect: Invalid Distance (Rule 1.5)'
                ELSE 'VALID'
            END as row_status,
            CASE WHEN dist > 500 OR (pay = 2 AND t_amt = 0) THEN TRUE ELSE FALSE END as dq_issue_flag
        FROM combined_raw
        QUALIFY ROW_NUMBER() OVER (
                PARTITION BY 
                    src, 
                    t_pick, 
                    t_drop, 
                    loc_pu, 
                    loc_do, 
                    vid 
                ORDER BY t_pick
            ) = 1;

        -- 2. INSERT VALID DATA IN CANONICAL (Reihenfolge gem√§√ü Zelle 2)
        INSERT INTO `{table_ref}` (
            trip_id, source_system, load_date, vendor_id, 
            Affiliated_base_number, dispatching_base_nummer, 
            pickup_datetime, dropoff_datetime, pickup_location_id, dropoff_location_id, 
            passenger_count, trip_distance, store_and_fwd_flag, 
            RatecodeID, Trip_type, SR_Flag, fare_amount, tip_amount, total_amount, 
            payment_type, extra, mta_tax, tolls_amount, improvement_surcharge, 
            congestion_surcharge, Airport_fee, ehail_fee, dq_issue_flag
        )
        SELECT 
            CAST(FARM_FINGERPRINT(CONCAT(src, CAST(t_pick AS STRING), IFNULL(vid,''))) AS STRING),
            src, CURRENT_TIMESTAMP(), vid, aff, disp, t_pick, t_drop, loc_pu, loc_do, 
            pax, dist, flag, rate, t_type, sr_flag, f_amt, t_amt, tot_amt, 
            pay, ex, mt, tl, im, co, ai, eh, dq_issue_flag
        FROM temp_trips_processed WHERE row_status = 'VALID';

        -- 3. INSERT ERROR DATA (Mit rejection_reason)
        INSERT INTO `{error_table_ref}` (
            trip_id, source_system, load_date, vendor_id, 
            Affiliated_base_number, dispatching_base_nummer, 
            pickup_datetime, dropoff_datetime, pickup_location_id, dropoff_location_id, 
            passenger_count, trip_distance, store_and_fwd_flag, 
            RatecodeID, Trip_type, SR_Flag, fare_amount, tip_amount, total_amount, 
            payment_type, extra, mta_tax, tolls_amount, improvement_surcharge, 
            congestion_surcharge, Airport_fee, ehail_fee, dq_issue_flag, rejection_reason
        )
        SELECT 
            CAST(FARM_FINGERPRINT(CONCAT(src, CAST(t_pick AS STRING), IFNULL(vid,''))) AS STRING),
            src, CURRENT_TIMESTAMP(), vid, aff, disp, t_pick, t_drop, loc_pu, loc_do, 
            pax, dist, flag, rate, t_type, sr_flag, f_amt, t_amt, tot_amt, 
            pay, ex, mt, tl, im, co, ai, eh, TRUE, row_status
        FROM temp_trips_processed WHERE row_status != 'VALID';
    END;
    """
    
    try:
        client.query(query).result()
        print("‚úÖ ETL Job erfolgreich abgeschlossen.")
        print("   - Syntax-Fehler (Semikolon) behoben.")
        print("   - Spalten-Alignment f√ºr INSERT sichergestellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

run_etl_split_logic()

üöÄ Starte finale ETL Pipeline f√ºr den Canonical Layer...
‚úÖ ETL Job erfolgreich abgeschlossen.
   - Syntax-Fehler (Semikolon) behoben.
   - Spalten-Alignment f√ºr INSERT sichergestellt.


In [94]:
# Zelle 4: Quality Check
print("--- CLEAN DATA SAMPLE ---")
print(client.query(f"SELECT EXTRACT(MONTH FROM pickup_datetime) as monat, COUNT(*) as anzahl, COUNTIF(pickup_location_id IS NOT NULL AND pickup_location_id != 263) as erfolgreich_gemappt, COUNTIF(pickup_location_id = 263) as unbekannte_zone FROM `{table_ref}` WHERE EXTRACT(YEAR FROM pickup_datetime) = 2010 GROUP BY 1;").to_dataframe().T)

print("\n--- ERROR DATA SAMPLE  ---")
try:
    err_df = client.query(f"SELECT rejection_reason, count(*) as cnt FROM `{error_table_ref}` GROUP BY 1").to_dataframe()
    print(err_df)
except:
    print("Keine Fehler gefunden (Tabelle leer).")

--- CLEAN DATA SAMPLE ---




                           0
monat                      6
anzahl               2812445
erfolgreich_gemappt  2693809
unbekannte_zone       118636

--- ERROR DATA SAMPLE  ---
                          rejection_reason        cnt
0                    Incorrect: Financials     588188
1   Incorrect: Invalid Distance (Rule 1.5)       9646
2  Incorrect: Invalid Pax Count (Rule 1.4)     879413
3                   Incorrect: Future Date          8
4              Incorrect: Invalid Duration  201180289




In [99]:
def run_data_audit():
    print("üìä Starte Daten-Audit (Staging vs. Canonical vs. Error)...")
    
    # Query f√ºr die Rohdaten (Staging) mit deinen Filtern
    query_staging = f"""
    SELECT 'YELLOW' as src, COUNT(*) as anzahl FROM `{PROJECT_ID}.{SOURCE_DATASET}.yellow_staging_unified` 
    WHERE (EXTRACT(YEAR FROM tpep_pickup_datetime) = 2023) OR (EXTRACT(MONTH FROM tpep_pickup_datetime) = 6)
    UNION ALL
    SELECT 'GREEN', COUNT(*) FROM `{PROJECT_ID}.{SOURCE_DATASET}.green_staging_unified` WHERE EXTRACT(YEAR FROM lpep_pickup_datetime) >= 2015
    UNION ALL
    SELECT 'FHV', COUNT(*) FROM `{PROJECT_ID}.{SOURCE_DATASET}.fhv_staging_unified` WHERE EXTRACT(YEAR FROM pickup_datetime) >= 2015
    """

    # Query f√ºr das Ziel (Canonical + Error)
    query_target = f"""
    SELECT source_system as src, 'VALID' as cat, COUNT(*) as anzahl FROM `{table_ref}` GROUP BY 1
    UNION ALL
    SELECT source_system, 'ERROR', COUNT(*) FROM `{error_table_ref}` GROUP BY 1
    """

    try:
        df_staging = client.query(query_staging).to_dataframe()
        df_target = client.query(query_target).to_dataframe()

        print("\n--- 1. ROHDATEN IN STAGING (gefiltert) ---")
        print(df_staging)
        total_raw = df_staging['anzahl'].sum()
        print(f"Gesamt Rohdaten: {total_raw:,}")

        print("\n--- 2. VERTEILUNG IM CANONICAL LAYER ---")
        print(df_target)
        total_target = df_target['anzahl'].sum()
        print(f"Gesamt (Valid + Error): {total_target:,}")

        diff = total_raw - total_target
        print("\n--- ANALYSE ---")
        print(f"Differenz: {diff:,} Zeilen")
        if diff > 0:
            print(f"‚ÑπÔ∏è Hinweis: Diese {diff:,} Zeilen wurden als DUPLIKATE entfernt (Qualify-Regel).")
        elif diff == 0:
            print("‚úÖ Perfekt: Jede Zeile wurde entweder als VALID oder ERROR verarbeitet.")
            
    except Exception as e:
        print(f"‚ùå Fehler beim Audit: {e}")

run_data_audit()

üìä Starte Daten-Audit (Staging vs. Canonical vs. Error)...





--- 1. ROHDATEN IN STAGING (gefiltert) ---
      src     anzahl
0     FHV  783688849
1   GREEN   68044817
2  YELLOW  178437711
Gesamt Rohdaten: 1,030,171,377

--- 2. VERTEILUNG IM CANONICAL LAYER ---
      src    cat     anzahl
0  YELLOW  ERROR    2131071
1   GREEN  ERROR     414561
2   GREEN  VALID   67447832
3     FHV  VALID  514630379
4     FHV  ERROR  240175710
5  YELLOW  VALID  172066569
Gesamt (Valid + Error): 996,866,122

--- ANALYSE ---
Differenz: 33,305,255 Zeilen
‚ÑπÔ∏è Hinweis: Diese 33,305,255 Zeilen wurden als DUPLIKATE entfernt (Qualify-Regel).


In [89]:
# Zelle 5: Mapping-Check f√ºr Ratecode und Payment Type (Fokus 2010)
def check_rate_and_payment():
    print("üîç Analysiere Ratecode und Payment Type Mapping f√ºr 2010...")

    # Query f√ºr Ratecode Verteilung
    query_ratecode = f"""
    SELECT 
        RatecodeID, 
        COUNT(*) as anzahl
    FROM `{table_ref}`
    WHERE EXTRACT(YEAR FROM pickup_datetime) = 2010
    GROUP BY 1
    ORDER BY anzahl DESC
    """

    # Query f√ºr Payment Type Verteilung
    query_payment = f"""
    SELECT 
        payment_type, 
        COUNT(*) as anzahl
    FROM `{table_ref}`
    WHERE EXTRACT(YEAR FROM pickup_datetime) = 2010
    GROUP BY 1
    ORDER BY anzahl DESC
    """

    try:
        print("\n--- 1. Verteilung RatecodeID (Ziel: 1=Standard, 2=JFK, etc.) ---")
        df_rate = client.query(query_ratecode).to_dataframe()
        # Mapping Label f√ºr die Anzeige
        rate_map = {1: "Standard", 2: "JFK", 3: "Newark", 4: "Nassau/Westch.", 5: "Neg.", 6: "Group", 99: "Unknown"}
        df_rate['Bezeichnung'] = df_rate['RatecodeID'].map(rate_map)
        print(df_rate)

        print("\n--- 2. Verteilung Payment Type (Ziel: 1=Card, 2=Cash, etc.) ---")
        df_pay = client.query(query_payment).to_dataframe()
        # Mapping Label f√ºr die Anzeige
        pay_map = {1: "Card", 2: "Cash", 3: "No Charge", 4: "Dispute", 5: "Unknown", 6: "Void"}
        df_pay['Bezeichnung'] = df_pay['payment_type'].map(pay_map)
        print(df_pay)

        # Erfolgskontrolle
        if not df_rate.empty and all(isinstance(x, (int, float)) for x in df_rate['RatecodeID'].dropna()):
            print("\n‚úÖ SUCCESS: RatecodeID wurde erfolgreich in numerische Werte transformiert.")
        if not df_pay.empty and all(isinstance(x, (int, float)) for x in df_pay['payment_type'].dropna()):
            print("‚úÖ SUCCESS: payment_type wurde erfolgreich in numerische Werte transformiert.")

    except Exception as e:
        print(f"‚ùå Fehler bei der Analyse: {e}")

check_rate_and_payment()

üîç Analysiere Ratecode und Payment Type Mapping f√ºr 2010...

--- 1. Verteilung RatecodeID (Ziel: 1=Standard, 2=JFK, etc.) ---




   RatecodeID   anzahl     Bezeichnung
0           1  5473049        Standard
1           2    96217             JFK
2           0    21649             NaN
3           4    13517  Nassau/Westch.
4           5    10195            Neg.
5           3     8548          Newark
6           6      139           Group
7         210       10             NaN
8          65        2             NaN
9           7        1             NaN

--- 2. Verteilung Payment Type (Ziel: 1=Card, 2=Cash, etc.) ---
   payment_type   anzahl Bezeichnung
0             2  3613908        Cash
1             1  1982293        Card
2             5    27126     Unknown




In [48]:
from google.cloud import bigquery

client = bigquery.Client()

# Wir pr√ºfen die Payment Types in den alten 2010er Daten und den neuen 2023er Daten
query = """
SELECT 
    '2010 (Schema 5)' as jahr,
    CAST(payment_type AS STRING) as raw_value, 
    COUNT(*) as anzahl
FROM `taxi-bi-project.staging.yellow_schema_5`
GROUP BY 1, 2

UNION ALL

SELECT 
    '2023 (Modern)' as jahr,
    CAST(payment_type AS STRING) as raw_value, 
    COUNT(*) as anzahl
FROM `taxi-bi-project.staging.yellow_staging_unified`
WHERE EXTRACT(YEAR FROM tpep_pickup_datetime) = 2023
GROUP BY 1, 2
ORDER BY jahr, anzahl DESC
"""

df_payments = client.query(query).to_dataframe()
print("--- Analyse der Payment Types (Rohdaten) ---")
print(df_payments)

--- Analyse der Payment Types (Rohdaten) ---
               jahr raw_value    anzahl
0   2010 (Schema 5)       Cre   5168999
1   2010 (Schema 5)       CAS   5164878
2   2010 (Schema 5)       Cas   4227711
3   2010 (Schema 5)       CRE    225832
4   2010 (Schema 5)       No      31091
5   2010 (Schema 5)       Dis      6617
6     2023 (Modern)         1  32449126
7     2023 (Modern)         2   6957625
8     2023 (Modern)         0   1409243
9     2023 (Modern)         4    539200
10    2023 (Modern)         3    262154
11    2023 (Modern)         5         3




In [102]:
query_date_check = """
SELECT 
    pickup_datetime as raw_string,
    COUNT(*) as anzahl
FROM `taxi-bi-project.staging.yellow_schema_5`
GROUP BY 1
LIMIT 5
"""
print(client.query(query_date_check).to_dataframe())

            raw_string  anzahl
0  2010-06-21 10:46:30       2
1  2010-06-11 03:03:14       1
2  2010-06-19 18:07:47       4
3  2010-06-30 06:30:51       3
4  2010-06-28 00:04:25       2




In [103]:
# Zelle 5: Data Quality Check (Stichproben pro System)
import pandas as pd

def check_data_samples():
    print(f"--- üîç DATA INSPECTION (Tabelle: {table_ref}) ---\n")
    
    # 1. YELLOW CHECK ("Cellos")
    # Fokus: Haben sie Ratecode? Sind Finanzen da?
    print("üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):")
    sql_yellow = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'YELLOW'
    LIMIT 15
    """
    try:
        df_y = client.query(sql_yellow).to_dataframe()
        display(df_y) # Oder print(df_y) falls kein Jupyter
    except Exception as e:
        print(e)

    # 2. GREEN CHECK
    # Fokus: Haben sie Trip_type und Ehail_fee? (Spezifisch f√ºr Green)
    print("\nüíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):")
    sql_green = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'GREEN'
    LIMIT 5
    """
    try:
        df_g = client.query(sql_green).to_dataframe()
        display(df_g)
    except Exception as e:
        print(e)

    # 3. FHV CHECK
    # Fokus: Sind die neuen Base-Nummern da? Sind Preise WIRKLICH NULL?
    print("\nüñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):")
    sql_fhv = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'FHV'
    LIMIT 5
    """
    try:
        df_f = client.query(sql_fhv).to_dataframe()
        display(df_f)
    except Exception as e:
        print(e)

    # 4. STATISTIK
    print("\nüìä VERTEILUNG NACH SYSTEM:")
    sql_stats = f"""
    SELECT source_system, COUNT(*) as count 
    FROM `{table_ref}` 
    GROUP BY source_system
    """
    try:
        print(client.query(sql_stats).to_dataframe())
    except:
        pass

check_data_samples()

--- üîç DATA INSPECTION (Tabelle: taxi-bi-project.canonical.canonical_unified_taxi) ---

üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,...,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,1531786308848340242,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-18 16:07:19+00:00,2010-06-18 16:07:48+00:00,229,229,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
1,3607530224108402529,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-06 08:36:37+00:00,2010-06-06 08:41:16+00:00,129,129,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
2,1695272938504091367,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-08 15:24:19+00:00,2010-06-08 16:44:03+00:00,263,263,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
3,-4841951142003044862,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-19 07:51:49+00:00,2010-06-19 08:03:45+00:00,50,50,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
4,7494652164461089866,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-20 09:32:19+00:00,2010-06-20 09:33:19+00:00,39,39,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
5,2238987991992236317,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-22 14:28:31+00:00,2010-06-22 14:29:11+00:00,88,87,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
6,4170712180169412968,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-08 09:50:24+00:00,2010-06-08 09:50:39+00:00,138,138,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
7,4557762523158386914,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-19 11:48:56+00:00,2010-06-19 11:49:48+00:00,186,186,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
8,6618239509873616713,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-23 11:30:28+00:00,2010-06-23 14:41:29+00:00,28,216,...,2.5,2,0.0,0.0,0.0,0.0,0.0,0.0,,True
9,352918484745089667,YELLOW,2026-01-08 12:42:12.836802+00:00,3,,,2010-06-20 11:21:42+00:00,2010-06-20 11:23:16+00:00,237,237,...,2.9,2,0.0,0.0,0.0,0.0,0.0,0.0,,True



üíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,...,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,8934663925888110030,GREEN,2026-01-08 12:42:12.836802+00:00,1,,,2017-09-24 00:32:37+00:00,2017-09-24 00:33:12+00:00,255,255,...,0.01,2,0.0,0.0,0.0,0.0,,0.0,,True
1,4458259119911185826,GREEN,2026-01-08 12:42:12.836802+00:00,1,,,2017-09-23 05:47:17+00:00,2017-09-23 05:48:01+00:00,255,255,...,0.01,3,0.0,0.0,0.0,0.0,,0.0,,False
2,2082099856174231571,GREEN,2026-01-08 12:42:12.836802+00:00,1,,,2017-09-21 01:28:05+00:00,2017-09-21 01:28:31+00:00,235,235,...,0.01,2,0.0,0.0,0.0,0.0,,0.0,,True
3,6159180755314097705,GREEN,2026-01-08 12:42:12.836802+00:00,1,,,2017-09-13 05:19:08+00:00,2017-09-13 05:22:16+00:00,243,243,...,0.01,2,0.0,0.0,0.0,0.0,,0.0,,True
4,-2078796549061550083,GREEN,2026-01-08 12:42:12.836802+00:00,1,,,2017-09-20 11:18:38+00:00,2017-09-20 11:19:23+00:00,36,36,...,0.31,3,0.0,0.0,0.0,0.3,,0.0,,False



üñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,...,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,-6389995901186689913,FHV,2026-01-08 12:42:12.836802+00:00,B02127,B02127,B02127,2015-09-05 18:30:00+00:00,2018-09-05 19:30:00+00:00,263,263,...,,0,,,,,,,,False
1,8505448901266430594,FHV,2026-01-08 12:42:12.836802+00:00,B02653,B02653,B02653,2021-10-29 05:00:12+00:00,2021-10-29 05:39:17+00:00,263,1,...,,0,,,,,,,,False
2,-5090010053490920428,FHV,2026-01-08 12:42:12.836802+00:00,B02657,B02657,B02657,2021-10-04 12:51:44+00:00,2021-10-04 12:51:48+00:00,263,1,...,,0,,,,,,,,False
3,2532296753196713593,FHV,2026-01-08 12:42:12.836802+00:00,B02661,B02661,B02661,2021-10-25 05:30:00+00:00,2021-10-25 06:13:00+00:00,84,1,...,,0,,,,,,,,False
4,6238535805041828204,FHV,2026-01-08 12:42:12.836802+00:00,B02661,B02661,B02661,2021-10-23 05:30:00+00:00,2021-10-23 06:07:00+00:00,206,1,...,,0,,,,,,,,False



üìä VERTEILUNG NACH SYSTEM:




  source_system      count
0        YELLOW  172066569
1           FHV  514630379
2         GREEN   67447832


In [34]:
import geopandas as gpd
from google.cloud import bigquery

client = bigquery.Client()

# 1. GeoJSON mit allen Spalten laden
geojson_path = '/Users/lania/Documents/GitHub/bi_project_task2/notebook_canonical/NYC_Taxi_Zones.geojson'
gdf_zones = gpd.read_file(geojson_path)

# 2. Alle wichtigen Spalten vorbereiten
# Wir brauchen: location_id, zone, borough und die Geometrie als WKT
gdf_zones['WKT_GEOMETRY'] = gdf_zones['geometry'].apply(lambda x: x.wkt)
df_to_upload = gdf_zones[['location_id', 'zone', 'borough', 'WKT_GEOMETRY']].copy()

# 3. Hochladen nach BigQuery (Temp-Tabelle)
df_to_upload.to_gbq('staging.taxi_zones_temp', project_id='taxi-bi-project', if_exists='replace')

# 4. Die finale Geo-Tabelle mit ALLEN Spalten erstellen
fix_query = """
CREATE OR REPLACE TABLE `taxi-bi-project.staging.taxi_zones_geo` AS
SELECT 
    CAST(location_id AS INT64) AS location_id,
    zone,
    borough,
    ST_GEOGFROMTEXT(WKT_GEOMETRY) AS zone_geom
FROM `taxi-bi-project.staging.taxi_zones_temp`;
"""
client.query(fix_query, location="EU").result()

print("‚úÖ Tabelle taxi_zones_geo wurde mit 'borough' und 'zone' neu erstellt!")

  df_to_upload.to_gbq('staging.taxi_zones_temp', project_id='taxi-bi-project', if_exists='replace')
263 out of 263 rows loaded.


‚úÖ Tabelle taxi_zones_geo wurde mit 'borough' und 'zone' neu erstellt!


In [None]:
# Bezirke in NYV
from google.cloud import bigquery
import pandas as pd

client = bigquery.Client()

query = """
SELECT 
    z.borough AS pickup_borough,
    COUNT(*) AS total_trips,
    ROUND(AVG(fare_amount), 2) AS avg_fare_usd,
    ROUND(SUM(total_amount) / 1000000, 2) AS total_revenue_million_usd
FROM 
    `taxi-bi-project.canonical.canonical_unified_taxi` AS t
LEFT JOIN 
    `taxi-bi-project.staging.taxi_zones_geo` AS z 
    ON t.pickup_location_id = z.location_id
WHERE 
    EXTRACT(YEAR FROM t.pickup_datetime) = 2010
GROUP BY 
    1
ORDER BY 
    total_trips DESC
"""

try:
    df_2010 = client.query(query, location="EU").to_dataframe()
    print("\n--- Top Stadtbezirke im Juni 2010 ---")
    print(df_2010.to_string(index=False))
except Exception as e:
    print(f"‚ùå Fehler: {e}")


--- Top Stadtbezirke im Juni 2010 ---
pickup_borough  total_trips  avg_fare_usd  total_revenue_million_usd
     Manhattan     13676019          9.30                     149.85
        Queens       723918         24.90                      21.41
      Brooklyn       271449         12.14                       3.81
         Bronx        15879         11.28                       0.21
           EWR         1309         61.50                       0.10
 Staten Island          910         12.68                       0.01




In [None]:
#Check Vendors in Yellow 
from google.cloud import bigquery

client = bigquery.Client()

# Wir fragen direkt die Tabelle yellow_schema_5 ab
# Ich nutze hier 'vendor_id', da dies der Standardname in Schema 5 ist.
# Falls es wirklich 'vendor_id' geschrieben wird, passen wir es an.
query = """
SELECT 
    vendor_id, 
    COUNT(*) as anzahl
FROM `taxi-bi-project.staging.yellow_schema_5`
GROUP BY 1
ORDER BY anzahl DESC
"""

print("Abfrage der Original-VendorIDs aus yellow_schema_5...")

try:
    df_vendor_5 = client.query(query, location="EU").to_dataframe()
    print("\n--- Gefundene Anbieter in Schema 5 (2010) ---")
    print(df_vendor_5)
except Exception as e:
    print(f"‚ùå Fehler: {e}")
    print("\nHinweis: Falls die Spalte anders geschrieben wird (z.B. vendor_name), versuche die Spaltenliste anzuzeigen.")

Abfrage der Original-VendorIDs aus yellow_schema_5...





--- Gefundene Anbieter in Schema 5 (2010) ---
  vendor_id   anzahl
0       VTS  7471719
1       CMT  6670024
2       DDS   683385


In [100]:
# Debug Query: Woher kommen die 402 Millionen Fehler?
query_debug = f"""
SELECT 
    source_system, 
    EXTRACT(YEAR FROM pickup_datetime) as year,
    rejection_reason, 
    COUNT(*) as cnt
FROM `{error_table_ref}`
GROUP BY 1, 2, 3
ORDER BY cnt DESC
LIMIT 20
"""
df_debug = client.query(query_debug).to_dataframe()
print(df_debug)



   source_system  year                         rejection_reason        cnt
0            FHV  2016              Incorrect: Invalid Duration  117677227
1            FHV  2017              Incorrect: Invalid Duration   67187994
2            FHV  2015              Incorrect: Invalid Duration   55302393
3         YELLOW  2023  Incorrect: Invalid Pax Count (Rule 1.4)     581618
4         YELLOW  2025                    Incorrect: Financials     244258
5         YELLOW  2011  Incorrect: Invalid Pax Count (Rule 1.4)     238271
6         YELLOW  2023                    Incorrect: Financials     225506
7         YELLOW  2019  Incorrect: Invalid Pax Count (Rule 1.4)     127818
8          GREEN  2015                    Incorrect: Financials      77079
9         YELLOW  2018  Incorrect: Invalid Pax Count (Rule 1.4)      70559
10        YELLOW  2025              Incorrect: Invalid Duration      68579
11        YELLOW  2022  Incorrect: Invalid Pax Count (Rule 1.4)      68503
12        YELLOW  2021  I