In [1]:

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt daf√ºr, dass Plots im Notebook angezeigt werden
%matplotlib inline

# Lesbarkeit in der Exploration erh√∂hen
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 140)

In [2]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project" # Deine ID aus dem Log
DIM_DATASET = "dimensional"
SOURCE_DATASET = "staging"      
CAN_DATASET = "canonical"   

# Tabellen
TARGET_TABLE = "dimensional"

client = bigquery.Client(project=PROJECT_ID)

try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{DIM_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")
except Exception as e:
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{DIM_DATASET}' nicht finden!")
    raise e

‚úÖ QUELLE GEFUNDEN: Dataset 'dimensional' liegt in Region: 'EU'


In [None]:
# Zelle 4: Dimensional Layer (Komplett mit Smart-Base-Logik)
def create_dimensional_layer_advanced():
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"
    
    print(f"--- Erstelle Advanced Layer 3 in '{DIM_DATASET}' ---")

    # Dataset erstellen
    try:
        client.get_dataset(dim_dataset_id)
    except:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)

    # 1. DIM: Payment & RateCode & TripType (Die einfachen Labels)
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type` AS SELECT * FROM UNNEST([STRUCT(1 as payment_type_id, 'Credit Card' as payment_description), STRUCT(2, 'Cash'), STRUCT(0, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code` AS SELECT * FROM UNNEST([STRUCT(1 as rate_code_id, 'Standard' as rate_description), STRUCT(2, 'JFK'), STRUCT(99, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type` AS SELECT * FROM UNNEST([STRUCT(1 as trip_type_id, 'Street-hail' as trip_description), STRUCT(2, 'Dispatch')])", location=CORRECT_LOCATION).result()

    # 2. DIM: VENDOR 
    print("Erstelle dim_vendor mit Fallback-Logik f√ºr alte Basen...")
    
    sql_vendor_smart = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_vendor`
    (vendor_id STRING, vendor_name STRING, source_system STRING, is_active BOOLEAN)
    AS
    WITH all_existing_bases AS (
        -- 1. Wir schauen in die Vergangenheit: Wer ist alles gefahren?
        SELECT DISTINCT dispatching_base_nummer as vid FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
        WHERE dispatching_base_nummer IS NOT NULL
    ),
    active_bases_list AS (
        -- 2. Deine hochgeladene Datei (Nur die Aktiven)
        -- PASS AUF DIE SPALTEN-NAMEN AUF! (Hier: license_number, name)
        SELECT base_num, base_name FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_fhv_bases`
    )
    
    SELECT 
        hist.vid as vendor_id,
        -- Hier ist der "Smart Logic" Teil:
        COALESCE(act.base_name, CONCAT('Inactive/Unknown Base ', hist.vid)) as vendor_name,
        'FHV' as source_system,
        IF(act.base_name IS NOT NULL, TRUE, FALSE) as is_active
    FROM all_existing_bases hist
    LEFT JOIN active_bases_list act ON hist.vid = act.base_num
    
    UNION ALL
    
    -- Nicht vergessen: Die Yellow/Green Taxis
    SELECT '1', 'Creative Mobile Technologies (CMT)', 'YELLOW/GREEN', TRUE
    UNION ALL
    SELECT '2', 'VeriFone Inc. (Curb)', 'YELLOW/GREEN', TRUE
    """
    
    try:
        client.query(sql_vendor_smart, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_vendor erstellt (Alte Basen wurden erhalten).")
    except Exception as e:
        print(f"‚ö†Ô∏è Fehler bei dim_vendor (Hast du die Datei hochgeladen?): {e}")

    # 3. DIM: LOCATION (Optional - wenn du die Datei hast)
    # Wenn du 'staging_taxi_zones' hochgeladen hast, nimm diesen Block:
    """
    sql_location = f'''
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location` AS
    SELECT 
        LocationID, 
        Borough, 
        Zone, 
        service_zone 
    FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_taxi_zones`
    '''
    client.query(sql_location, location=CORRECT_LOCATION).result()
    """

create_dimensional_layer_advanced()

In [25]:
def create_dim_date():
    DIM_DATASET = "dimensional"
    dim_table_id = f"{PROJECT_ID}.{DIM_DATASET}.dim_date"
    
    print(f"--- Erstelle Time Dimension: {dim_table_id} ---")
    
    # Wir generieren Daten von 2015 bis Ende 2026
    sql_date = f"""
    CREATE OR REPLACE TABLE `{dim_table_id}`
    AS
    SELECT
        d AS date_key,  -- Primary Key f√ºr Joins
        EXTRACT(YEAR FROM d) AS year,
        EXTRACT(MONTH FROM d) AS month,
        FORMAT_DATE('%B', d) AS month_name,  -- z.B. January
        EXTRACT(WEEK FROM d) AS week_of_year,
        EXTRACT(DAYOFWEEK FROM d) AS day_of_week_num, -- 1=Sunday in US, checke BI Tool Setting
        FORMAT_DATE('%A', d) AS day_name,    -- z.B. Monday
        EXTRACT(QUARTER FROM d) AS quarter,
        
        -- N√ºtzliche Flags f√ºr BI
        CASE WHEN EXTRACT(DAYOFWEEK FROM d) IN (1, 7) THEN TRUE ELSE FALSE END AS is_weekend,
        
        -- Optional: Simple Saison-Logik
        CASE 
            WHEN EXTRACT(MONTH FROM d) IN (12, 1, 2) THEN 'Winter'
            WHEN EXTRACT(MONTH FROM d) IN (3, 4, 5) THEN 'Spring'
            WHEN EXTRACT(MONTH FROM d) IN (6, 7, 8) THEN 'Summer'
            ELSE 'Fall' 
        END AS season
        
    FROM UNNEST(GENERATE_DATE_ARRAY('2015-01-01', '2026-12-31')) AS d
    ORDER BY d
    """
    
    try:
        client.query(sql_date).result()
        print("‚úÖ dim_date erfolgreich erstellt (2015-2026).")
    except Exception as e:
        print(f"‚ùå Fehler bei dim_date: {e}")

# Ausf√ºhren
create_dim_date()

--- Erstelle Time Dimension: taxi-bi-project.dimensional.dim_date ---
‚úÖ dim_date erfolgreich erstellt (2015-2026).


In [4]:
# Zelle 4: Dimensional Layer (Das "W√∂rterbuch" f√ºr dein Dashboard)
def create_dimensional_layer_final():
    DIM_DATASET = "dimensional"
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"
    
    print(f"--- Erstelle Layer 3 (Dimensionen) in '{DIM_DATASET}' ---")

    # 1. Dataset anlegen falls nicht vorhanden
    try:
        client.get_dataset(dim_dataset_id)
    except:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)
        print(f"‚úÖ Dataset '{DIM_DATASET}' erstellt.")

    # --- A) DIM VENDOR (Mapping f√ºr vid) ---
    sql_vendor = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_vendor` AS
    SELECT * FROM UNNEST([
        STRUCT('1' as vendor_id, 'Creative Mobile (CMT)' as vendor_name),
        STRUCT('2', 'VeriFone (VTS/Curb)'),
        STRUCT('3', 'Digital Dispatch (DDS)'),
        STRUCT('6', 'Myle'),
        STRUCT('7', 'Helix'),
        STRUCT('99', 'Unknown / Dispatch Base')
    ]);
    """

    # --- B) DIM PAYMENT TYPE (Inkl. deiner Rettung aus 2018) ---
    sql_payment = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type` AS
    SELECT * FROM UNNEST([
        STRUCT(0 as payment_type_id, 'App-Payment / Flex (FHV)' as payment_description),
        STRUCT(1, 'Credit Card'),
        STRUCT(2, 'Cash'),
        STRUCT(3, 'No Charge'),
        STRUCT(4, 'Dispute'),
        STRUCT(5, 'Unknown / Rescued (2018 Fix)'),
        STRUCT(6, 'Voided Trip')
    ]);
    """

    # --- C) DIM RATE CODE ---
    sql_ratecode = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code` AS
    SELECT * FROM UNNEST([
        STRUCT(1 as rate_code_id, 'Standard Rate' as rate_description),
        STRUCT(2, 'JFK Airport'),
        STRUCT(3, 'Newark Airport'),
        STRUCT(4, 'Nassau/Westchester'),
        STRUCT(5, 'Negotiated Fare'),
        STRUCT(6, 'Group Ride'),
        STRUCT(99, 'Unknown / FHV')
    ]);
    """

    # --- D) DIM TRIP TYPE ---
    sql_triptype = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type` AS
    SELECT * FROM UNNEST([
        STRUCT(1 as trip_type_id, 'Street-hail (Yellow/Green)' as trip_description),
        STRUCT(2, 'Dispatch (FHV/Uber/Lyft)' as trip_description)
    ]);
    """

    # --- E) DIM SHARED RIDE (Wichtig f√ºr deinen Hinweis zu Lyft) ---
    sql_shared = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_shared_ride` AS
    SELECT * FROM UNNEST([
        STRUCT(TRUE as sr_flag, 'Shared Ride' as sr_description, 'Note: Lyft counts requests, others only matches.' as sr_note),
        STRUCT(FALSE, 'Private Ride', 'Standard single passenger request.')
    ]);
    """

    # --- F) DIM LOCATION (Zonen-Mapping) ---
    sql_location = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location` AS
    SELECT 
        CAST(LocationID AS INT64) as location_id,
        Borough as borough,
        Zone as zone,
        service_zone,
        ST_GEOGFROMGEOJSON(geometry_json, make_valid => TRUE) as zone_geom
    FROM `{PROJECT_ID}.dimensional.taxi_zones`
    """

    # --- AUSF√úHREN ---
    try:
        client.query(sql_vendor).result()
        client.query(sql_payment).result()
        client.query(sql_ratecode).result()
        client.query(sql_triptype).result()
        client.query(sql_shared).result()
        client.query(sql_location).result()
        print("‚úÖ Alle Dimensionstabellen (Vendor, Payment, Rate, Type, Shared, Location) wurden erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler beim Erstellen der Dimensionen: {e}")

create_dimensional_layer_final()

--- Erstelle Layer 3 (Dimensionen) in 'dimensional' ---
‚úÖ Alle Dimensionstabellen (Vendor, Payment, Rate, Type, Shared, Location) wurden erstellt.


** Create Dimensional **

In [10]:
def create_fact_layer_optimized_v2():
    DIM_DATASET = "dimensional"
    CAN_DATASET = "canonical"
    
    # Referenzen
    source_table = f"{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi"
    fact_table = f"{PROJECT_ID}.{DIM_DATASET}.Fact_Trips"
    print(f"--- Erstelle/Update Optimized Fact Table: {fact_table} ---")

    sql_drop = f"DROP TABLE IF EXISTS `{fact_table}`"
    
    sql_fact = f"""
    CREATE TABLE `{fact_table}`
    -- Monatliche Partitionierung, um das 4000er Limit zu umgehen
    PARTITION BY DATETIME_TRUNC(pickup_datetime, MONTH)
    CLUSTER BY vendor_id, source_system
    AS
    SELECT
        -- IDs & Links
        t.trip_id,
        t.vendor_id,
        t.dispatching_base_nummer,
        
        -- DATE & TIME
        DATE(t.pickup_datetime) AS pickup_date_key, 
        t.pickup_datetime,
        t.dropoff_datetime,
        
        -- Locations
        COALESCE(t.pickup_location_id, 263) AS pickup_location_id, 
        COALESCE(t.dropoff_location_id, 263) AS dropoff_location_id,
        
        -- Payment & Rate
        IFNULL(t.payment_type, 0) AS payment_type_id,
        IFNULL(t.RatecodeID, 99) AS rate_code_id,
        
        -- Trip Type & Shared Ride
        t.Trip_type AS trip_type_id,
        IFNULL(t.SR_Flag, FALSE) AS sr_flag,
        
        -- System Info
        t.source_system,
        t.store_and_fwd_flag,
        t.dq_issue_flag,
        
        -- Measures
        COALESCE(t.passenger_count, 0) AS passenger_count,
        COALESCE(t.trip_distance, 0) AS trip_distance,
        COALESCE(t.fare_amount, 0) AS fare_amount,
        COALESCE(t.tip_amount, 0) AS tip_amount,
        COALESCE(t.total_amount, 0) AS total_amount,
        COALESCE(t.congestion_surcharge, 0) AS congestion_surcharge,
        COALESCE(t.Airport_fee, 0) AS airport_fee,
        
        -- Berechnete Measures
        TIMESTAMP_DIFF(t.dropoff_datetime, t.pickup_datetime, MINUTE) AS duration_minutes

    FROM `{source_table}` t
    """
    
    try:
        # Erst l√∂schen
        client.query(sql_drop).result()
        print("üóëÔ∏è Alte Tabellen-Struktur gel√∂scht.")
        
        # Dann neu erstellen
        client.query(sql_fact).result()
        print("‚úÖ fact_trips erfolgreich mit monatlicher Partitionierung erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_fact_layer_optimized_v2()

--- Erstelle/Update Optimized Fact Table: taxi-bi-project.dimensional.Fact_Trips ---
üóëÔ∏è Alte Tabellen-Struktur gel√∂scht.
‚úÖ fact_trips erfolgreich mit monatlicher Partitionierung erstellt.


In [11]:
# Zelle 7: QA Check - Sind FHV Locations da?
def check_fhv_locations():
    print("--- üîç QA CHECK: Missing Location IDs by System ---")
    
    # Wir pr√ºfen Fact_Trips, da dies die Basis f√ºr deine Analysen ist
    query = f"""
    SELECT 
        source_system,
        COUNT(*) as total_trips,
        
        -- 1. Check auf ECHTE NULL-Werte
        COUNTIF(pickup_location_id IS NULL) as pu_is_null,
        COUNTIF(dropoff_location_id IS NULL) as do_is_null,
        
        -- 2. Check auf 'Unknown' IDs (264=NV, 265=NA)
        -- Das ist bei FHV oft der Fall!
        COUNTIF(pickup_location_id IN (264, 265)) as pu_is_unknown,
        COUNTIF(dropoff_location_id IN (264, 265)) as do_is_unknown,
        
        -- Prozentualer Anteil (NULL + Unknown)
        ROUND((COUNTIF(pickup_location_id IS NULL OR pickup_location_id IN (264, 265)) / COUNT(*)) * 100, 1) as pct_bad_pickup
        
    FROM `{PROJECT_ID}.dimensional.Fact_Trips`
    GROUP BY source_system
    ORDER BY source_system
    """
    
    try:
        df = client.query(query).to_dataframe()
        print(df.to_string(index=False))
        
        # Kurze Analyse f√ºr dich ausgeben
        print("\n--- ANALYSE ---")
        fhv_row = df[df['source_system'] == 'FHV']
        if not fhv_row.empty:
            missing_pct = fhv_row.iloc[0]['pct_bad_pickup']
            if missing_pct > 50:
                print(f"‚ö†Ô∏è ACHTUNG: {missing_pct}% der FHV-Trips haben keine g√ºltige Location!")
                print("   -> Das ist normal f√ºr √§ltere FHV-Daten, schr√§nkt aber Heatmaps ein.")
            else:
                print(f"‚úÖ FHV Daten sehen gut aus ({missing_pct}% Missing).")
                
    except Exception as e:
        print(f"‚ùå Fehler bei der Abfrage: {e}")

check_fhv_locations()

--- üîç QA CHECK: Missing Location IDs by System ---
source_system  total_trips  pu_is_null  do_is_null  pu_is_unknown  do_is_unknown  pct_bad_pickup
          FHV    514630379           0           0       26673183       55183257             5.2
        GREEN     67447832           0           0         115965         250751             0.2
       YELLOW    172066569           0           0        2366782        2612666             1.4

--- ANALYSE ---
‚úÖ FHV Daten sehen gut aus (5.2% Missing).




In [20]:
# Zelle 8: Quality Audit (Der T√úV f√ºr deine Daten)
def check_data_quality_stats():
    print("üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...")
    
    sql_audit = f"""
    SELECT 
        -- 1. Wie viele Fahrten haben wir insgesamt?
        COUNT(*) as total_rows,
    
        -- 4. NULL CHECK (Darf eigentlich nicht sein, au√üer bei optionalen Feldern)
        COUNTIF(pickup_datetime IS NULL) as null_pickups,
        COUNTIF(fare_amount IS NULL) as null_fares,
        
        -- 5. LOGIK CHECK: Credit Card mit 0$ (Dein Sorgenkind)
        COUNTIF(payment_type = 1 AND total_amount = 0) as credit_card_zero_revenue,
        
        -- 6. GEISTER-FAHRTEN (Keine Distanz)
        COUNTIF(trip_distance = 0) as zero_distance_trips

    FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
    """
    
    try:
        df = client.query(sql_audit).to_dataframe()
        print("\n--- ERGEBNISSE ---")
        print(df.T) # Transponieren f√ºr bessere Lesbarkeit
        
        # Automatische Bewertung
        null_errors = df['null_fares'][0] + df['null_pickups'][0]
        if null_errors == 0:
            print("\n‚úÖ TEST BESTANDEN: Keine technischen NULL-Werte in kritischen Spalten.")
        else:
            print(f"\n‚ùå WARNUNG: {null_errors} kritische NULL-Werte gefunden!")
            
    except Exception as e:
        print(f"‚ùå Fehler beim Audit: {e}")

check_data_quality_stats()

üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...

--- ERGEBNISSE ---
                                  0
total_rows                754144780
null_pickups                      0
null_fares                514630379
credit_card_zero_revenue          0
zero_distance_trips         2429564

‚ùå WARNUNG: 514630379 kritische NULL-Werte gefunden!




In [13]:
# Zelle 9: Der "Panic-Check" - Woher kommen die NULLs?
sql_debug = f"""
SELECT 
    source_system,
    COUNT(*) as total_fahrten,
    COUNTIF(fare_amount IS NULL) as fahrten_ohne_preis,
    ROUND(COUNTIF(fare_amount IS NULL) / COUNT(*) * 100, 2) as prozent_ohne_preis
FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
GROUP BY source_system
"""
print(client.query(sql_debug).to_dataframe())

  source_system  total_fahrten  fahrten_ohne_preis  prozent_ohne_preis
0        YELLOW      172066569                   0                 0.0
1         GREEN       67447832                   0                 0.0
2           FHV      514630379           514630379               100.0




In [18]:
def create_final_gold_view():
    print("üèÜ Erstelle die finale Gold-View f√ºr das BI-Reporting...")
    
    view_sql = f"""
    CREATE OR REPLACE VIEW `{PROJECT_ID}.dimensional.view_taxi_bi_final` AS
    SELECT 
        -- Zeit & IDs
        f.trip_id,
        f.pickup_datetime,
        f.dropoff_datetime,
        f.pickup_date_key,
        
        -- Dimensionen: Wer & Wie?
        v.vendor_name,
        p.payment_description as payment_method,
        r.rate_description as rate_type,
        t.trip_description as trip_category,
        
        -- Dimensionen: Wo?
        loc_pu.borough as pickup_borough,
        loc_pu.zone as pickup_zone,
        loc_do.borough as dropoff_borough,
        loc_do.zone as dropoff_zone,
        
        -- Shared Ride Info
        s.sr_description as ride_type,
        
        -- Kennzahlen (Measures)
        f.passenger_count,
        f.trip_distance,
        f.fare_amount,
        f.tip_amount,
        f.total_amount,
        f.duration_minutes,
        
        -- Qualit√§tssicherung
        f.source_system,
        f.dq_issue_flag
        
    FROM `{PROJECT_ID}.dimensional.Fact_Trips` f
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_vendor` v ON f.vendor_id = v.vendor_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_payment_type` p ON f.payment_type_id = p.payment_type_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_rate_code` r ON f.rate_code_id = r.rate_code_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_trip_type` t ON f.trip_type_id = t.trip_type_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_shared_ride` s ON f.sr_flag = s.sr_flag
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_location` loc_pu ON f.pickup_location_id = loc_pu.location_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_location` loc_do ON f.dropoff_location_id = loc_do.location_id
    """
    
    try:
        client.query(view_sql).result()
        print("‚úÖ Gold-View 'view_taxi_bi_final' wurde erstellt.")
        print("üöÄ Dein Projekt ist nun bereit f√ºr die Visualisierung!")
    except Exception as e:
        print(f"‚ùå Fehler bei der View-Erstellung: {e}")

create_final_gold_view()

üèÜ Erstelle die finale Gold-View f√ºr das BI-Reporting...
‚úÖ Gold-View 'view_taxi_bi_final' wurde erstellt.
üöÄ Dein Projekt ist nun bereit f√ºr die Visualisierung!
