In [None]:

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt daf√ºr, dass Plots im Notebook angezeigt werden
%matplotlib inline

# Lesbarkeit in der Exploration erh√∂hen
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 140)

In [12]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project" # Deine ID aus dem Log
DIM_DATASET = "dimensional"
SOURCE_DATASET = "staging"      
CAN_DATASET = "canonical"   

# Tabellen
TARGET_TABLE = "dimensional"

client = bigquery.Client(project=PROJECT_ID)

try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{DIM_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")
except Exception as e:
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{DIM_DATASET}' nicht finden!")
    raise e

‚úÖ QUELLE GEFUNDEN: Dataset 'dimensional' liegt in Region: 'EU'


In [6]:
# Zelle 4a: GeoJSON Upload (Direkt ins Dimensional Dataset)
import json
import pandas as pd
from google.cloud import bigquery

# CONFIG
GEOJSON_FILE = "NYC_Taxi_Zones.geojson" 
# √ÑNDERUNG: Wir laden es direkt in das 'dimensional' Dataset
STAGING_TABLE = f"{PROJECT_ID}.dimensional.staging_taxi_zones"

def upload_geojson_to_bq():
    print(f"--- Lese GeoJSON '{GEOJSON_FILE}' ---")
    
    try:
        with open(GEOJSON_FILE, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"‚ùå Datei '{GEOJSON_FILE}' nicht gefunden. Bitte hochladen!")
        return

    # Dataset 'dimensional' sicherstellen (falls Zelle 4a vor 4b l√§uft)
    try:
        client.get_dataset(f"{PROJECT_ID}.dimensional")
    except:
        new_ds = bigquery.Dataset(f"{PROJECT_ID}.dimensional")
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)
        print("‚úÖ Dataset 'dimensional' erstellt.")

    rows = []
    for feature in data['features']:
        props = feature['properties']
        geom = feature['geometry']
        
        # Flexibles Mapping
        loc_id = props.get('LocationID') or props.get('objectid') or props.get('OBJECTID')
        
        row = {
            'LocationID': loc_id,
            'Zone': props.get('Zone') or props.get('zone'),
            'Borough': props.get('Borough') or props.get('borough'),
            'service_zone': props.get('service_zone'),
            'geometry_json': json.dumps(geom)
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    
    # FIX: Pandas zwingen, IDs als Zahl zu behandeln
    df['LocationID'] = pd.to_numeric(df['LocationID'], errors='coerce').fillna(0).astype(int)
    
    print(f"Gelesen: {len(df)} Zonen.")

    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        schema=[
            bigquery.SchemaField("LocationID", "INTEGER"),
            bigquery.SchemaField("Zone", "STRING"),
            bigquery.SchemaField("Borough", "STRING"),
            bigquery.SchemaField("service_zone", "STRING"),
            bigquery.SchemaField("geometry_json", "STRING"),
        ]
    )

    try:
        job = client.load_table_from_dataframe(df, STAGING_TABLE, job_config=job_config)
        job.result()
        print(f"‚úÖ Tabelle '{STAGING_TABLE}' erfolgreich hochgeladen.")
    except Exception as e:
        print(f"‚ùå Upload Fehler: {e}")

upload_geojson_to_bq()

--- Lese GeoJSON 'NYC_Taxi_Zones.geojson' ---
Gelesen: 263 Zonen.
‚úÖ Tabelle 'taxi-bi-project.dimensional.staging_taxi_zones' erfolgreich hochgeladen.


In [None]:
# Zelle 4: Dimensional Layer (Komplett mit Smart-Base-Logik)
def create_dimensional_layer_advanced():
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"
    
    print(f"--- Erstelle Advanced Layer 3 in '{DIM_DATASET}' ---")

    # Dataset erstellen
    try:
        client.get_dataset(dim_dataset_id)
    except:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)

    # 1. DIM: Payment & RateCode & TripType (Die einfachen Labels)
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type` AS SELECT * FROM UNNEST([STRUCT(1 as payment_type_id, 'Credit Card' as payment_description), STRUCT(2, 'Cash'), STRUCT(0, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code` AS SELECT * FROM UNNEST([STRUCT(1 as rate_code_id, 'Standard' as rate_description), STRUCT(2, 'JFK'), STRUCT(99, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type` AS SELECT * FROM UNNEST([STRUCT(1 as trip_type_id, 'Street-hail' as trip_description), STRUCT(2, 'Dispatch')])", location=CORRECT_LOCATION).result()

    # 2. DIM: VENDOR 
    print("Erstelle dim_vendor mit Fallback-Logik f√ºr alte Basen...")
    
    sql_vendor_smart = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_vendor`
    (vendor_id STRING, vendor_name STRING, source_system STRING, is_active BOOLEAN)
    AS
    WITH all_existing_bases AS (
        -- 1. Wir schauen in die Vergangenheit: Wer ist alles gefahren?
        SELECT DISTINCT dispatching_base_nummer as vid FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
        WHERE dispatching_base_nummer IS NOT NULL
    ),
    active_bases_list AS (
        -- 2. Deine hochgeladene Datei (Nur die Aktiven)
        -- PASS AUF DIE SPALTEN-NAMEN AUF! (Hier: license_number, name)
        SELECT base_num, base_name FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_fhv_bases`
    )
    
    SELECT 
        hist.vid as vendor_id,
        -- Hier ist der "Smart Logic" Teil:
        COALESCE(act.base_name, CONCAT('Inactive/Unknown Base ', hist.vid)) as vendor_name,
        'FHV' as source_system,
        IF(act.base_name IS NOT NULL, TRUE, FALSE) as is_active
    FROM all_existing_bases hist
    LEFT JOIN active_bases_list act ON hist.vid = act.base_num
    
    UNION ALL
    
    -- Nicht vergessen: Die Yellow/Green Taxis
    SELECT '1', 'Creative Mobile Technologies (CMT)', 'YELLOW/GREEN', TRUE
    UNION ALL
    SELECT '2', 'VeriFone Inc. (Curb)', 'YELLOW/GREEN', TRUE
    """
    
    try:
        client.query(sql_vendor_smart, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_vendor erstellt (Alte Basen wurden erhalten).")
    except Exception as e:
        print(f"‚ö†Ô∏è Fehler bei dim_vendor (Hast du die Datei hochgeladen?): {e}")

    # 3. DIM: LOCATION (Optional - wenn du die Datei hast)
    # Wenn du 'staging_taxi_zones' hochgeladen hast, nimm diesen Block:
    """
    sql_location = f'''
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location` AS
    SELECT 
        LocationID, 
        Borough, 
        Zone, 
        service_zone 
    FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_taxi_zones`
    '''
    client.query(sql_location, location=CORRECT_LOCATION).result()
    """

create_dimensional_layer_advanced()

In [25]:
def create_dim_date():
    DIM_DATASET = "dimensional"
    dim_table_id = f"{PROJECT_ID}.{DIM_DATASET}.dim_date"
    
    print(f"--- Erstelle Time Dimension: {dim_table_id} ---")
    
    # Wir generieren Daten von 2015 bis Ende 2026
    sql_date = f"""
    CREATE OR REPLACE TABLE `{dim_table_id}`
    AS
    SELECT
        d AS date_key,  -- Primary Key f√ºr Joins
        EXTRACT(YEAR FROM d) AS year,
        EXTRACT(MONTH FROM d) AS month,
        FORMAT_DATE('%B', d) AS month_name,  -- z.B. January
        EXTRACT(WEEK FROM d) AS week_of_year,
        EXTRACT(DAYOFWEEK FROM d) AS day_of_week_num, -- 1=Sunday in US, checke BI Tool Setting
        FORMAT_DATE('%A', d) AS day_name,    -- z.B. Monday
        EXTRACT(QUARTER FROM d) AS quarter,
        
        -- N√ºtzliche Flags f√ºr BI
        CASE WHEN EXTRACT(DAYOFWEEK FROM d) IN (1, 7) THEN TRUE ELSE FALSE END AS is_weekend,
        
        -- Optional: Simple Saison-Logik
        CASE 
            WHEN EXTRACT(MONTH FROM d) IN (12, 1, 2) THEN 'Winter'
            WHEN EXTRACT(MONTH FROM d) IN (3, 4, 5) THEN 'Spring'
            WHEN EXTRACT(MONTH FROM d) IN (6, 7, 8) THEN 'Summer'
            ELSE 'Fall' 
        END AS season
        
    FROM UNNEST(GENERATE_DATE_ARRAY('2015-01-01', '2026-12-31')) AS d
    ORDER BY d
    """
    
    try:
        client.query(sql_date).result()
        print("‚úÖ dim_date erfolgreich erstellt (2015-2026).")
    except Exception as e:
        print(f"‚ùå Fehler bei dim_date: {e}")

# Ausf√ºhren
create_dim_date()

--- Erstelle Time Dimension: taxi-bi-project.dimensional.dim_date ---
‚úÖ dim_date erfolgreich erstellt (2015-2026).


In [None]:
# Zelle 4: Dimensional Layer 
def create_dimensional_layer_final():
    DIM_DATASET = "dimensional"
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"
    
    print(f"--- Erstelle Layer 3 (Dimensionen) in '{DIM_DATASET}' ---")

    # 1. Dataset anlegen
    try:
        client.get_dataset(dim_dataset_id)
    except:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)
        print(f"‚úÖ Dataset '{DIM_DATASET}' erstellt.")

    # --- A) STATISCHE W√ñRTERB√úCHER ---
    
    # 1. Payment Type
    sql_payment = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type`
    (payment_type_id INT64, payment_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(0 as payment_type_id, 'Flex / Unknown' as payment_description),
        STRUCT(1, 'Credit Card'),
        STRUCT(2, 'Cash'),
        STRUCT(3, 'No Charge'),
        STRUCT(4, 'Dispute'),
        STRUCT(5, 'Unknown'),
        STRUCT(6, 'Voided Trip')
    ]);
    """

    # 2. Rate Code
    sql_ratecode = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code`
    (rate_code_id INT64, rate_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(1 as rate_code_id, 'Standard Rate' as rate_description),
        STRUCT(2, 'JFK Airport'),
        STRUCT(3, 'Newark Airport'),
        STRUCT(4, 'Nassau/Westchester'),
        STRUCT(5, 'Negotiated Fare'),
        STRUCT(6, 'Group Ride'),
        STRUCT(99, 'Unknown / FHV')
    ]);
    """

    # 3. Trip Type
    sql_triptype = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type`
    (trip_type  _id INT64, trip_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(1 as trip_type_id, 'Street-hail' as trip_description),
        STRUCT(2, 'Dispatch')
    ]);
    """

    # --- C) LOCATION (MAPS) ---
    # Basiert auf Zelle 4a (GeoJSON Upload)
    sql_location = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location`
    (
        LocationID INT64,
        Borough STRING,
        Zone STRING,
        service_zone STRING,
        zone_geom GEOGRAPHY
    )
    AS
    SELECT 
        CAST(LocationID AS INT64) as LocationID,
        Borough,
        Zone,
        service_zone,
        ST_GEOGFROMGEOJSON(geometry_json, make_valid => TRUE) as zone_geom
    FROM `{PROJECT_ID}.dimensional.staging_taxi_zones`
    """

    # --- AUSF√úHREN ---
    try:
        client.query(sql_payment, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_payment_type erstellt.")
        
        client.query(sql_ratecode, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_rate_code erstellt.")
        
        client.query(sql_triptype, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_trip_type erstellt.")
        
        # Location nur erstellen, wenn Upload (Zelle 4a) gemacht wurde
        try:
            client.query(sql_location, location=CORRECT_LOCATION).result()
            print("‚úÖ dim_location erstellt (Maps).")
        except Exception as loc_e:
            print(f"‚ö†Ô∏è Warnung: dim_location konnte nicht erstellt werden (Hast du Zelle 4a ausgef√ºhrt?). Fehler: {loc_e}")
            
    except Exception as e:
        print(f"‚ùå Genereller Fehler: {e}")

create_dimensional_layer_final()

--- Erstelle Layer 3 (Dimensionen) in 'dimensional' ---
‚úÖ dim_payment_type erstellt.
‚úÖ dim_rate_code erstellt.
‚úÖ dim_trip_type erstellt.
‚úÖ dim_vendor erstellt (Nur Static Vendors).
‚úÖ dim_location erstellt (Maps).


** Create Dimensional **

In [26]:
def create_fact_layer_optimized():
    DIM_DATASET = "dimensional"
    CAN_DATASET = "canonical"
    
    # Referenzen
    source_table = f"{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi"
    fact_table = f"{PROJECT_ID}.{DIM_DATASET}.Fact_Trips"
    
    print(f"--- Erstelle Optimized Fact Table: {fact_table} ---")

    sql_fact = f"""
    CREATE OR REPLACE TABLE `{fact_table}`
    -- Partitionierung ist wichtig f√ºr Performance & Kosten!
    PARTITION BY pickup_date_key
    -- Cluster sortiert die Daten physisch f√ºr schnellere Filter
    CLUSTER BY vendor_id, source_system
    AS
    SELECT
        -- IDs
        t.trip_id,
        t.vendor_id,    -- Link zu dim_vendor
        
        -- DATE KEY (Der Link zu dim_date!)
        DATE(t.pickup_datetime) AS pickup_date_key, 
        
        -- Locations (Link zu dim_location)
        COALESCE(t.PULocationID, 264) AS pickup_location_id,   -- 264 = Unknown (NV)
        COALESCE(t.DOLocationID, 264) AS dropoff_location_id,
        
        -- Payment (Link zu dim_payment_type)
        -- FHV hat oft 0, was wir in dim_payment als "Unknown" definiert haben. Passt.
        IFNULL(t.payment_type, 0) AS payment_type_id,
        
        -- Rate Code (Link zu dim_rate_code)
        IFNULL(t.RatecodeID, 99) AS rate_code_id,
        
        -- Trip Type (Link zu dim_trip_type)
        t.Trip_type AS trip_type_id,
        
        -- Degenerate Dimensions (Werte, die direkt in der Faktentabelle bleiben)
        t.source_system,
        t.store_and_fwd_flag,
        t.pickup_datetime, -- Behalten f√ºr genaue Zeitberechnung (Stunden/Minuten)
        t.dropoff_datetime,
        
        -- Measures (Zahlenwerte)
        -- Wir nutzen COALESCE(x, 0), damit Summen in Dashboards funktionieren
        COALESCE(t.passenger_count, 0) AS passenger_count,
        COALESCE(t.trip_distance, 0) AS trip_distance,
        
        -- Finanzen (Wichtig: FHV hat hier oft NULL, wir machen 0 daraus f√ºr Berechnungen)
        COALESCE(t.fare_amount, 0) AS fare_amount,
        COALESCE(t.tip_amount, 0) AS tip_amount,
        COALESCE(t.tolls_amount, 0) AS tolls_amount,
        COALESCE(t.extra, 0) AS extra,
        COALESCE(t.mta_tax, 0) AS mta_tax,
        COALESCE(t.improvement_surcharge, 0) AS improvement_surcharge,
        COALESCE(t.congestion_surcharge, 0) AS congestion_surcharge,
        COALESCE(t.total_amount, 0) AS total_amount,
        
        -- Berechnete Measures
        TIMESTAMP_DIFF(t.dropoff_datetime, t.pickup_datetime, MINUTE) AS duration_minutes

    FROM `{source_table}` t
    """
    
    try:
        # dry_run=False ist Standard, aber gut zu wissen
        job_config = bigquery.QueryJobConfig()
        client.query(sql_fact, job_config=job_config).result()
        print("‚úÖ Fact_Trips erfolgreich erstellt und partitioniert.")
    except Exception as e:
        print(f"‚ùå Fehler bei Fact_Trips: {e}")

create_fact_layer_optimized()

--- Erstelle Optimized Fact Table: taxi-bi-project.dimensional.Fact_Trips ---
‚úÖ Fact_Trips erfolgreich erstellt und partitioniert.


In [16]:
# Zelle 7: QA Check - Sind FHV Locations da?
def check_fhv_locations():
    print("--- üîç QA CHECK: Missing Location IDs by System ---")
    
    # Wir pr√ºfen Fact_Trips, da dies die Basis f√ºr deine Analysen ist
    query = f"""
    SELECT 
        source_system,
        COUNT(*) as total_trips,
        
        -- 1. Check auf ECHTE NULL-Werte
        COUNTIF(pickup_location_id IS NULL) as pu_is_null,
        COUNTIF(dropoff_location_id IS NULL) as do_is_null,
        
        -- 2. Check auf 'Unknown' IDs (264=NV, 265=NA)
        -- Das ist bei FHV oft der Fall!
        COUNTIF(pickup_location_id IN (264, 265)) as pu_is_unknown,
        COUNTIF(dropoff_location_id IN (264, 265)) as do_is_unknown,
        
        -- Prozentualer Anteil (NULL + Unknown)
        ROUND((COUNTIF(pickup_location_id IS NULL OR pickup_location_id IN (264, 265)) / COUNT(*)) * 100, 1) as pct_bad_pickup
        
    FROM `{PROJECT_ID}.dimensional.Fact_Trips`
    GROUP BY source_system
    ORDER BY source_system
    """
    
    try:
        df = client.query(query).to_dataframe()
        print(df.to_string(index=False))
        
        # Kurze Analyse f√ºr dich ausgeben
        print("\n--- ANALYSE ---")
        fhv_row = df[df['source_system'] == 'FHV']
        if not fhv_row.empty:
            missing_pct = fhv_row.iloc[0]['pct_bad_pickup']
            if missing_pct > 50:
                print(f"‚ö†Ô∏è ACHTUNG: {missing_pct}% der FHV-Trips haben keine g√ºltige Location!")
                print("   -> Das ist normal f√ºr √§ltere FHV-Daten, schr√§nkt aber Heatmaps ein.")
            else:
                print(f"‚úÖ FHV Daten sehen gut aus ({missing_pct}% Missing).")
                
    except Exception as e:
        print(f"‚ùå Fehler bei der Abfrage: {e}")

check_fhv_locations()

--- üîç QA CHECK: Missing Location IDs by System ---




source_system  total_trips  pu_is_null  do_is_null  pu_is_unknown  do_is_unknown  pct_bad_pickup
          FHV    435300973   119085087    38701963       25732302       51104279            33.3
        GREEN     55595206           0           0          58820         182338             0.1
       YELLOW     53094835           0           0         516584         612980             1.0

--- ANALYSE ---
‚úÖ FHV Daten sehen gut aus (33.3% Missing).


In [23]:
# Zelle 8: Quality Audit (Der T√úV f√ºr deine Daten)
def check_data_quality_stats():
    print("üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...")
    
    sql_audit = f"""
    SELECT 
        -- 1. Wie viele Fahrten haben wir insgesamt?
        COUNT(*) as total_rows,
    
        -- 4. NULL CHECK (Darf eigentlich nicht sein, au√üer bei optionalen Feldern)
        COUNTIF(pickup_datetime IS NULL) as null_pickups,
        COUNTIF(fare_amount IS NULL) as null_fares,
        
        -- 5. LOGIK CHECK: Credit Card mit 0$ (Dein Sorgenkind)
        COUNTIF(payment_type = 1 AND total_amount = 0) as credit_card_zero_revenue,
        
        -- 6. GEISTER-FAHRTEN (Keine Distanz)
        COUNTIF(trip_distance = 0) as zero_distance_trips

    FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
    """
    
    try:
        df = client.query(sql_audit).to_dataframe()
        print("\n--- ERGEBNISSE ---")
        print(df.T) # Transponieren f√ºr bessere Lesbarkeit
        
        # Automatische Bewertung
        null_errors = df['null_fares'][0] + df['null_pickups'][0]
        if null_errors == 0:
            print("\n‚úÖ TEST BESTANDEN: Keine technischen NULL-Werte in kritischen Spalten.")
        else:
            print(f"\n‚ùå WARNUNG: {null_errors} kritische NULL-Werte gefunden!")
            
    except Exception as e:
        print(f"‚ùå Fehler beim Audit: {e}")

check_data_quality_stats()

üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...





--- ERGEBNISSE ---
                                  0
total_rows                543991014
null_pickups                      0
null_fares                435300973
credit_card_zero_revenue          2
zero_distance_trips               0

‚ùå WARNUNG: 435300973 kritische NULL-Werte gefunden!


In [24]:
# Zelle 9: Der "Panic-Check" - Woher kommen die NULLs?
sql_debug = f"""
SELECT 
    source_system,
    COUNT(*) as total_fahrten,
    COUNTIF(fare_amount IS NULL) as fahrten_ohne_preis,
    ROUND(COUNTIF(fare_amount IS NULL) / COUNT(*) * 100, 2) as prozent_ohne_preis
FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
GROUP BY source_system
"""
print(client.query(sql_debug).to_dataframe())

  source_system  total_fahrten  fahrten_ohne_preis  prozent_ohne_preis
0        YELLOW       53094835                   0                 0.0
1           FHV      435300973           435300973               100.0
2         GREEN       55595206                   0                 0.0


