In [None]:

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# Render matplotlib plots inline in the notebook output.
%matplotlib inline

# Improve readability during exploration: show up to 200 columns and 200 rows
# and set the pandas display width to 140 characters.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 140)

In [1]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging setup: INFO level, each message prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project"  # project ID (taken from the log output)
DIM_DATASET = "dimensional"
SOURCE_DATASET = "staging"
TARGET_DATASET = "canonical"

# Tables
# NOTE(review): TARGET_TABLE is not referenced by any of the visible cells -
# confirm it is still needed.
TARGET_TABLE = "dimensional"

client = bigquery.Client(project=PROJECT_ID)

# Look up the region of the dimensional dataset once. Later cells pass
# CORRECT_LOCATION as `location=` to their query jobs and assign it to any
# dataset they create.
try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
except NotFound:
    # Only a missing dataset gets the "not found" hint. Other failures
    # (credentials, permissions, network) propagate with their own message
    # instead of being mislabelled.
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{DIM_DATASET}' nicht finden!")
    raise
else:
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{DIM_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")

‚úÖ QUELLE GEFUNDEN: Dataset 'dimensional' liegt in Region: 'EU'


In [6]:
# Zelle 4a: GeoJSON Upload (Direkt ins Dimensional Dataset)
import json
import pandas as pd
from google.cloud import bigquery

# CONFIG
# File name of the GeoJSON file with the taxi zones; opened by upload_geojson_to_bq().
GEOJSON_FILE = "NYC_Taxi_Zones.geojson" 
# CHANGE: the zones are loaded directly into the 'dimensional' dataset.
STAGING_TABLE = f"{PROJECT_ID}.dimensional.staging_taxi_zones"

def upload_geojson_to_bq():
    """Upload the taxi-zone GeoJSON file to the BigQuery staging table.

    Reads ``GEOJSON_FILE``, flattens every feature into one row (zone
    attributes plus the geometry serialized as a JSON string) and replaces
    the contents of ``STAGING_TABLE`` with the result (``WRITE_TRUNCATE``).

    Returns:
        None. A missing input file, a file without features and upload
        errors are reported with ``print`` instead of being raised.

    Raises:
        google.api_core.exceptions.GoogleAPIError: if checking or creating
            the dataset fails for a reason other than the dataset not
            existing yet.
    """
    print(f"--- Lese GeoJSON '{GEOJSON_FILE}' ---")

    try:
        # GeoJSON is UTF-8 encoded; do not depend on the platform default.
        with open(GEOJSON_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"‚ùå Datei '{GEOJSON_FILE}' nicht gefunden. Bitte hochladen!")
        return

    # Make sure the 'dimensional' dataset exists in case this cell runs before
    # the dimension tables are built. Only a missing dataset triggers the
    # creation; any other API error propagates unchanged.
    try:
        client.get_dataset(f"{PROJECT_ID}.dimensional")
    except NotFound:
        new_ds = bigquery.Dataset(f"{PROJECT_ID}.dimensional")
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)
        print("‚úÖ Dataset 'dimensional' erstellt.")

    rows = []
    for feature in data['features']:
        # The "properties" member may be null; treat that like an empty mapping.
        props = feature.get('properties') or {}
        geom = feature['geometry']

        # Flexible mapping: accept the ID and attribute names in the spellings
        # used by different exports of the zone file.
        loc_id = props.get('LocationID') or props.get('objectid') or props.get('OBJECTID')

        rows.append({
            'LocationID': loc_id,
            'Zone': props.get('Zone') or props.get('zone'),
            'Borough': props.get('Borough') or props.get('borough'),
            'service_zone': props.get('service_zone'),
            'geometry_json': json.dumps(geom),
        })

    if not rows:
        # Without this guard the load job would replace the staging table
        # with nothing (WRITE_TRUNCATE).
        print("Keine Features im GeoJSON gefunden - Upload abgebrochen.")
        return

    df = pd.DataFrame(rows)

    # Force the IDs to integers. Values that cannot be parsed become 0, and the
    # number of affected rows is reported so the substitution is visible.
    location_ids = pd.to_numeric(df['LocationID'], errors='coerce')
    missing_ids = int(location_ids.isna().sum())
    if missing_ids:
        print(f"WARNUNG: {missing_ids} Zonen ohne numerische LocationID wurden auf 0 gesetzt.")
    df['LocationID'] = location_ids.fillna(0).astype(int)

    print(f"Gelesen: {len(df)} Zonen.")

    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        schema=[
            bigquery.SchemaField("LocationID", "INTEGER"),
            bigquery.SchemaField("Zone", "STRING"),
            bigquery.SchemaField("Borough", "STRING"),
            bigquery.SchemaField("service_zone", "STRING"),
            bigquery.SchemaField("geometry_json", "STRING"),
        ]
    )

    try:
        job = client.load_table_from_dataframe(df, STAGING_TABLE, job_config=job_config)
        job.result()  # block until the load job has finished
        print(f"‚úÖ Tabelle '{STAGING_TABLE}' erfolgreich hochgeladen.")
    except Exception as e:
        # Best effort: report the failure and let the notebook continue.
        print(f"‚ùå Upload Fehler: {e}")

# Run the upload right away when this cell is executed.
upload_geojson_to_bq()

--- Lese GeoJSON 'NYC_Taxi_Zones.geojson' ---
Gelesen: 263 Zonen.
‚úÖ Tabelle 'taxi-bi-project.dimensional.staging_taxi_zones' erfolgreich hochgeladen.


In [None]:
# Zelle 4: Dimensional Layer (Komplett mit Smart-Base-Logik)
def create_dimensional_layer_advanced():
    """Build the dimension tables, with a vendor dimension derived from FHV bases.

    Steps:
      1. Ensure the dataset ``{PROJECT_ID}.{DIM_DATASET}`` exists, creating it
         in ``CORRECT_LOCATION`` if it is missing.
      2. (Re)create abbreviated versions of ``dim_payment_type``,
         ``dim_rate_code`` and ``dim_trip_type``.
      3. (Re)create ``dim_vendor``: every dispatching base found in
         ``canonical_unified_taxi`` is left-joined to ``staging_fhv_bases``;
         bases without a match get a placeholder name and
         ``is_active = FALSE``. Two fixed YELLOW/GREEN vendors are appended.

    Returns:
        None. A failure of the ``dim_vendor`` query is printed, not raised.

    Raises:
        google.api_core.exceptions.GoogleAPIError: if the dataset check or
            creation, or one of the three label-table queries, fails.
    """
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"

    print(f"--- Erstelle Advanced Layer 3 in '{DIM_DATASET}' ---")

    # Create the dataset only when it is really missing; other API errors
    # (permissions, network) propagate instead of triggering a create attempt.
    try:
        client.get_dataset(dim_dataset_id)
    except NotFound:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)

    # 1. DIM: payment type, rate code and trip type (the simple label tables).
    # These are shortened value lists kept on one line each for brevity.
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type` AS SELECT * FROM UNNEST([STRUCT(1 as payment_type_id, 'Credit Card' as payment_description), STRUCT(2, 'Cash'), STRUCT(0, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code` AS SELECT * FROM UNNEST([STRUCT(1 as rate_code_id, 'Standard' as rate_description), STRUCT(2, 'JFK'), STRUCT(99, 'Unknown')])", location=CORRECT_LOCATION).result()
    client.query(f"CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type` AS SELECT * FROM UNNEST([STRUCT(1 as trip_type_id, 'Street-hail' as trip_description), STRUCT(2, 'Dispatch')])", location=CORRECT_LOCATION).result()

    # 2. DIM: VENDOR - historic bases joined against the list of active bases.
    print("Erstelle dim_vendor mit Fallback-Logik f√ºr alte Basen...")

    # NOTE(review): the SQL comment inside the query mentions the columns
    # license_number/name, but the query selects base_num/base_name - confirm
    # against the actual schema of staging_fhv_bases.
    sql_vendor_smart = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_vendor`
    (vendor_id STRING, vendor_name STRING, source_system STRING, is_active BOOLEAN)
    AS
    WITH all_existing_bases AS (
        -- 1. Wir schauen in die Vergangenheit: Wer ist alles gefahren?
        SELECT DISTINCT dispatching_base_nummer as vid FROM `{PROJECT_ID}.{TARGET_DATASET}.canonical_unified_taxi`
        WHERE dispatching_base_nummer IS NOT NULL
    ),
    active_bases_list AS (
        -- 2. Deine hochgeladene Datei (Nur die Aktiven)
        -- PASS AUF DIE SPALTEN-NAMEN AUF! (Hier: license_number, name)
        SELECT base_num, base_name FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_fhv_bases`
    )
    
    SELECT 
        hist.vid as vendor_id,
        -- Hier ist der "Smart Logic" Teil:
        COALESCE(act.base_name, CONCAT('Inactive/Unknown Base ', hist.vid)) as vendor_name,
        'FHV' as source_system,
        IF(act.base_name IS NOT NULL, TRUE, FALSE) as is_active
    FROM all_existing_bases hist
    LEFT JOIN active_bases_list act ON hist.vid = act.base_num
    
    UNION ALL
    
    -- Nicht vergessen: Die Yellow/Green Taxis
    SELECT '1', 'Creative Mobile Technologies (CMT)', 'YELLOW/GREEN', TRUE
    UNION ALL
    SELECT '2', 'VeriFone Inc. (Curb)', 'YELLOW/GREEN', TRUE
    """

    try:
        client.query(sql_vendor_smart, location=CORRECT_LOCATION).result()
        print("‚úÖ dim_vendor erstellt (Alte Basen wurden erhalten).")
    except Exception as e:
        # Best effort: the staging table with the bases may not be uploaded yet.
        print(f"‚ö†Ô∏è Fehler bei dim_vendor (Hast du die Datei hochgeladen?): {e}")

    # 3. DIM: LOCATION (optional, disabled).
    # The draft below used to sit here as a no-op string literal; it is kept as
    # a comment for reference. It only applies once `staging_taxi_zones` has
    # been uploaded to the source dataset.
    #
    #   CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location` AS
    #   SELECT LocationID, Borough, Zone, service_zone
    #   FROM `{PROJECT_ID}.{SOURCE_DATASET}.staging_taxi_zones`

# Build the advanced dimensional layer when this cell is executed.
# NOTE(review): create_dimensional_layer_final() in the following cell issues
# CREATE OR REPLACE for the same dim_* tables, so running both cells leaves
# the later cell's versions in place.
create_dimensional_layer_advanced()

In [7]:
# Zelle 4: Dimensional Layer (Final Clean Version)
def create_dimensional_layer_final():
    """Create (or replace) the layer-3 dimension tables in BigQuery.

    Ensures the ``dimensional`` dataset exists (creating it in
    ``CORRECT_LOCATION`` if it is missing) and then rebuilds, in this order:
    ``dim_payment_type``, ``dim_rate_code``, ``dim_trip_type``,
    ``dim_vendor`` (static taxi vendors only, no FHV bases) and
    ``dim_location`` (zones plus a GEOGRAPHY column parsed from the GeoJSON
    text in ``staging_taxi_zones``).

    Returns:
        None. Query failures are printed, not raised: a failing static table
        stops the remaining tables, and a failing ``dim_location`` only
        produces a warning.

    Raises:
        google.api_core.exceptions.GoogleAPIError: if checking or creating
            the dataset fails for a reason other than the dataset not
            existing yet.
    """
    DIM_DATASET = "dimensional"
    dim_dataset_id = f"{PROJECT_ID}.{DIM_DATASET}"

    print(f"--- Erstelle Layer 3 (Dimensionen) in '{DIM_DATASET}' ---")

    # 1. Create the dataset, but only when it is really missing. Other API
    # errors propagate instead of triggering a create attempt.
    try:
        client.get_dataset(dim_dataset_id)
    except NotFound:
        new_ds = bigquery.Dataset(dim_dataset_id)
        new_ds.location = CORRECT_LOCATION
        client.create_dataset(new_ds)
        print(f"‚úÖ Dataset '{DIM_DATASET}' erstellt.")

    # --- A) STATIC LOOKUP TABLES ---

    # 1. Payment type
    sql_payment = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_payment_type`
    (payment_type_id INT64, payment_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(0 as payment_type_id, 'Flex / Unknown' as payment_description),
        STRUCT(1, 'Credit Card'),
        STRUCT(2, 'Cash'),
        STRUCT(3, 'No Charge'),
        STRUCT(4, 'Dispute'),
        STRUCT(5, 'Unknown'),
        STRUCT(6, 'Voided Trip')
    ]);
    """

    # 2. Rate code
    sql_ratecode = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_rate_code`
    (rate_code_id INT64, rate_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(1 as rate_code_id, 'Standard Rate' as rate_description),
        STRUCT(2, 'JFK Airport'),
        STRUCT(3, 'Newark Airport'),
        STRUCT(4, 'Nassau/Westchester'),
        STRUCT(5, 'Negotiated Fare'),
        STRUCT(6, 'Group Ride'),
        STRUCT(99, 'Unknown / FHV')
    ]);
    """

    # 3. Trip type
    sql_triptype = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_trip_type`
    (trip_type_id INT64, trip_description STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT(1 as trip_type_id, 'Street-hail' as trip_description),
        STRUCT(2, 'Dispatch')
    ]);
    """

    # --- B) VENDOR (REDUCED) ---
    # Only the taxi technology vendors; FHV bases are deliberately left out.
    sql_vendor = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_vendor`
    (vendor_id STRING, vendor_name STRING, source_system STRING)
    AS
    SELECT * FROM UNNEST([
        STRUCT('1' as vendor_id, 'Creative Mobile Technologies (CMT)' as vendor_name, 'YELLOW/GREEN' as source_system),
        STRUCT('2', 'VeriFone Inc. (Curb)', 'YELLOW/GREEN'),
        STRUCT('6', 'Myle', 'GREEN'),
        STRUCT('7', 'Helix', 'GREEN')
    ]);
    """

    # --- C) LOCATION (MAPS) ---
    # Reads the staging table written by the GeoJSON upload cell (cell 4a).
    sql_location = f"""
    CREATE OR REPLACE TABLE `{dim_dataset_id}.dim_location`
    (
        LocationID INT64,
        Borough STRING,
        Zone STRING,
        service_zone STRING,
        zone_geom GEOGRAPHY
    )
    AS
    SELECT 
        CAST(LocationID AS INT64) as LocationID,
        Borough,
        Zone,
        service_zone,
        ST_GEOGFROMGEOJSON(geometry_json, make_valid => TRUE) as zone_geom
    FROM `{PROJECT_ID}.dimensional.staging_taxi_zones`
    """

    # --- EXECUTE ---
    # Each static table is a (statement, success message) pair, run in order.
    static_dimensions = [
        (sql_payment, "‚úÖ dim_payment_type erstellt."),
        (sql_ratecode, "‚úÖ dim_rate_code erstellt."),
        (sql_triptype, "‚úÖ dim_trip_type erstellt."),
        (sql_vendor, "‚úÖ dim_vendor erstellt (Nur Static Vendors)."),
    ]

    try:
        for statement, success_message in static_dimensions:
            client.query(statement, location=CORRECT_LOCATION).result()
            print(success_message)

        # dim_location depends on the upload from cell 4a, so its failure is
        # downgraded to a warning instead of counting as a general error.
        try:
            client.query(sql_location, location=CORRECT_LOCATION).result()
            print("‚úÖ dim_location erstellt (Maps).")
        except Exception as loc_e:
            print(f"‚ö†Ô∏è Warnung: dim_location konnte nicht erstellt werden (Hast du Zelle 4a ausgef√ºhrt?). Fehler: {loc_e}")

    except Exception as e:
        # Best effort: report the first failing static table and stop.
        print(f"‚ùå Genereller Fehler: {e}")

# Rebuild all dimension tables when this cell is executed.
create_dimensional_layer_final()

--- Erstelle Layer 3 (Dimensionen) in 'dimensional' ---
‚úÖ dim_payment_type erstellt.
‚úÖ dim_rate_code erstellt.
‚úÖ dim_trip_type erstellt.
‚úÖ dim_vendor erstellt (Nur Static Vendors).
‚úÖ dim_location erstellt (Maps).


In [8]:
# Cell 8: Quality audit (a roadworthiness-style inspection of the data)
def check_data_quality_stats():
    """Run a one-row quality audit over ``canonical_unified_taxi`` and print it.

    The query counts total rows, rows with and without the DQ flag, NULL
    pickup timestamps, NULL fares, credit-card trips with a total of 0 and
    trips with a distance of 0. The result is printed transposed, followed by
    a pass/warn verdict based on the sum of the two NULL counters.

    Returns:
        None. Any exception raised while querying or evaluating is printed,
        not raised.

    NOTE(review): in the output captured in this notebook, every NULL fare
    belongs to source_system 'FHV' (100 % of its rows), so the warning fires
    on each run - confirm whether FHV should be excluded from this check.
    """
    print("üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...")

    audit_query = f"""
    SELECT 
        -- 1. Wie viele Fahrten haben wir insgesamt?
        COUNT(*) as total_rows,
        
        -- 2. Wie viele sind "Technisch Sauber" (Kein DQ Flag)?
        COUNTIF(dq_issue_flag = FALSE) as clean_rows,
        
        -- 3. Wie viele haben "Business-Probleme" (DQ Flag = TRUE)?
        COUNTIF(dq_issue_flag = TRUE) as flagged_rows,
        
        -- 4. NULL CHECK (Darf eigentlich nicht sein, au√üer bei optionalen Feldern)
        COUNTIF(pickup_datetime IS NULL) as null_pickups,
        COUNTIF(fare_amount IS NULL) as null_fares,
        
        -- 5. LOGIK CHECK: Credit Card mit 0$ (Dein Sorgenkind)
        COUNTIF(payment_type = 1 AND total_amount = 0) as credit_card_zero_revenue,
        
        -- 6. GEISTER-FAHRTEN (Keine Distanz)
        COUNTIF(trip_distance = 0) as zero_distance_trips

    FROM `{PROJECT_ID}.{TARGET_DATASET}.canonical_unified_taxi`
    """

    try:
        audit_result = client.query(audit_query).to_dataframe()

        print("\n--- ERGEBNISSE ---")
        # Transposed: one metric per line reads better than one wide row.
        print(audit_result.T)

        # Automatic verdict: the two NULL counters of the single result row.
        critical_nulls = audit_result.at[0, 'null_fares'] + audit_result.at[0, 'null_pickups']
        if critical_nulls != 0:
            print(f"\n‚ùå WARNUNG: {critical_nulls} kritische NULL-Werte gefunden!")
        else:
            print("\n‚úÖ TEST BESTANDEN: Keine technischen NULL-Werte in kritischen Spalten.")

    except Exception as audit_error:
        print(f"‚ùå Fehler beim Audit: {audit_error}")

# Run the audit when this cell is executed.
check_data_quality_stats()

üïµÔ∏è‚Äç‚ôÇÔ∏è STARTE QUALIT√ÑTS-CHECK (Canonical Layer)...





--- ERGEBNISSE ---
                                  0
total_rows                688622858
clean_rows                685082821
flagged_rows                3540037
null_pickups                      0
null_fares                515372815
credit_card_zero_revenue      25021
zero_distance_trips         2551622

‚ùå WARNUNG: 515372815 kritische NULL-Werte gefunden!


In [9]:
# Cell 9: "Panic check" - which source system do the NULL fares come from?
# Per source_system: number of trips, trips without a fare, and their share
# in percent (rounded to two decimals).
sql_debug = f"""
SELECT 
    source_system,
    COUNT(*) as total_fahrten,
    COUNTIF(fare_amount IS NULL) as fahrten_ohne_preis,
    ROUND(COUNTIF(fare_amount IS NULL) / COUNT(*) * 100, 2) as prozent_ohne_preis
FROM `{PROJECT_ID}.{TARGET_DATASET}.canonical_unified_taxi`
GROUP BY source_system
"""
null_fares_by_source = client.query(sql_debug).to_dataframe()
print(null_fares_by_source)



  source_system  total_fahrten  fahrten_ohne_preis  prozent_ohne_preis
0        YELLOW      105211397                   0                 0.0
1         GREEN       68038646                   0                 0.0
2           FHV      515372815           515372815               100.0
