In [9]:
# Zelle 1: Setup & Config
from google.cloud import bigquery
import pandas as pd
import logging

# Config: identifiers shared by every cell in this notebook
PROJECT_ID = "taxi-bi-project"  # <-- your GCP project ID
DIM_DATASET = "dimensional"     # source dataset (star schema)
AGG_DATASET = "aggregational"       # target dataset (data marts)

# Single BigQuery client, bound to PROJECT_ID, reused by the functions below
client = bigquery.Client(project=PROJECT_ID)

# Hilfsfunktion, um Datasets zu finden/erstellen
def create_dataset_if_not_exists(dataset_id):
    """Make sure the dataset ``dataset_id`` exists in ``PROJECT_ID``.

    Looks the dataset up with the module-level ``client``. If BigQuery
    reports it as missing, a new dataset is created in the same location
    as the source dataset ``DIM_DATASET``.

    Args:
        dataset_id: Dataset name without the project prefix.

    Raises:
        Any error other than "not found" raised by the lookup (permissions,
        network, ...) now propagates instead of being mistaken for a missing
        dataset. Errors raised while reading the source dataset or creating
        the new one propagate as before.
    """
    # Local import keeps this block self-contained; google.cloud.exceptions
    # ships with the client library already imported by this notebook.
    from google.cloud.exceptions import NotFound

    full_dataset_id = f"{PROJECT_ID}.{dataset_id}"
    try:
        client.get_dataset(full_dataset_id)
    except NotFound:
        # Only a genuine 404 means the dataset has to be created.
        print(f"Erstelle neues Dataset: {full_dataset_id} ...")
        # Reuse the location of the source dataset for the new dataset.
        src_ds = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
        new_ds = bigquery.Dataset(full_dataset_id)
        new_ds.location = src_ds.location
        client.create_dataset(new_ds)
        print(f"✅ Dataset erstellt (Region: {src_ds.location})")
    else:
        print(f"✅ Ziel-Dataset gefunden: {full_dataset_id}")

create_dataset_if_not_exists(AGG_DATASET)

✅ Ziel-Dataset gefunden: taxi-bi-project.aggregational


In [10]:
# Zelle 2: Aggregation 1 - Monthly KPIs (Management View)
# Diese Tabelle beantwortet: "Wie entwickeln sich Umsatz und Fahrtenzahlen?"

def create_monthly_kpis():
    """Rebuild the ``agg_monthly_kpis`` table in the aggregation dataset.

    Runs a CREATE OR REPLACE TABLE ... AS SELECT over ``Fact_Trips`` joined
    to the date, vendor and payment-type dimensions, keeping only rows with
    ``total_amount > 0`` and grouping by the seven dimension columns.
    Any exception raised while running the job is printed, not re-raised.
    """
    print("--- 1. Erstelle Tabelle: agg_monthly_kpis ---")

    # Fully-qualified identifiers, assembled once and interpolated below.
    target_table = f"{PROJECT_ID}.{AGG_DATASET}.agg_monthly_kpis"
    star_schema = f"{PROJECT_ID}.{DIM_DATASET}"

    ctas_statement = f"""
    CREATE OR REPLACE TABLE `{target_table}`
    AS
    SELECT
        -- Dimensionen (Woran wollen wir schneiden?)
        d.year,
        d.month,
        d.month_name,
        d.quarter,
        f.source_system,      -- Yellow vs Green vs FHV
        v.vendor_name,        -- Creative Mobile vs Uber/Lyft Bases
        p.payment_description, -- Cash vs Credit
        
        -- Metriken (Hier wird gerechnet!)
        COUNT(f.trip_id) AS total_trips,
        
        -- Summen (Runden auf 2 Nachkommastellen spart Speicher und sieht besser aus)
        ROUND(SUM(f.total_amount), 2) AS total_revenue,
        ROUND(SUM(f.fare_amount), 2) AS total_fare,
        ROUND(SUM(f.tip_amount), 2) AS total_tips,
        
        -- Durchschnitte (KPIs)
        ROUND(AVG(f.total_amount), 2) AS avg_ticket_size,
        ROUND(AVG(f.trip_distance), 2) AS avg_distance_miles,
        ROUND(AVG(f.duration_minutes), 1) AS avg_duration_min

    FROM `{star_schema}.Fact_Trips` f
    
    -- JOINs zum Star Schema
    LEFT JOIN `{star_schema}.dim_date` d ON f.pickup_date_key = d.date_key
    LEFT JOIN `{star_schema}.dim_vendor` v ON f.vendor_id = v.vendor_id
    LEFT JOIN `{star_schema}.dim_payment_type` p ON f.payment_type_id = p.payment_type_id

    -- FILTER:
    -- Wir wollen hier nur "echte" Fahrten für die Statistik.
    -- Wir schließen 0$-Fahrten (Geister) aus.
    -- Aber: Deine 3.80$ Fahrt (Short Trip) bleibt drin, weil total_amount > 0!
    WHERE f.total_amount > 0
    
    GROUP BY 1, 2, 3, 4, 5, 6, 7
    ORDER BY year DESC, month DESC, total_revenue DESC
    """

    try:
        # .result() blocks until the query job has finished.
        client.query(ctas_statement).result()
        print("✅ agg_monthly_kpis erfolgreich erstellt.")
    except Exception as err:
        print(f"❌ Fehler: {err}")

create_monthly_kpis()

--- 1. Erstelle Tabelle: agg_monthly_kpis ---
✅ agg_monthly_kpis erfolgreich erstellt.


In [11]:
# Zelle 3: Aggregation 2 - Geo Stats (Map View)
# Diese Tabelle beantwortet: "Welche Stadtteile sind am lukrativsten?"

def create_geo_stats():
    """Rebuild the ``agg_geo_stats`` table in the aggregation dataset.

    Runs a CREATE OR REPLACE TABLE ... AS SELECT over ``Fact_Trips`` joined
    to the location (on the pickup location id) and date dimensions, keeping
    only rows with ``total_amount > 0`` and grouping by year, quarter,
    borough, zone, service zone and source system.
    Any exception raised while running the job is printed, not re-raised.
    """
    print("--- 2. Erstelle Tabelle: agg_geo_stats ---")

    # Fully-qualified identifiers, assembled once and interpolated below.
    target_table = f"{PROJECT_ID}.{AGG_DATASET}.agg_geo_stats"
    star_schema = f"{PROJECT_ID}.{DIM_DATASET}"

    ctas_statement = f"""
    CREATE OR REPLACE TABLE `{target_table}`
    AS
    SELECT
        -- Zeit (nur grob, damit die Karte filterbar bleibt)
        d.year,
        d.quarter,
        
        -- Ort (Startpunkt der Fahrt)
        loc.Borough AS pickup_borough,
        loc.Zone AS pickup_zone,
        loc.service_zone,
        
        -- System (um z.B. nur Green Taxis anzuzeigen)
        f.source_system,
        
        -- Metriken
        COUNT(f.trip_id) AS pickup_count,
        ROUND(SUM(f.total_amount), 0) AS total_revenue_generated, -- Runden auf Ganze Zahlen reicht hier
        ROUND(AVG(f.tip_amount), 2) AS avg_tip_here

    FROM `{star_schema}.Fact_Trips` f
    
    -- JOINs
    LEFT JOIN `{star_schema}.dim_location` loc ON f.pickup_location_id = loc.LocationID
    LEFT JOIN `{star_schema}.dim_date` d ON f.pickup_date_key = d.date_key

    -- FILTER: Nur umsatzrelevante Fahrten
    WHERE f.total_amount > 0
    
    GROUP BY 1, 2, 3, 4, 5, 6
    ORDER BY pickup_count DESC
    """

    try:
        # .result() blocks until the query job has finished.
        client.query(ctas_statement).result()
        print("✅ agg_geo_stats erfolgreich erstellt.")
    except Exception as err:
        print(f"❌ Fehler: {err}")

create_geo_stats()

--- 2. Erstelle Tabelle: agg_geo_stats ---
✅ agg_geo_stats erfolgreich erstellt.


In [12]:
# Zelle 4: Quality Check (Kurzer Blick auf das Ergebnis)
# Prüfen, ob die Tabellen gefüllt sind und die Geisterfahrten weg sind

def check_aggregation():
    """Print the five largest rows of both aggregate tables as a spot check.

    First lists ``agg_monthly_kpis`` ordered by ``total_trips``, then
    ``agg_geo_stats`` ordered by ``pickup_count`` with zone 'NV' filtered
    out (presumably the placeholder for unknown zones -- confirm against
    dim_location). Results are printed as plain text without the index.
    """

    def _print_top_rows(sql_text):
        # Run the query, materialise it as a DataFrame and dump it as text.
        result_frame = client.query(sql_text).to_dataframe()
        print(result_frame.to_string(index=False))

    print("\n--- CHECK: Monthly KPIs (Top 5 Rows) ---")
    kpi_sql = f"""
    SELECT year, month_name, source_system, total_trips, total_revenue, avg_ticket_size 
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_monthly_kpis` 
    ORDER BY total_trips DESC 
    LIMIT 5
    """
    _print_top_rows(kpi_sql)

    print("\n--- CHECK: Geo Stats (Top 5 Zones) ---")
    geo_sql = f"""
    SELECT pickup_borough, pickup_zone, source_system, pickup_count, total_revenue_generated
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_geo_stats`
    WHERE pickup_zone != 'NV' -- Wir ignorieren kurz die Unknowns
    ORDER BY pickup_count DESC
    LIMIT 5
    """
    _print_top_rows(geo_sql)

check_aggregation()


--- CHECK: Monthly KPIs (Top 5 Rows) ---




 year month_name source_system  total_trips  total_revenue  avg_ticket_size
 2018       June        YELLOW      1396753    22862004.94            16.37
 2016       June        YELLOW      1386290    22314334.60            16.10
 2017       June        YELLOW      1382504    22400107.36            16.20
 2019       June        YELLOW      1367537    26334860.06            19.26
 2016       June        YELLOW      1361801    21859032.59            16.05

--- CHECK: Geo Stats (Top 5 Zones) ---
pickup_borough           pickup_zone source_system  pickup_count  total_revenue_generated
     Manhattan Upper East Side South        YELLOW        324359                6529648.0
     Manhattan Upper East Side South        YELLOW        323263                6382352.0
     Manhattan Upper East Side South        YELLOW        295549                5665265.0
        Queens           JFK Airport        YELLOW        292752               23459130.0
     Manhattan        Midtown Center        YELLOW    

