In [1]:
# Zelle 1: Setup & Config
from google.cloud import bigquery
import pandas as pd
import logging

# Config
PROJECT_ID = "taxi-bi-project"  
DIM_DATASET = "dimensional"     # Quelle (Star Schema)
AGG_DATASET = "aggregational"       # Ziel (Data Marts)

client = bigquery.Client(project=PROJECT_ID)

# Hilfsfunktion, um Datasets zu finden/erstellen
def create_dataset_if_not_exists(dataset_id):
    full_dataset_id = f"{PROJECT_ID}.{dataset_id}"
    try:
        client.get_dataset(full_dataset_id)
        print(f"‚úÖ Ziel-Dataset gefunden: {full_dataset_id}")
    except:
        print(f"Erstelle neues Dataset: {full_dataset_id} ...")
        # Wir holen uns die Region vom Quell-Dataset, damit alles gleich liegt (EU/US)
        src_ds = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
        new_ds = bigquery.Dataset(full_dataset_id)
        new_ds.location = src_ds.location
        client.create_dataset(new_ds)
        print(f"‚úÖ Dataset erstellt (Region: {src_ds.location})")

create_dataset_if_not_exists(AGG_DATASET)

‚úÖ Ziel-Dataset gefunden: taxi-bi-project.aggregational


In [2]:
# Zelle 2: Aggregation 1 - Monthly KPIs (Management View)
# Diese Tabelle beantwortet: "Wie entwickeln sich Umsatz und Fahrtenzahlen?"

def create_monthly_kpis():
    print("--- 1. Erstelle Tabelle: agg_monthly_kpis ---")
    
    sql = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{AGG_DATASET}.agg_monthly_kpis`
    AS
    SELECT
        -- Dimensionen (Woran wollen wir schneiden?)
        d.year,
        d.month,
        d.month_name,
        d.quarter,
        f.source_system,      -- Yellow vs Green vs FHV
        v.vendor_name,        -- Creative Mobile vs Uber/Lyft Bases
        p.payment_description, -- Cash vs Credit
        
        -- Metriken (Hier wird gerechnet!)
        COUNT(f.trip_id) AS total_trips,
        
        -- Summen (Runden auf 2 Nachkommastellen spart Speicher und sieht besser aus)
        ROUND(SUM(f.total_amount), 2) AS total_revenue,
        ROUND(SUM(f.fare_amount), 2) AS total_fare,
        ROUND(SUM(f.tip_amount), 2) AS total_tips,
        
        -- Durchschnitte (KPIs)
        ROUND(AVG(f.total_amount), 2) AS avg_ticket_size,
        ROUND(AVG(f.trip_distance), 2) AS avg_distance_miles,
        ROUND(AVG(f.duration_minutes), 1) AS avg_duration_min

    FROM `{PROJECT_ID}.{DIM_DATASET}.Fact_Trips` f
    
    -- JOINs zum Star Schema
    LEFT JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_date` d ON f.pickup_date_key = d.date_key
    LEFT JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_vendor` v ON f.vendor_id = v.vendor_id
    LEFT JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_payment_type` p ON f.payment_type_id = p.payment_type_id

    -- FILTER:
    -- Wir wollen hier nur "echte" Fahrten f√ºr die Statistik.
    -- Wir schlie√üen 0$-Fahrten (Geister) aus.
    -- Aber: Deine 3.80$ Fahrt (Short Trip) bleibt drin, weil total_amount > 0!
    WHERE f.total_amount > 0
    
    GROUP BY 1, 2, 3, 4, 5, 6, 7
    ORDER BY year DESC, month DESC, total_revenue DESC
    """
    
    try:
        job = client.query(sql)
        job.result() # Warten auf Fertigstellung
        print("‚úÖ agg_monthly_kpis erfolgreich erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_monthly_kpis()

--- 1. Erstelle Tabelle: agg_monthly_kpis ---
‚úÖ agg_monthly_kpis erfolgreich erstellt.


In [3]:
# Zelle 3: Aggregation 2 - Geo Stats (Map View)
# Diese Tabelle beantwortet: "Welche Stadtteile sind am lukrativsten?"

def create_geo_stats():
    print("--- 2. Erstelle Tabelle: agg_geo_stats ---")
    
    sql = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{AGG_DATASET}.agg_geo_stats`
    AS
    SELECT
        -- Zeit (nur grob, damit die Karte filterbar bleibt)
        d.year,
        d.quarter,
        
        -- Ort (Startpunkt der Fahrt)
        loc.Borough AS pickup_borough,
        loc.Zone AS pickup_zone,
        loc.service_zone,
        
        -- System (um z.B. nur Green Taxis anzuzeigen)
        f.source_system,
        
        -- Metriken
        COUNT(f.trip_id) AS pickup_count,
        ROUND(SUM(f.total_amount), 0) AS total_revenue_generated, -- Runden auf Ganze Zahlen reicht hier
        ROUND(AVG(f.tip_amount), 2) AS avg_tip_here

    FROM `{PROJECT_ID}.{DIM_DATASET}.Fact_Trips` f
    
    -- JOINs
    LEFT JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_location` loc ON f.pickup_location_id = loc.LocationID
    LEFT JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_date` d ON f.pickup_date_key = d.date_key

    -- FILTER: Nur umsatzrelevante Fahrten
    WHERE f.total_amount > 0
    
    GROUP BY 1, 2, 3, 4, 5, 6
    ORDER BY pickup_count DESC
    """
    
    try:
        job = client.query(sql)
        job.result()
        print("‚úÖ agg_geo_stats erfolgreich erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_geo_stats()

--- 2. Erstelle Tabelle: agg_geo_stats ---
‚úÖ agg_geo_stats erfolgreich erstellt.


In [4]:
# Zelle 4: Quality Check (Kurzer Blick auf das Ergebnis)
# Pr√ºfen, ob die Tabellen gef√ºllt sind und die Geisterfahrten weg sind

def check_aggregation():
    print("\n--- CHECK: Monthly KPIs (Top 5 Rows) ---")
    query_kpi = f"""
    SELECT year, month_name, source_system, total_trips, total_revenue, avg_ticket_size 
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_monthly_kpis` 
    ORDER BY total_trips DESC 
    LIMIT 5
    """
    print(client.query(query_kpi).to_dataframe().to_string(index=False))

    print("\n--- CHECK: Geo Stats (Top 5 Zones) ---")
    query_geo = f"""
    SELECT pickup_borough, pickup_zone, source_system, pickup_count, total_revenue_generated
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_geo_stats`
    WHERE pickup_zone != 'NV' -- Wir ignorieren kurz die Unknowns
    ORDER BY pickup_count DESC
    LIMIT 5
    """
    print(client.query(query_geo).to_dataframe().to_string(index=False))

check_aggregation()


--- CHECK: Monthly KPIs (Top 5 Rows) ---




 year month_name source_system  total_trips  total_revenue  avg_ticket_size
 2018       June        YELLOW      1396753    22862004.94            16.37
 2016       June        YELLOW      1386290    22314334.60            16.10
 2017       June        YELLOW      1382504    22400107.36            16.20
 2019       June        YELLOW      1367537    26334860.06            19.26
 2016       June        YELLOW      1361801    21859032.59            16.05

--- CHECK: Geo Stats (Top 5 Zones) ---
pickup_borough           pickup_zone source_system  pickup_count  total_revenue_generated
     Manhattan Upper East Side South        YELLOW        324359                6529648.0
     Manhattan Upper East Side South        YELLOW        323263                6382352.0
     Manhattan Upper East Side South        YELLOW        295549                5665265.0
        Queens           JFK Airport        YELLOW        292752               23459130.0
     Manhattan        Midtown Center        YELLOW    



In [5]:
def create_time_trends():
    print("--- 3. Erstelle Tabelle: agg_time_trends ---")
    
    sql = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{AGG_DATASET}.agg_time_trends` AS
    SELECT
        d.year,
        d.day_name,      -- Montag, Dienstag...
        d.day_of_week_num, -- 1-7 (zum Sortieren wichtig!)
        d.is_weekend,    -- True/False
        EXTRACT(HOUR FROM f.pickup_datetime) AS hour_of_day,
        
        f.source_system,
        
        -- Metriken
        COUNT(*) AS trip_count,
        ROUND(AVG(f.total_amount), 2) AS avg_fare,
        ROUND(AVG(f.trip_distance), 2) AS avg_distance
        
    FROM `{PROJECT_ID}.{DIM_DATASET}.Fact_Trips` f
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_date` d ON f.pickup_date_key = d.date_key
    
    WHERE f.total_amount > 0
    GROUP BY 1, 2, 3, 4, 5, 6
    ORDER BY day_of_week_num, hour_of_day
    """
    try:
        client.query(sql).result()
        print("‚úÖ agg_time_trends erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_time_trends()

--- 3. Erstelle Tabelle: agg_time_trends ---
‚úÖ agg_time_trends erstellt.


In [6]:
def create_route_stats():
    print("--- 4. Erstelle Tabelle: agg_route_stats ---")
    
    sql = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{AGG_DATASET}.agg_route_stats` AS
    SELECT
        d.year,
        
        -- VON -> NACH
        pu.Borough AS pickup_borough,
        do.Borough AS dropoff_borough,
        
        -- Optional: Wenn du es feiner willst, nimm die Zonen dazu (wird aber gr√∂√üer)
        -- pu.Zone as pickup_zone,
        -- do.Zone as dropoff_zone,
        
        COUNT(*) AS trip_count,
        ROUND(AVG(f.total_amount), 2) AS avg_cost,
        ROUND(AVG(f.duration_minutes), 1) AS avg_duration
        
    FROM `{PROJECT_ID}.{DIM_DATASET}.Fact_Trips` f
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_date` d ON f.pickup_date_key = d.date_key
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_location` pu ON f.pickup_location_id = pu.LocationID
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_location` do ON f.dropoff_location_id = do.LocationID
    
    WHERE f.total_amount > 0 
      AND pu.Borough != 'Unknown' 
      AND do.Borough != 'Unknown'
      
    GROUP BY 1, 2, 3
    ORDER BY trip_count DESC
    """
    try:
        client.query(sql).result()
        print("‚úÖ agg_route_stats erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_route_stats()

--- 4. Erstelle Tabelle: agg_route_stats ---
‚úÖ agg_route_stats erstellt.


In [7]:
def create_airport_stats():
    print("--- 5. Erstelle Tabelle: agg_airport_trips ---")
    
    sql = f"""
    CREATE OR REPLACE TABLE `{PROJECT_ID}.{AGG_DATASET}.agg_airport_trips` AS
    SELECT
        d.year,
        d.month_name,
        f.source_system,
        
        -- War es eine Fahrt ZUM oder VOM Flughafen?
        CASE 
            WHEN rc.rate_description LIKE '%JFK%' OR rc.rate_description LIKE '%Newark%' THEN 'Airport Rate'
            ELSE 'Standard Rate to Airport Zone'
        END AS trip_category,
        
        COUNT(*) AS total_trips,
        ROUND(AVG(f.total_amount), 2) AS avg_ticket,
        ROUND(AVG(f.tip_amount), 2) AS avg_tip
        
    FROM `{PROJECT_ID}.{DIM_DATASET}.Fact_Trips` f
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_date` d ON f.pickup_date_key = d.date_key
    JOIN `{PROJECT_ID}.{DIM_DATASET}.dim_rate_code` rc ON f.rate_code_id = rc.rate_code_id
    
    -- Location Filter: 132=JFK, 138=LaGuardia, 1=Newark
    WHERE (f.pickup_location_id IN (132, 138, 1) OR f.dropoff_location_id IN (132, 138, 1))
      AND f.total_amount > 0
      
    GROUP BY 1, 2, 3, 4
    """
    try:
        client.query(sql).result()
        print("‚úÖ agg_airport_trips erstellt.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_airport_stats()

--- 5. Erstelle Tabelle: agg_airport_trips ---
‚úÖ agg_airport_trips erstellt.


In [9]:
def check_new_marts():
    # Stelle sicher, dass die Variablen passen
    PROJECT_ID = "taxi-bi-project"
    AGG_DATASET = "aggregational"
    
    print("--- üìä FINAL DATA MART CHECK ---")

    # 1. RUSH HOUR (Wann ist am meisten los?)
    print("\nüïê TOP 5 ZEITFENSTER (agg_time_trends):")
    # Wir sortieren nach trip_count, um die gesch√§ftigsten Stunden zu sehen
    sql_time = f"""
    SELECT day_name, hour_of_day, source_system, trip_count, avg_fare
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_time_trends`
    ORDER BY trip_count DESC
    LIMIT 5
    """
    try:
        df_time = client.query(sql_time).to_dataframe()
        print(df_time.to_string(index=False))
    except Exception as e:
        print(f"Fehler: {e}")

    # 2. ROUTEN (Wer f√§hrt von wo nach wo?)
    print("\nüìç TOP 5 ROUTEN (agg_route_stats):")
    sql_routes = f"""
    SELECT pickup_borough, dropoff_borough, trip_count, avg_cost
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_route_stats`
    ORDER BY trip_count DESC
    LIMIT 5
    """
    try:
        df_routes = client.query(sql_routes).to_dataframe()
        print(df_routes.to_string(index=False))
    except Exception as e:
        print(f"Fehler: {e}")

    # 3. FLUGHAFEN (Die Cash Cows)
    print("\n‚úàÔ∏è FLUGHAFEN STATS (agg_airport_trips):")
    sql_air = f"""
    SELECT year, trip_category, source_system, total_trips, avg_ticket
    FROM `{PROJECT_ID}.{AGG_DATASET}.agg_airport_trips`
    ORDER BY total_trips DESC
    LIMIT 5
    """
    try:
        df_air = client.query(sql_air).to_dataframe()
        print(df_air.to_string(index=False))
    except Exception as e:
        print(f"Fehler: {e}")

check_new_marts()

--- üìä FINAL DATA MART CHECK ---

üïê TOP 5 ZEITFENSTER (agg_time_trends):




 day_name  hour_of_day source_system  trip_count  avg_fare
Wednesday           18        YELLOW      233087     26.26
 Thursday           18        YELLOW      231472     26.54
  Tuesday           18        YELLOW      226557     26.04
Wednesday           17        YELLOW      225059     28.30
 Thursday           17        YELLOW      223336     28.47

üìç TOP 5 ROUTEN (agg_route_stats):




pickup_borough dropoff_borough  trip_count  avg_cost
     Manhattan       Manhattan    20908052     20.34
     Manhattan       Manhattan     7358395     12.41
     Manhattan       Manhattan     6957972     12.28
     Manhattan       Manhattan     6223453     12.24
     Manhattan       Manhattan     5457148     12.20

‚úàÔ∏è FLUGHAFEN STATS (agg_airport_trips):
 year                 trip_category source_system  total_trips  avg_ticket
 2019 Standard Rate to Airport Zone        YELLOW       151537       43.21
 2018 Standard Rate to Airport Zone        YELLOW       147304       43.45
 2017 Standard Rate to Airport Zone        YELLOW       146878       43.80
 2016 Standard Rate to Airport Zone        YELLOW       146395       42.37
 2023 Standard Rate to Airport Zone        YELLOW       141019       67.60


