In [2]:

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt daf√ºr, dass Plots im Notebook angezeigt werden
%matplotlib inline

# Lesbarkeit in der Exploration erh√∂hen
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.width", 140)

In [3]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project" # Deine ID aus dem Log
DIM_DATASET = "dimensional"
SOURCE_DATASET = "staging"      
CAN_DATASET = "canonical"   

# Tabellen
TARGET_TABLE = "dimensional"

client = bigquery.Client(project=PROJECT_ID)

try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{DIM_DATASET}")
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{DIM_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")
except Exception as e:
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{DIM_DATASET}' nicht finden!")
    raise e

‚úÖ QUELLE GEFUNDEN: Dataset 'dimensional' liegt in Region: 'EU'


In [31]:
import pandas as pd
from google.cloud import bigquery
import re

def upload_reference_csvs():
    files_to_upload = {
        "ref_black_car": "/Users/lania/Documents/GitHub/bi_project_task2/notebook_dimensional/current_black_car_bases.csv",
        "ref_luxury_limo": "/Users/lania/Documents/GitHub/bi_project_task2/notebook_dimensional/current_luxury_limousine_bases.csv",
        "ref_community_car": "/Users/lania/Documents/GitHub/bi_project_task2/notebook_dimensional/current_community_car_service_bases.csv"
    }

    for table_name, file_path in files_to_upload.items():
        try:
            # 1. Datei laden mit utf-8-sig um das unsichtbare BOM-Zeichen zu entfernen
            df = pd.read_csv(
                file_path, 
                sep=None, 
                quotechar="'", 
                engine='python', 
                encoding='utf-8-sig', # <--- WICHTIG: Entfernt das Ôªø Zeichen
                on_bad_lines='warn'
            )
            
            # Daten von restlichen Hochkommas befreien
            for col in df.select_dtypes(include=['object']):
                df[col] = df[col].str.strip("'")

            # 2. SPALTEN REINIGEN (Extrem sicher)
            # Wir behalten nur Buchstaben, Zahlen und Unterstriche. 
            # Alles andere (inkl. unsichtbarer Zeichen) wird entfernt.
            new_columns = []
            for c in df.columns:
                # Ersetze Leerzeichen/Bindestriche durch Unterstrich
                clean_name = c.replace(' ', '_').replace('-', '_').replace('/', '_')
                # Entferne alle Zeichen, die nicht A-Z, 0-9 oder _ sind
                clean_name = re.sub(r'[^a-zA-Z0-9_]', '', clean_name)
                # BigQuery Spalten d√ºrfen nicht mit einer Zahl beginnen
                if clean_name[0].isdigit():
                    clean_name = 'f_' + clean_name
                new_columns.append(clean_name)
            
            df.columns = new_columns
            
            # 3. Upload nach BigQuery
            table_id = f"{PROJECT_ID}.{DIM_DATASET}.{table_name}"
            
            job_config = bigquery.LoadJobConfig(
                write_disposition="WRITE_TRUNCATE",
                autodetect=True
            )
            
            client.load_table_from_dataframe(df, table_id, job_config=job_config).result()
            
            print(f"‚úÖ Tabelle erfolgreich erstellt: {table_id}")
            print(f"   Bereinigte Spalten: {list(df.columns[:3])}...")
            
        except Exception as e:
            print(f"‚ùå Fehler beim Hochladen von {file_path}:")
            print(f"   Details: {e}")

# Ausf√ºhren
upload_reference_csvs()

‚úÖ Tabelle erfolgreich erstellt: taxi-bi-project.dimensional.ref_black_car
   Bereinigte Spalten: ['LICENSEE_NUMBER', 'NAME_OF_LICENSEE', 'ALTERNATE_NAME_OF_LICENSEE']...
‚úÖ Tabelle erfolgreich erstellt: taxi-bi-project.dimensional.ref_luxury_limo
   Bereinigte Spalten: ['LICENSEE_NUMBER', 'NAME_OF_LICENSEE', 'ALTERNATE_NAME_OF_LICENSEE']...
‚úÖ Tabelle erfolgreich erstellt: taxi-bi-project.dimensional.ref_community_car
   Bereinigte Spalten: ['LICENSEE_NUMBER', 'NAME_OF_LICENSEE', 'ALTERNATE_NAME_OF_LICENSEE']...


In [13]:
def create_dim_date():
    DIM_DATASET = "dimensional"
    dim_table_id = f"{PROJECT_ID}.{DIM_DATASET}.dim_date"
    
    print(f"--- Erstelle Time Dimension: {dim_table_id} ---")
    
    # Wir generieren Daten von 2015 bis Ende 2026
    # Uhrzeit - Stunde 
    sql_date = f"""
    CREATE OR REPLACE TABLE `taxi-bi-project.dimensional.dim_datetime` AS
    SELECT
        -- Generiert einen Timestamp f√ºr jede Stunde
        stunde AS datetime_key,

        EXTRACT(YEAR FROM stunde) AS year,
        EXTRACT(MONTH FROM stunde) AS month,
        FORMAT_TIMESTAMP('%B', stunde) AS month_name, 
        EXTRACT(QUARTER FROM stunde) AS quarter,
        EXTRACT(DAYOFWEEK FROM stunde) AS day_of_week,
        -- Hier ist die neue Stunden-Spalte (0-23)
        EXTRACT(HOUR FROM stunde) AS hour,
        CASE WHEN EXTRACT(DAYOFWEEK FROM stunde) IN (1, 7) THEN TRUE ELSE FALSE END AS is_weekend
    FROM 
        UNNEST(
            GENERATE_TIMESTAMP_ARRAY(
                '2010-01-01 00:00:00', 
                '2025-12-31 23:00:00', 
                INTERVAL 1 HOUR
            )
        ) AS stunde;
    """
    
    try:
        client.query(sql_date).result()
        print("‚úÖ dim_date erfolgreich erstellt (2015-2026).")
    except Exception as e:
        print(f"‚ùå Fehler bei dim_date: {e}")

# Ausf√ºhren
create_dim_date()

--- Erstelle Time Dimension: taxi-bi-project.dimensional.dim_date ---
‚úÖ dim_date erfolgreich erstellt (2015-2026).


In [None]:
def create_fact_layer_v3():
    DIM_DATASET = "dimensional"
    CAN_DATASET = "canonical"
    
    # Pfade definieren
    source_table = f"{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi"
    fact_table = f"{PROJECT_ID}.{DIM_DATASET}.Fact_Trips"
    dim_base_table = f"{PROJECT_ID}.{DIM_DATASET}.dim_base"

    print(f"--- Starte Layer-Optimierung: {fact_table} ---")

    # 1. SCHRITT: Erstellung der dim_base (dein SQL)
    sql_dim_base = f"""
    CREATE OR REPLACE TABLE `{dim_base_table}` AS
    SELECT
        LICENSEE_NUMBER AS base_number,
        NAME_OF_LICENSEE AS base_name,
        base_type,
        TRUE AS is_currently_active
    FROM (
        SELECT LICENSEE_NUMBER, NAME_OF_LICENSEE, 'Black Car' as base_type FROM `{PROJECT_ID}.{DIM_DATASET}.ref_black_car`
        UNION ALL
        SELECT LICENSEE_NUMBER, NAME_OF_LICENSEE, 'Luxury Limo' FROM `{PROJECT_ID}.{DIM_DATASET}.ref_luxury_limo`
        UNION ALL
        SELECT LICENSEE_NUMBER, NAME_OF_LICENSEE, 'Community Car' FROM `{PROJECT_ID}.{DIM_DATASET}.ref_community_car`
    );
    """

    # 2. SCHRITT: Fact Table mit JOIN zur dim_base
    sql_fact = f"""
    CREATE OR REPLACE TABLE `{fact_table}`
    PARTITION BY DATETIME_TRUNC(pickup_datetime, MONTH)
    CLUSTER BY base_type, vendor_id
    AS
    SELECT
        -- IDs & Links
        t.trip_id,
        t.vendor_id,
        t.dispatching_base_nummer,
        
        -- Mapping-Daten aus der dim_base
        IFNULL(b.base_name, 'Unknown') AS base_name,
        IFNULL(b.base_type, 'Unknown') AS base_type,
        IFNULL(b.is_currently_active, FALSE) AS is_active_base,
        
        -- DATE & TIME
        DATE(t.pickup_datetime) AS pickup_date_key, 
        t.pickup_datetime,
        t.dropoff_datetime,
        
        -- Locations
        COALESCE(t.pickup_location_id, 263) AS pickup_location_id, 
        COALESCE(t.dropoff_location_id, 263) AS dropoff_location_id,
        
        -- Payment & Rate
        IFNULL(t.payment_type, 0) AS payment_type_id,
        IFNULL(t.RatecodeID, 99) AS rate_code_id,
        
        -- System Info
        t.source_system,
        t.dq_issue_flag,
        
        -- Measures
        COALESCE(t.passenger_count, 0) AS passenger_count,
        COALESCE(t.trip_distance, 0) AS trip_distance,
        COALESCE(t.total_amount, 0) AS total_amount,
        
        -- Berechnete Measures
        TIMESTAMP_DIFF(t.dropoff_datetime, t.pickup_datetime, MINUTE) AS duration_minutes

    FROM `{source_table}` t
    -- Verkn√ºpfung zur neuen Dimension
    LEFT JOIN `{dim_base_table}` b 
      ON t.dispatching_base_nummer = b.base_number
    """
    
    try:
        print("üõ†Ô∏è Erstelle Dimension: dim_base...")
        client.query(sql_dim_base).result()
        
        print("üöÄ Erstelle Fact Table: Fact_Trips...")
        client.query(sql_fact).result()
        
        print(f"‚úÖ Erfolg! Fact_Trips ist nun mit dim_base gemappt.")
        
    except Exception as e:
        print(f"‚ùå Fehler bei der Verarbeitung: {e}")

# Ausf√ºhren
create_fact_layer_v3()

--- Erstelle Layer 3 (Dimensionen) in 'dimensional' ---
‚úÖ Alle Dimensionstabellen (Vendor, Payment, Rate, Type, Shared, Location) wurden erstellt.


** Create Dimensional **

In [22]:
# Zelle 7: QA Check - Sind FHV Locations da?
def check_fhv_locations():
    print("--- üîç QA CHECK: Missing Location IDs by System ---")
    
    # Wir pr√ºfen Fact_Trips, da dies die Basis f√ºr deine Analysen ist
    query = f"""
    SELECT 
        source_system,
        COUNT(*) as total_trips,
        
        -- 1. Check auf ECHTE NULL-Werte
        COUNTIF(pickup_location_id IS NULL) as pu_is_null,
        COUNTIF(dropoff_location_id IS NULL) as do_is_null,
        
        -- 2. Check auf 'Unknown' IDs (264=NV, 265=NA)
        -- Das ist bei FHV oft der Fall!
        COUNTIF(pickup_location_id IN (264, 265)) as pu_is_unknown,
        COUNTIF(dropoff_location_id IN (264, 265)) as do_is_unknown,
        
        -- Prozentualer Anteil (NULL + Unknown)
        ROUND((COUNTIF(pickup_location_id IS NULL OR pickup_location_id IN (264, 265)) / COUNT(*)) * 100, 1) as pct_bad_pickup
        
    FROM `{PROJECT_ID}.dimensional.Fact_Trips`
    GROUP BY source_system
    ORDER BY source_system
    """
    
    try:
        df = client.query(query).to_dataframe()
        print(df.to_string(index=False))
        
        # Kurze Analyse f√ºr dich ausgeben
        print("\n--- ANALYSE ---")
        fhv_row = df[df['source_system'] == 'FHV']
        if not fhv_row.empty:
            missing_pct = fhv_row.iloc[0]['pct_bad_pickup']
            if missing_pct > 50:
                print(f"‚ö†Ô∏è ACHTUNG: {missing_pct}% der FHV-Trips haben keine g√ºltige Location!")
                print("   -> Das ist normal f√ºr √§ltere FHV-Daten, schr√§nkt aber Heatmaps ein.")
            else:
                print(f"‚úÖ FHV Daten sehen gut aus ({missing_pct}% Missing).")
                
    except Exception as e:
        print(f"‚ùå Fehler bei der Abfrage: {e}")

check_fhv_locations()

--- üîç QA CHECK: Missing Location IDs by System ---
source_system  total_trips  pu_is_null  do_is_null  pu_is_unknown  do_is_unknown  pct_bad_pickup
          FHV    510080635           0           0      148848097       99316788            29.2
        GREEN     67448057           0           0         115970         250756             0.2
       YELLOW    172064982           0           0        2748871        3007044             1.6

--- ANALYSE ---
‚úÖ FHV Daten sehen gut aus (29.2% Missing).




In [52]:
def create_fact_layer():
    DIM_DATASET = "dimensional"
    CAN_DATASET = "canonical"
    
    source_table = f"{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi"
    fact_table = f"{PROJECT_ID}.{DIM_DATASET}.Fact_Trips"
    dim_base_table = f"{PROJECT_ID}.{DIM_DATASET}.dim_base"

    print(f"--- üõ†Ô∏è Repariere Fact Table: {fact_table} ---")

    sql_fact = f"""
    CREATE TABLE `{fact_table}`
    PARTITION BY DATETIME_TRUNC(pickup_datetime, MONTH)
    CLUSTER BY base_type, vendor_id
    AS
    SELECT
        t.trip_id,
        t.vendor_id,
        t.dispatching_base_nummer,
        
        -- Mapping-Daten aus der dim_base (direkt in die Fact f√ºr Performance)
        IFNULL(b.base_name, 'Unknown') AS base_name,
        IFNULL(b.base_type, 'Unknown') AS base_type,
        IFNULL(b.is_currently_active, FALSE) AS is_active_base,
        
        -- DATE & TIME
        DATE(t.pickup_datetime) AS pickup_date_key, 
        t.pickup_datetime,
        t.dropoff_datetime,
        
        -- Locations
        COALESCE(t.pickup_location_id, 263) AS pickup_location_id, 
        COALESCE(t.dropoff_location_id, 263) AS dropoff_location_id,
        
        -- Payment, Rate & TYPES (WICHTIG: Diese haben in der View gefehlt!)
        IFNULL(t.payment_type, 0) AS payment_type_id,
        IFNULL(t.RatecodeID, 99) AS rate_code_id,
        t.Trip_type AS trip_type_id,           -- <--- Wieder hinzugef√ºgt
        IFNULL(t.SR_Flag, FALSE) AS sr_flag,   -- <--- Wieder hinzugef√ºgt
        
        -- Measures
        COALESCE(t.passenger_count, 0) AS passenger_count,
        COALESCE(t.trip_distance, 0) AS trip_distance,
        COALESCE(t.total_amount, 0) AS total_amount,
        COALESCE(t.fare_amount, 0) AS fare_amount,
        COALESCE(t.tip_amount, 0) AS tip_amount,
        
        -- System & Berechnung
        t.source_system,
        t.dq_issue_flag,
        TIMESTAMP_DIFF(t.dropoff_datetime, t.pickup_datetime, MINUTE) AS duration_minutes
    FROM `{source_table}` t
    LEFT JOIN `{dim_base_table}` b 
      ON t.dispatching_base_nummer = b.base_number
    """
    
    try:
        print(f"üóëÔ∏è L√∂sche alte Tabelle {fact_table}...")
        client.query(f"DROP TABLE IF EXISTS `{fact_table}`").result()
        
        print("üöÄ Erstelle Fact Table neu (inkl. trip_type_id)...")
        client.query(sql_fact).result()
        print(f"‚úÖ Fact_Trips ist jetzt vollst√§ndig.")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_fact_layer()

--- üõ†Ô∏è Repariere Fact Table: taxi-bi-project.dimensional.Fact_Trips ---
üóëÔ∏è L√∂sche alte Tabelle taxi-bi-project.dimensional.Fact_Trips...
üöÄ Erstelle Fact Table neu (inkl. trip_type_id)...
‚úÖ Fact_Trips ist jetzt vollst√§ndig.


In [53]:
def create_final_gold_view_v3():
    print("üèÜ Erstelle die finale Gold-View...")
    
    view_sql = f"""
    CREATE OR REPLACE VIEW `{PROJECT_ID}.dimensional.view_taxi_bi_final` AS
    SELECT 
        f.trip_id,
        f.pickup_datetime,
        f.dropoff_datetime,
        f.pickup_date_key,
        
        -- Basis Info
        f.dispatching_base_nummer as base_id,
        f.base_name,
        f.base_type,
        f.is_active_base,
        
        -- Dimensionen Joins
        v.vendor_name,
        p.payment_description as payment_method,
        r.rate_description as rate_type,
        t.trip_description as trip_category,
        s.sr_description as ride_type,
        
        -- Locations
        loc_pu.borough as pickup_borough,
        loc_pu.zone as pickup_zone,
        loc_do.borough as dropoff_borough,
        loc_do.zone as dropoff_zone,
        
        -- Kennzahlen
        f.passenger_count,
        f.trip_distance,
        f.fare_amount,
        f.tip_amount,
        f.total_amount,
        f.duration_minutes,
        
        f.source_system,
        f.dq_issue_flag
        
    FROM `{PROJECT_ID}.dimensional.Fact_Trips` f
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_vendor` v ON f.vendor_id = v.vendor_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_payment_type` p ON f.payment_type_id = p.payment_type_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_rate_code` r ON f.rate_code_id = r.rate_code_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_trip_type` t ON f.trip_type_id = t.trip_type_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_shared_ride` s ON f.sr_flag = s.sr_flag
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_location` loc_pu ON f.pickup_location_id = loc_pu.location_id
    LEFT JOIN `{PROJECT_ID}.dimensional.dim_location` loc_do ON f.dropoff_location_id = loc_do.location_id
    """
    
    try:
        client.query(view_sql).result()
        print("‚úÖ Gold-View 'view_taxi_bi_final' erfolgreich erstellt!")
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

create_final_gold_view_v3()

üèÜ Erstelle die finale Gold-View...
‚úÖ Gold-View 'view_taxi_bi_final' erfolgreich erstellt!


In [46]:
# Zelle 9: Der "Panic-Check" - Woher kommen die NULLs?
sql_debug = f"""
SELECT 
    source_system,
    COUNT(*) as total_fahrten,
    COUNTIF(fare_amount IS NULL) as fahrten_ohne_preis,
    ROUND(COUNTIF(fare_amount IS NULL) / COUNT(*) * 100, 2) as prozent_ohne_preis
FROM `{PROJECT_ID}.{CAN_DATASET}.canonical_unified_taxi`
GROUP BY source_system
"""
print(client.query(sql_debug).to_dataframe())

  source_system  total_fahrten  fahrten_ohne_preis  prozent_ohne_preis
0        YELLOW      172064982                   0                 0.0
1         GREEN       67448057                   0                 0.0
2           FHV      510080635           510080635               100.0


