In [None]:
# Zelle 1: Setup & Config
from google.cloud import bigquery
from google.api_core.exceptions import NotFound

import pandas as pd
import uuid
import datetime
import matplotlib.pyplot as plt

# sorgt daf√ºr, dass Plots im Notebook angezeigt werden
%matplotlib inline


In [None]:
# Zelle 1 & 2: Setup mit automatischer Regionen-Korrektur
from google.cloud import bigquery
from google.api_core.exceptions import NotFound
import logging
import uuid
import datetime

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# --- CONFIG ---
PROJECT_ID = "taxi-bi-project" # Deine ID aus dem Log
SOURCE_DATASET = "staging"     # <--- Laut deinem Fehlerlog hei√üt es "staging"!
TARGET_DATASET = "canonical"   

# Tabellen
TARGET_TABLE = "canonical_unified_taxi"
ERROR_TABLE = "error_records"
LOG_TABLE = "etl_process_log"

client = bigquery.Client(project=PROJECT_ID)

print("--- DIAGNOSE & REPARATUR ---")
try:
    src_ds_ref = client.get_dataset(f"{PROJECT_ID}.{SOURCE_DATASET}")
    CORRECT_LOCATION = src_ds_ref.location
    print(f"‚úÖ QUELLE GEFUNDEN: Dataset '{SOURCE_DATASET}' liegt in Region: '{CORRECT_LOCATION}'")
except Exception as e:
    print(f"‚ùå KRITISCHER FEHLER: Konnte Quell-Dataset '{SOURCE_DATASET}' nicht finden!")
    print("Bitte pr√ºfe: Hei√üt dein Dataset in BigQuery wirklich 'staging' oder 'taxi_dwh'?")
    raise e

# 2. Pr√ºfen, ob das Ziel-Dataset 'canonical' falsch liegt
target_dataset_id = f"{PROJECT_ID}.{TARGET_DATASET}"
try:
    tgt_ds = client.get_dataset(target_dataset_id)
    if tgt_ds.location != CORRECT_LOCATION:
        print(f"‚ö†Ô∏è KONFLIKT: Ziel '{TARGET_DATASET}' ist in '{tgt_ds.location}', muss aber nach '{CORRECT_LOCATION}'.")
        print("   L√∂sche falsches Dataset...")
        client.delete_dataset(target_dataset_id, delete_contents=True, not_found_ok=True)
        print("   Gel√∂scht. Wird neu erstellt.")
        tgt_ds = None
except NotFound:
    tgt_ds = None

# 3. Ziel-Dataset korrekt neu erstellen
if not tgt_ds:
    new_ds = bigquery.Dataset(target_dataset_id)
    new_ds.location = CORRECT_LOCATION # <--- Hier zwingen wir die richtige Region!
    client.create_dataset(new_ds)
    print(f"ZIEL ERSTELLT: Dataset '{TARGET_DATASET}' erfolgreich in Region '{CORRECT_LOCATION}' angelegt.")
else:
    print(f"OK: Ziel-Dataset liegt bereits korrekt in '{CORRECT_LOCATION}'.")

# Globale Variablen aktualisieren
table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{TARGET_TABLE}"
error_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{ERROR_TABLE}"
log_table_ref = f"{PROJECT_ID}.{TARGET_DATASET}.{LOG_TABLE}"

print("\nReady. Bitte jetzt Zelle 3 ausf√ºhren.")

Python(78274) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


--- DIAGNOSE & REPARATUR ---
‚úÖ QUELLE GEFUNDEN: Dataset 'staging' liegt in Region: 'EU'
OK: Ziel-Dataset liegt bereits korrekt in 'EU'.

Ready. Bitte jetzt Zelle 3 ausf√ºhren.


In [35]:
# Zelle 2: Schema Definition (Update gem√§√ü Schema.csv)
def create_all_tables():
    # 1. Erweitertes Schema (Inkl. neuer Base-Nummern & Flags)
    base_schema = [
        # Metadaten
        bigquery.SchemaField("trip_id", "STRING", description="Unique ID"),
        bigquery.SchemaField("source_system", "STRING"),
        bigquery.SchemaField("load_date", "TIMESTAMP"),
        bigquery.SchemaField("vendor_id", "STRING"), 
        
        #Base Nummern
        bigquery.SchemaField("Affiliated_base_number", "STRING"),
        bigquery.SchemaField("dispatching_base_nummer", "STRING"),
        
        # Zeit & Ort (Namen zur√ºck ge√§ndert gem√§√ü Schema.csv)
        bigquery.SchemaField("pickup_datetime", "TIMESTAMP"),
        bigquery.SchemaField("dropoff_datetime", "TIMESTAMP"),
        bigquery.SchemaField("pickup_location_id", "INTEGER"), 
        bigquery.SchemaField("dropoff_location_id", "INTEGER"), 
        
        # Details
        bigquery.SchemaField("passenger_count", "INTEGER"),
        bigquery.SchemaField("trip_distance", "FLOAT"),
        bigquery.SchemaField("store_and_fwd_flag", "STRING"),
        
        # RateCode & TripType
        bigquery.SchemaField("RatecodeID", "INTEGER"),
        bigquery.SchemaField("Trip_type", "INTEGER"),
        
        bigquery.SchemaField("SR_Flag", "BOOLEAN", description="1=Shared, NULL=Non-Shared"), # War is_shared_ride
        
        # Finanzen (Basis)
        bigquery.SchemaField("fare_amount", "FLOAT"),
        bigquery.SchemaField("tip_amount", "FLOAT"),
        bigquery.SchemaField("total_amount", "FLOAT"),
        bigquery.SchemaField("payment_type", "INTEGER"),
        
        # Finanzen (Erweitert)
        bigquery.SchemaField("extra", "FLOAT"),
        bigquery.SchemaField("mta_tax", "FLOAT"),
        bigquery.SchemaField("tolls_amount", "FLOAT"),
        bigquery.SchemaField("improvement_surcharge", "FLOAT"),
        bigquery.SchemaField("congestion_surcharge", "FLOAT"),
        bigquery.SchemaField("Airport_fee", "FLOAT"), # Gro√ügeschrieben laut Schema
        bigquery.SchemaField("ehail_fee", "FLOAT"),   # NEU
        
        # Validation Flag
        bigquery.SchemaField("dq_issue_flag", "BOOLEAN") 
    ]

    # Error Tabelle Schema
    error_schema = base_schema + [bigquery.SchemaField("rejection_reason", "STRING")]

    tables_to_create = [
        (table_ref, base_schema),
        (error_table_ref, error_schema)
    ]

    for t_ref, t_schema in tables_to_create:
        try:
            # Clean Slate: Alte Version l√∂schen
            client.delete_table(t_ref, not_found_ok=True)
            
            t = bigquery.Table(t_ref, schema=t_schema)
            t.time_partitioning = bigquery.TimePartitioning(field="pickup_datetime")
            client.create_table(t)
            print(f"‚úÖ Tabelle aktualisiert: {t_ref.split('.')[-1]}")
        except Exception as e:
            print(f"‚ùå Fehler: {e}")

create_all_tables()

‚úÖ Tabelle aktualisiert: canonical_unified_taxi
‚úÖ Tabelle aktualisiert: error_records


In [41]:
# Zelle 3: ETL Pipeline (Final: Quality Checks + Deduplication + Partitioning)
def run_etl_split_logic():
    print(f"Starte ETL Job (Task 2a Compliance) in Region: {CORRECT_LOCATION}...") 
    
    START_DATE = '2015-01-01'
    
    query = f"""
    BEGIN
        -- 1. HARD CLEANUP (Erlaubt Schema-√Ñnderung)
        DROP TABLE IF EXISTS `{table_ref}`;
        DROP TABLE IF EXISTS `{error_table_ref}`;

        -- 2. Temp Table mit Logik & Deduplizierung
        CREATE TEMP TABLE temp_all_trips AS
        WITH raw_combined AS (
            -- (A) YELLOW
            SELECT 
                'YELLOW' as source, CAST(VendorID AS STRING) as vid,
                CAST(tpep_pickup_datetime AS TIMESTAMP) as t_pick, CAST(tpep_dropoff_datetime AS TIMESTAMP) as t_drop,
                PULocationID as loc_pu, DOLocationID as loc_do,
                IFNULL(passenger_count, 1) as pax, trip_distance as dist, IFNULL(store_and_fwd_flag, 'N') as flag,
                IFNULL(RatecodeID, 99) as ratecode, NULL as trip_type,
                NULL as aff_base, NULL as disp_base,
                IFNULL(payment_type, 0) as pay_type, FALSE as shared,
                GREATEST(IFNULL(fare_amount, 0), 0) as fare, 
                GREATEST(IFNULL(tip_amount, 0), 0) as tip, 
                GREATEST(IFNULL(total_amount, 0), 0) as total,
                GREATEST(IFNULL(extra, 0), 0) as extra,
                GREATEST(IFNULL(mta_tax, 0), 0) as mta,
                GREATEST(IFNULL(tolls_amount, 0), 0) as tolls,
                GREATEST(IFNULL(improvement_surcharge, 0), 0) as imp_sur,
                GREATEST(IFNULL(congestion_surcharge, 0), 0) as cong_sur,
                GREATEST(IFNULL(airport_fee, 0), 0) as air_fee,
                NULL as ehail
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.yellow_staging_unified`
            
            UNION ALL
            
            -- (B) GREEN
            SELECT 
                'GREEN', CAST(VendorID AS STRING),
                CAST(lpep_pickup_datetime AS TIMESTAMP), CAST(lpep_dropoff_datetime AS TIMESTAMP),
                PULocationID, DOLocationID,
                IFNULL(passenger_count, 1), trip_distance, IFNULL(store_and_fwd_flag, 'N'),
                IFNULL(RatecodeID, 99), CAST(trip_type AS INT64),
                NULL, NULL,
                IFNULL(payment_type, 0), FALSE,
                GREATEST(IFNULL(fare_amount, 0), 0), 
                GREATEST(IFNULL(tip_amount, 0), 0), 
                GREATEST(IFNULL(total_amount, 0), 0),
                GREATEST(IFNULL(extra, 0), 0),
                GREATEST(IFNULL(mta_tax, 0), 0),
                GREATEST(IFNULL(tolls_amount, 0), 0),
                GREATEST(IFNULL(improvement_surcharge, 0), 0),
                GREATEST(IFNULL(congestion_surcharge, 0), 0),
                0, GREATEST(IFNULL(ehail_fee, 0), 0)
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.green_staging_unified`

            UNION ALL

            -- (C) FHV
            SELECT 
                'FHV', dispatching_base_num,
                CAST(pickup_datetime AS TIMESTAMP), CAST(dropOff_datetime AS TIMESTAMP),
                PULocationID, DOLocationID,
                NULL, NULL, 'N',
                99, NULL,
                Affiliated_base_number, dispatching_base_num,
                0, IF(CAST(SR_Flag AS STRING)='1', TRUE, FALSE),
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.fhv_staging_unified`
        ),
        
        calculated_trips AS (
            SELECT
                -- Eindeutige ID generieren (Criterion 3 Vorbereitung)
                FARM_FINGERPRINT(CONCAT(source, CAST(t_pick AS STRING), IFNULL(vid,''))) as trip_id,
                source as source_system,
                CURRENT_TIMESTAMP() as load_date,
                vid as vendor_id,
                aff_base as Affiliated_base_number,
                disp_base as dispatching_base_nummer,
                t_pick as pickup_datetime,
                t_drop as dropoff_datetime,
                loc_pu as PULocationID,
                loc_do as DOLocationID,
                pax as passenger_count,
                dist as trip_distance,
                flag as store_and_fwd_flag,
                ratecode as RatecodeID,
                trip_type as Trip_type,
                shared as SR_Flag,
                fare as fare_amount,
                tip as tip_amount,
                total as total_amount,
                pay_type as payment_type,
                extra, mta as mta_tax, tolls as tolls_amount,
                imp_sur as improvement_surcharge, cong_sur as congestion_surcharge,
                air_fee as Airport_fee, ehail as ehail_fee,
                
                -- Quality Check Logic (Criterion 1)
                CASE
                    WHEN t_pick IS NULL OR t_drop IS NULL THEN 'Missing Timestamps'
                    WHEN t_pick > t_drop THEN 'Negative Duration'
                    WHEN t_pick = t_drop THEN 'Zero Duration' 
                    WHEN t_pick < TIMESTAMP('{START_DATE}') THEN 'Date too old'
                    WHEN t_pick > CURRENT_TIMESTAMP() THEN 'Future Date'
                    WHEN source IN ('YELLOW', 'GREEN') AND (dist IS NULL OR dist <= 0.0) THEN 'Zero Distance'
                    WHEN source IN ('YELLOW', 'GREEN') AND (fare IS NULL OR fare <= 0.01) THEN 'Zero/Low Fare'
                    WHEN dist > 500 THEN 'Impossible Distance > 500'
                    ELSE 'VALID'
                END as row_status

            FROM raw_combined
        )

        -- Deduplizierung anwenden (Criterion 3: "duplicates are completely eliminated")
        SELECT * EXCEPT(rn)
        FROM (
            SELECT 
                *,
                ROW_NUMBER() OVER (PARTITION BY trip_id ORDER BY load_date DESC) as rn
            FROM calculated_trips
        )
        WHERE rn = 1; -- Nimmt nur den neuesten Eintrag pro ID, l√∂scht Duplikate

        -- 3. Canonical Table (Nur Valid & Unique)
        CREATE TABLE `{table_ref}`
        PARTITION BY DATE_TRUNC(pickup_datetime, MONTH)
        CLUSTER BY source_system, vendor_id
        AS SELECT * EXCEPT(row_status) FROM temp_all_trips WHERE row_status = 'VALID';

        -- 4. Error Table (Alles andere, auch Unique)
        CREATE TABLE `{error_table_ref}`
        PARTITION BY DATE(load_date)
        AS SELECT * EXCEPT(row_status), row_status as rejection_reason FROM temp_all_trips WHERE row_status != 'VALID';
    END;
    """

    job_config = bigquery.QueryJobConfig()
    try:
        query_job = client.query(query, location=CORRECT_LOCATION, job_config=job_config)
        query_job.result()
        
        cnt_valid = client.get_table(table_ref).num_rows
        cnt_error = client.get_table(error_table_ref).num_rows
        print(f"‚úÖ ETL Fertig! Alle Task 2a Kriterien erf√ºllt.")
        print(f"   -> Canonical: {cnt_valid} Trips (Valid & Unique)")
        print(f"   -> Rejected:  {cnt_error} Trips (Quality Issues)")
        
    except Exception as e:
        print(f"‚ùå Fehler: {e}")

run_etl_split_logic()

Starte ETL Job (Task 2a Compliance) in Region: EU...
‚úÖ ETL Fertig! Alle Task 2a Kriterien erf√ºllt.
   -> Canonical: 543991014 Trips (Valid & Unique)
   -> Rejected:  214536158 Trips (Quality Issues)


In [37]:
# Zelle 4: Quality Check
print("--- CLEAN DATA SAMPLE ---")
print(client.query(f"SELECT * FROM `{table_ref}` LIMIT 3").to_dataframe().T)

print("\n--- ERROR DATA SAMPLE  ---")
try:
    err_df = client.query(f"SELECT rejection_reason, count(*) as cnt FROM `{error_table_ref}` GROUP BY 1").to_dataframe()
    print(err_df)
except:
    print("Keine Fehler gefunden (Tabelle leer).")

--- CLEAN DATA SAMPLE ---




                                                        0                                 1                                 2
trip_id                              -4421157812598186384               2293086960074000412                559149138261532980
source_system                                         FHV                               FHV                               FHV
load_date                2025-12-30 15:49:50.872862+00:00  2025-12-30 15:49:50.872862+00:00  2025-12-30 15:49:50.872862+00:00
vendor_id                                          B00001                            B00001                            B00001
Affiliated_base_number                             B00001                            B00001                            B00001
dispatching_base_nummer                            B00001                            B00001                            B00001
pickup_datetime                 2019-12-04 12:30:00+00:00         2019-12-16 13:30:00+00:00         2019-12-09 09:22:0



In [40]:
# Zelle 5: Data Quality Check (Stichproben pro System)
import pandas as pd

def check_data_samples():
    print(f"--- üîç DATA INSPECTION (Tabelle: {table_ref}) ---\n")
    
    # 1. YELLOW CHECK ("Cellos")
    # Fokus: Haben sie Ratecode? Sind Finanzen da?
    print("üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):")
    sql_yellow = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'YELLOW'
    LIMIT 15
    """
    try:
        df_y = client.query(sql_yellow).to_dataframe()
        display(df_y) # Oder print(df_y) falls kein Jupyter
    except Exception as e:
        print(e)

    # 2. GREEN CHECK
    # Fokus: Haben sie Trip_type und Ehail_fee? (Spezifisch f√ºr Green)
    print("\nüíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):")
    sql_green = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'GREEN'
    LIMIT 5
    """
    try:
        df_g = client.query(sql_green).to_dataframe()
        display(df_g)
    except Exception as e:
        print(e)

    # 3. FHV CHECK
    # Fokus: Sind die neuen Base-Nummern da? Sind Preise WIRKLICH NULL?
    print("\nüñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):")
    sql_fhv = f"""
    SELECT 
        *
    FROM `{table_ref}`
    WHERE source_system = 'FHV'
    LIMIT 5
    """
    try:
        df_f = client.query(sql_fhv).to_dataframe()
        display(df_f)
    except Exception as e:
        print(e)

    # 4. STATISTIK
    print("\nüìä VERTEILUNG NACH SYSTEM:")
    sql_stats = f"""
    SELECT source_system, COUNT(*) as count 
    FROM `{table_ref}` 
    GROUP BY source_system
    """
    try:
        print(client.query(sql_stats).to_dataframe())
    except:
        pass

check_data_samples()

--- üîç DATA INSPECTION (Tabelle: taxi-bi-project.canonical.canonical_unified_taxi) ---

üöï YELLOW TAXI SAMPLE (Sollte Finanzen & Ratecode haben):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,store_and_fwd_flag,RatecodeID,Trip_type,SR_Flag,fare_amount,tip_amount,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,-4541716367343958441,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-13 22:32:58+00:00,2022-06-13 22:52:27+00:00,132,226,2,13.66,N,5,,False,0.28,0.0,0.58,2,0.0,0.0,0.0,0.3,0.0,0.0,,False
1,-1494165756183860661,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-08 14:11:44+00:00,2022-06-08 14:11:51+00:00,138,138,1,0.03,N,5,,False,0.5,0.0,2.05,2,0.0,0.0,0.0,0.3,0.0,1.25,,False
2,2760980750506435943,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-08 17:46:25+00:00,2022-06-08 17:47:25+00:00,261,261,4,0.18,N,5,,False,0.68,0.0,3.48,2,0.0,0.0,0.0,0.3,2.5,0.0,,False
3,2254533002131040751,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-17 15:04:13+00:00,2022-06-17 15:04:22+00:00,244,244,1,0.08,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
4,-299366019138767966,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-11 15:34:58+00:00,2022-06-11 15:35:14+00:00,116,116,1,0.08,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
5,8821727008390654047,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-25 09:16:59+00:00,2022-06-25 09:17:07+00:00,75,43,1,0.02,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
6,-5221159397971551019,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-06 08:54:43+00:00,2022-06-06 08:55:07+00:00,264,145,1,0.04,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
7,6590085071103482937,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-16 12:38:13+00:00,2022-06-16 12:38:17+00:00,194,194,2,0.03,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
8,6116499101575703046,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-18 12:36:56+00:00,2022-06-18 12:37:37+00:00,74,74,1,0.02,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False
9,-8398583326789544684,YELLOW,2025-12-30 15:49:50.872862+00:00,2,,,2022-06-12 18:30:22+00:00,2022-06-12 18:30:37+00:00,41,41,1,0.01,N,1,,False,2.5,0.0,3.3,2,0.0,0.5,0.0,0.3,0.0,0.0,,False



üíö GREEN TAXI SAMPLE (Sollte Trip_type & ehail_fee haben):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,store_and_fwd_flag,RatecodeID,Trip_type,SR_Flag,fare_amount,tip_amount,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,-9002387679684662587,GREEN,2025-12-30 15:49:50.872862+00:00,2,,,2020-10-02 22:53:09+00:00,2020-10-02 23:20:21+00:00,42,244,1,4.13,N,5,2,False,0.22,0.0,0.52,2,0.0,0.0,0.0,0.3,0.0,0.0,0.0,False
1,-342098132131277727,GREEN,2025-12-30 15:49:50.872862+00:00,2,,,2020-10-28 15:28:58+00:00,2020-10-28 15:29:07+00:00,75,75,1,0.02,N,5,2,False,0.4,0.0,0.7,2,0.0,0.0,0.0,0.3,0.0,0.0,0.0,False
2,-8463771668343842694,GREEN,2025-12-30 15:49:50.872862+00:00,2,,,2020-10-17 02:10:31+00:00,2020-10-17 02:10:55+00:00,244,244,1,0.04,N,5,2,False,0.75,0.0,0.75,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,-6071892078582452105,GREEN,2025-12-30 15:49:50.872862+00:00,2,,,2020-10-12 14:47:31+00:00,2020-10-12 15:28:17+00:00,260,129,1,3.09,N,5,2,False,1.0,0.0,1.3,2,0.0,0.0,0.0,0.3,0.0,0.0,0.0,False
4,-294760539446597371,GREEN,2025-12-30 15:49:50.872862+00:00,2,,,2020-10-11 11:27:10+00:00,2020-10-11 11:42:04+00:00,70,129,1,1.45,N,5,2,False,1.0,0.0,1.3,2,0.0,0.0,0.0,0.3,0.0,0.0,0.0,False



üñ§ FHV SAMPLE (Muss Base-Nummern haben, aber KEINE Preise):




Unnamed: 0,trip_id,source_system,load_date,vendor_id,Affiliated_base_number,dispatching_base_nummer,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,store_and_fwd_flag,RatecodeID,Trip_type,SR_Flag,fare_amount,tip_amount,total_amount,payment_type,extra,mta_tax,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee,ehail_fee,dq_issue_flag
0,7781973870049039549,FHV,2025-12-30 15:49:50.872862+00:00,B01975,B01975,B01975,2016-08-08 10:45:00+00:00,2016-08-08 12:00:00+00:00,,,,,N,99,,False,,,,0,,,,,,,,False
1,-6795874216567223929,FHV,2025-12-30 15:49:50.872862+00:00,B01975,B01975,B01975,2016-08-07 07:40:00+00:00,2016-08-07 10:30:00+00:00,,,,,N,99,,False,,,,0,,,,,,,,False
2,-8571146480725559329,FHV,2025-12-30 15:49:50.872862+00:00,B01975,B01975,B01975,2016-08-09 09:15:00+00:00,2016-08-09 10:30:00+00:00,,,,,N,99,,False,,,,0,,,,,,,,False
3,8192371412234322395,FHV,2025-12-30 15:49:50.872862+00:00,B01975,B01975,B01975,2016-08-10 06:30:00+00:00,2016-08-10 07:15:00+00:00,,,,,N,99,,False,,,,0,,,,,,,,False
4,2634938577144916482,FHV,2025-12-30 15:49:50.872862+00:00,B01975,B01975,B01975,2016-08-04 09:15:00+00:00,2016-08-04 10:00:00+00:00,,,,,N,99,,False,,,,0,,,,,,,,False



üìä VERTEILUNG NACH SYSTEM:
  source_system      count
0        YELLOW  102941825
1           FHV  515372815
2         GREEN   66645691


