In [1]:
# Load the trips dataset: parse "porto.csv" (resolved relative to the current
# working directory) into the DataFrame `df` using pandas' default options.
# NOTE(review): column dtypes are inferred here; if TRIP_ID holds 19-digit IDs
# and any value is blank, pandas would parse the column as float64 and lose
# digits — confirm, and consider dtype={"TRIP_ID": "string"} if so.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="porto.csv")

In [4]:
# Preview the first ten rows of the loaded frame (rendered as the cell output).
df.head(n=10)

In [None]:
# EDA: TRIP_ID validation and total trips
#
# Reads the DataFrame `df` loaded earlier and prints:
#   * the total number of rows,
#   * how many TRIP_ID values are / are not 19-digit numeric strings,
#   * how many rows repeat an already-seen TRIP_ID,
#   * for repeated TRIP_IDs, whether the repeats are exact row copies or carry
#     conflicting values in other columns (and how many conflicts involve
#     differing TAXI_IDs).
# `df` itself is never modified.

# Total number of rows (trips)
total_rows = len(df)

# TRIP_ID: expect 19-digit numeric IDs
trip_id_str = df['TRIP_ID'].astype('string')
# na=False: a missing TRIP_ID is not a valid ID. Without it the match result is
# <NA> for missing values, which both .sum() calls below skip, so those rows
# would be reported as neither valid nor invalid.
trip_id_is_19_digits = trip_id_str.str.fullmatch(r'\d{19}', na=False)
valid_19 = int(trip_id_is_19_digits.sum())
invalid_19 = int((~trip_id_is_19_digits).sum())
unique_trip_ids = df['TRIP_ID'].nunique(dropna=True)

# Duplicate TRIP_ID analysis
# Count every occurrence after the first one. (Subtracting the non-null unique
# count from the row count would also count the *first* missing TRIP_ID as a
# duplicate, disagreeing with value_counts/duplicated used below.)
duplicate_rows = int(df['TRIP_ID'].duplicated(keep='first').sum())
trip_id_counts = df['TRIP_ID'].value_counts(dropna=False)
duplicated_trip_ids = trip_id_counts[trip_id_counts > 1]

print(f"Total trips (rows): {total_rows}")
print("TRIP_ID validation (expecting 19-digit numeric):")
print(f"  Valid 19-digit TRIP_IDs: {valid_19}")
print(f"  Invalid TRIP_IDs (not 19 digits): {invalid_19}")
print(f"  Unique TRIP_IDs: {unique_trip_ids}")
print(f"  Duplicate rows (same TRIP_ID appearing multiple times): {duplicate_rows}")

if duplicated_trip_ids.empty:
    print("No duplicated TRIP_IDs detected.")
else:
    print("Top duplicated TRIP_IDs (count > 1):")
    print(duplicated_trip_ids.head(10))

    # Assess whether duplicates are true copies (identical across other columns) or conflicting
    dups = df[df.duplicated('TRIP_ID', keep=False)].copy()
    # Note: groupby drops missing keys by default, so rows whose TRIP_ID is
    # missing are not part of the per-ID diagnostics below.
    grouped = dups.groupby('TRIP_ID', sort=False)
    rows_per_trip = grouped.size()

    # Number of unique rows per TRIP_ID when excluding TRIP_ID itself.
    # TRIP_ID is constant inside a group, so counting distinct *full* rows per
    # TRIP_ID gives the same number. Doing it with drop_duplicates + size is
    # vectorized and does not rely on groupby.apply handing the grouping column
    # to the applied function (deprecated in pandas 2.2).
    unique_rows_per_trip = dups.drop_duplicates().groupby('TRIP_ID', sort=False).size()

    true_copy_trip_ids = unique_rows_per_trip[unique_rows_per_trip == 1].index
    conflict_trip_ids = unique_rows_per_trip[unique_rows_per_trip > 1].index

    num_trip_ids_true_copies = len(true_copy_trip_ids)
    num_trip_ids_conflicts = len(conflict_trip_ids)

    # Rows that can be safely removed among true copies (keep 1 per TRIP_ID)
    removable_rows_true_copies = int(rows_per_trip.loc[true_copy_trip_ids].sub(1).sum()) if num_trip_ids_true_copies > 0 else 0

    # Among conflicts, how many have differing TAXI_IDs
    differing_taxi_conflicts = 0
    if num_trip_ids_conflicts > 0:
        differing_taxi_conflicts = int(grouped['TAXI_ID'].nunique().loc[conflict_trip_ids].gt(1).sum())

    print("\nDuplicate TRIP_ID diagnostics:")
    print(f"  TRIP_IDs that are true copies (all other columns identical): {num_trip_ids_true_copies}")
    print(f"  Removable duplicate rows among true copies: {removable_rows_true_copies}")
    print(f"  TRIP_IDs with conflicting data (differences in other columns): {num_trip_ids_conflicts}")
    print(f"    of which have differing TAXI_IDs: {differing_taxi_conflicts}")

    # Show a few examples of conflicting TRIP_IDs with their distinct rows
    if num_trip_ids_conflicts > 0:
        print("\nExamples of conflicting TRIP_IDs (distinct rows shown, up to 3 IDs):")
        for tid in list(conflict_trip_ids)[:3]:
            # Select the rows by mask rather than get_group so the TRIP_ID
            # column is guaranteed to be present for the drop below.
            ex = dups.loc[dups['TRIP_ID'] == tid].drop(columns=['TRIP_ID']).drop_duplicates()
            print(f"  TRIP_ID={tid} -> {len(ex)} distinct rows:")
            print(ex.head(5))
