In [17]:
# Read dataset
import pandas as pd
df = pd.read_csv("porto.csv")

In [18]:
df.describe()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP
count,1710670.0,364770.0,806579.0,1710670.0,1710670.0
mean,1.388622e+18,24490.363018,30.272381,20000350.0,1388622000.0
std,9180944000000000.0,19624.290043,17.74784,211.2405,9180944.0
min,1.372637e+18,2001.0,1.0,20000000.0,1372637000.0
25%,1.380731e+18,6593.0,15.0,20000170.0,1380731000.0
50%,1.388493e+18,18755.0,27.0,20000340.0,1388493000.0
75%,1.39675e+18,40808.0,49.0,20000520.0,1396750000.0
max,1.404173e+18,63884.0,63.0,20000980.0,1404173000.0


In [19]:
print(df['POLYLINE'].head())
# Count rows where POLYLINE is empty (empty list), None, or NaN

0    [[-8.618643,41.141412],[-8.618499,41.141376],[...
1    [[-8.639847,41.159826],[-8.640351,41.159871],[...
2    [[-8.612964,41.140359],[-8.613378,41.14035],[-...
3    [[-8.574678,41.151951],[-8.574705,41.151942],[...
4    [[-8.645994,41.18049],[-8.645949,41.180517],[-...
Name: POLYLINE, dtype: object


In [20]:
# EDA: TRIP_ID validation and total trips

# Total number of rows (trips)
total_rows = len(df)

# TRIP_ID: expect 19-digit numeric IDs
trip_id_str = df['TRIP_ID'].astype('string')
trip_id_is_19_digits = trip_id_str.str.fullmatch(r'\d{19}')
valid_19 = int(trip_id_is_19_digits.sum())
invalid_19 = int((~trip_id_is_19_digits).sum())
unique_trip_ids = df['TRIP_ID'].nunique(dropna=True)

# Duplicate TRIP_ID analysis
duplicate_rows = total_rows - unique_trip_ids
trip_id_counts = df['TRIP_ID'].value_counts(dropna=False)
duplicated_trip_ids = trip_id_counts[trip_id_counts > 1]

print(f"Total trips (rows): {total_rows}")
print("TRIP_ID validation (expecting 19-digit numeric):")
print(f"  Valid 19-digit TRIP_IDs: {valid_19}")
print(f"  Invalid TRIP_IDs (not 19 digits): {invalid_19}")
print(f"  Unique TRIP_IDs: {unique_trip_ids}")
print(f"  Duplicate rows (same TRIP_ID appearing multiple times): {duplicate_rows}")

if duplicated_trip_ids.empty:
    print("No duplicated TRIP_IDs detected.")
else:
    print("Top duplicated TRIP_IDs (count > 1):")
    print(duplicated_trip_ids.head(10))

    # Assess whether duplicates are true copies (identical across other columns) or conflicting
    dups = df[df.duplicated('TRIP_ID', keep=False)].copy()
    grouped = dups.groupby('TRIP_ID', sort=False)

    # Number of unique rows per TRIP_ID when excluding TRIP_ID itself
    unique_rows_per_trip = grouped.apply(lambda g: g.drop(columns=['TRIP_ID']).drop_duplicates().shape[0])

    true_copy_trip_ids = unique_rows_per_trip[unique_rows_per_trip == 1].index
    conflict_trip_ids = unique_rows_per_trip[unique_rows_per_trip > 1].index

    num_trip_ids_true_copies = len(true_copy_trip_ids)
    num_trip_ids_conflicts = len(conflict_trip_ids)

    # Rows that can be safely removed among true copies (keep 1 per TRIP_ID)
    removable_rows_true_copies = int(grouped.size().loc[true_copy_trip_ids].sub(1).sum()) if num_trip_ids_true_copies > 0 else 0

    # Among conflicts, how many have differing TAXI_IDs
    differing_taxi_conflicts = 0
    if num_trip_ids_conflicts > 0:
        differing_taxi_conflicts = int(grouped['TAXI_ID'].nunique().loc[conflict_trip_ids].gt(1).sum())

    print("\nDuplicate TRIP_ID diagnostics:")
    print(f"  TRIP_IDs that are true copies (all other columns identical): {num_trip_ids_true_copies}")
    print(f"  Removable duplicate rows among true copies: {removable_rows_true_copies}")
    print(f"  TRIP_IDs with conflicting data (differences in other columns): {num_trip_ids_conflicts}")
    print(f"    of which have differing TAXI_IDs: {differing_taxi_conflicts}")

    # Show a few examples of conflicting TRIP_IDs with their distinct rows
    if num_trip_ids_conflicts > 0:
        print("\nExamples of conflicting TRIP_IDs (distinct rows shown, up to 3 IDs):")
        for tid in list(conflict_trip_ids)[:3]:
            ex = grouped.get_group(tid).drop(columns=['TRIP_ID']).drop_duplicates()
            print(f"  TRIP_ID={tid} -> {len(ex)} distinct rows:")
            print(ex.head(5))


Total trips (rows): 1710670
TRIP_ID validation (expecting 19-digit numeric):
  Valid 19-digit TRIP_IDs: 1710670
  Invalid TRIP_IDs (not 19 digits): 0
  Unique TRIP_IDs: 1710589
  Duplicate rows (same TRIP_ID appearing multiple times): 81
Top duplicated TRIP_IDs (count > 1):
TRIP_ID
1397172149620000454    3
1402085470620000527    2
1389002485620000685    2
1393869017620000066    2
1389782974620000562    2
1386942349620000080    2
1378830230620000435    2
1391586783620000484    2
1381765723620000392    2
1403407180620000242    2
Name: count, dtype: int64

Duplicate TRIP_ID diagnostics:
  TRIP_IDs that are true copies (all other columns identical): 2
  Removable duplicate rows among true copies: 2
  TRIP_IDs with conflicting data (differences in other columns): 78
    of which have differing TAXI_IDs: 0

Examples of conflicting TRIP_IDs (distinct rows shown, up to 3 IDs):
  TRIP_ID=1372702836620000080 -> 2 distinct rows:
     CALL_TYPE  ORIGIN_CALL  ORIGIN_STAND   TAXI_ID   TIMESTAMP DAY_

  unique_rows_per_trip = grouped.apply(lambda g: g.drop(columns=['TRIP_ID']).drop_duplicates().shape[0])
