## Download Data

In [1]:
import kagglehub

# Download the two CSV files used in this notebook from the same Kaggle dataset.
# The returned values are later passed to open() and pd.read_csv().
DATASET_HANDLE = "neomatrix369/nyc-taxi-trip-duration-extended"
train_csv = kagglehub.dataset_download(DATASET_HANDLE, path='train/train.csv')
train_extended_csv = kagglehub.dataset_download(DATASET_HANDLE, path='train_extended.csv')

## Basic Assertions

In [2]:
x = 5

# Pass: the condition is true, so the assert statement does nothing
assert x == 5

# Fails: the condition is false, so AssertionError is raised carrying the
# message written after the comma
assert x == 6, 'x is not equal to 6'

AssertionError: x is not equal to 6

In [3]:
# Lists: every item is expected to equal its position + 1 (1, 2, 3, ...).
# The last item is 6 instead of 5, so the assertion fails on purpose.
my_list = [1, 2, 3, 4, 6]
assert all(my_list[i] == i + 1 for i in range(len(my_list))), 'one item is too large'

AssertionError: one item is too large

In [4]:
import csv

# Validate expected columns.
# `expected_columns` is a placeholder, so this check fails against the real
# file and the error message lists the header that was actually found.
# newline='' is what the csv module documentation asks for when a file is
# handed to a csv reader (otherwise newlines inside quoted fields are misread).
with open(train_extended_csv, newline='') as file:
  reader = csv.DictReader(file)
  expected_columns = ['column1', 'column2']
  assert reader.fieldnames == expected_columns, f'Expected columns: {expected_columns}, but got: {reader.fieldnames}'

AssertionError: Expected columns: ['column1', 'column2'], but got: ['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag', 'trip_duration', 'pickup_district', 'pickup_neighbourhood', 'dropoff_district', 'dropoff_neighbourhood', 'pickup_geonumber', 'dropoff_geonumber', 'day_period', 'month', 'year', 'season', 'weekday_or_weekend', 'regular_day_or_holiday', 'pickup_hour', 'day_name', 'financial_quarter']

In [5]:
# Validate passenger count: stops at the first row whose count is not positive.
# newline='' follows the csv module documentation for files read by csv readers.
with open(train_extended_csv, newline='') as file:
  reader = csv.DictReader(file)
  for row in reader:
    assert int(row['passenger_count']) > 0, f"Invalid passenger count: {row['passenger_count']}"

AssertionError: Invalid passenger count: 0

In [6]:
%%time

# Count trips without passengers by streaming the file row by row.
# newline='' follows the csv module documentation for files read by csv readers.
with open(train_extended_csv, newline='') as file:
  reader = csv.DictReader(file)
  trips_without_passenger = sum(1 for row in reader if int(row['passenger_count']) == 0)

print('Trips without passenger', trips_without_passenger)

Trips without passenger 60
CPU times: user 3.8 s, sys: 67.9 ms, total: 3.86 s
Wall time: 3.88 s


In [7]:
%%time

# Count trips without passengers using Pandas
import pandas as pd

# Load the whole file once, then select the zero-passenger rows with a boolean mask
trips = pd.read_csv(train_extended_csv)
no_passenger_mask = trips['passenger_count'] == 0
trips_without_passenger = trips[no_passenger_mask]
print('Trips without passenger', len(trips_without_passenger))

Trips without passenger 60
CPU times: user 2.19 s, sys: 187 ms, total: 2.38 s
Wall time: 2.41 s


## Index Equality

In [8]:
import pandas.testing as pdt

# The two indexes differ in their last label (3 vs 4), so the check raises on purpose
index1 = pd.Index([1, 2, 3])
index2 = pd.Index([1, 2, 4])
pdt.assert_index_equal(index1, index2)

AssertionError: Index are different

Index values are different (33.33333 %)
[left]:  Index([1, 2, 3], dtype='int64')
[right]: Index([1, 2, 4], dtype='int64')

In [9]:
# Same labels in a different order: the default comparison is order-sensitive, so this fails
index1 = pd.Index([1, 2, 3])
index2 = pd.Index([3, 2, 1])
pdt.assert_index_equal(index1, index2)

AssertionError: Index are different

Index values are different (66.66667 %)
[left]:  Index([1, 2, 3], dtype='int64')
[right]: Index([3, 2, 1], dtype='int64')

In [10]:
# Same labels in a different order; check_order=False lets the comparison pass
index1, index2 = pd.Index([1, 2, 3]), pd.Index([3, 2, 1])
pdt.assert_index_equal(index1, index2, check_order=False)
print('Data is valid')

Data is valid


In [11]:
# The float labels differ by 0.01-0.03; the default comparison reports them as different
index1 = pd.Index([1.0, 2.0, 3.0])
index2 = pd.Index([1.01, 2.02, 3.03])
pdt.assert_index_equal(index1, index2)

AssertionError: Index are different

Index values are different (100.0 %)
[left]:  Index([1.0, 2.0, 3.0], dtype='float64')
[right]: Index([1.01, 2.02, 3.03], dtype='float64')

In [12]:
# The largest gap between the labels is 0.03, which is inside the absolute
# tolerance of 0.1 once exact checking is switched off
index1, index2 = pd.Index([1.0, 2.0, 3.0]), pd.Index([1.01, 2.02, 3.03])
pdt.assert_index_equal(index1, index2, check_exact=False, atol=0.1)
print('Data is valid')

Data is valid


In [13]:
# Load the first 1000 trips with both timestamp columns parsed as datetimes
data = pd.read_csv(train_csv, parse_dates=['pickup_datetime', 'dropoff_datetime'], nrows=1000)

# Compare pickup and dropoff calendar dates. check_names=False ignores the
# differing index names; the check still fails for trips that end on another date.
index1 = pd.Index(data['pickup_datetime'].dt.date)
index2 = pd.Index(data['dropoff_datetime'].dt.date)
pdt.assert_index_equal(index1, index2, check_names=False)

AssertionError: Index are different

Index values are different (0.6 %)
[left]:  Index([2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30,
       2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10,
       ...
       2016-01-12, 2016-02-17, 2016-06-16, 2016-04-11, 2016-05-25, 2016-05-20,
       2016-01-26, 2016-03-17, 2016-02-24, 2016-02-13],
      dtype='object', name='pickup_datetime', length=1000)
[right]: Index([2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30,
       2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10,
       ...
       2016-01-12, 2016-02-17, 2016-06-16, 2016-04-11, 2016-05-25, 2016-05-20,
       2016-01-26, 2016-03-17, 2016-02-24, 2016-02-13],
      dtype='object', name='dropoff_datetime', length=1000)

In [14]:
# Number of positions at which the pickup and dropoff dates differ
(index1 != index2).sum()

np.int64(6)

In [15]:
# Show the timestamps of the trips whose pickup and dropoff dates differ
data[index1 != index2][['pickup_datetime', 'dropoff_datetime']]

Unnamed: 0,pickup_datetime,dropoff_datetime
86,2016-04-06 23:42:26,2016-04-07 00:12:08
531,2016-02-20 04:03:06,2016-02-21 03:33:00
607,2016-03-02 23:41:50,2016-03-03 00:10:43
615,2016-05-29 23:55:03,2016-05-30 00:00:21
722,2016-03-05 23:39:44,2016-03-06 00:05:47
871,2016-03-11 23:57:09,2016-03-12 00:08:30


In [16]:
# Clean up the data: keep only trips that start and end on the same date.
# The mask is computed from `data` itself rather than from index1/index2 built
# in an earlier cell, so re-running this cell does not fail on a length mismatch.
pickup_dates = data['pickup_datetime'].dt.date
dropoff_dates = data['dropoff_datetime'].dt.date
data = data[pickup_dates == dropoff_dates].reset_index(drop=True)

# Now the assertion is valid
index1 = pd.Index(data['pickup_datetime'].dt.date)
index2 = pd.Index(data['dropoff_datetime'].dt.date)
pdt.assert_index_equal(index1, index2, check_names=False)
print('Data is valid')

Data is valid


## Series Equality

In [17]:
# Identical values but different `name` attributes: names are compared by default, so this fails
series1 = pd.Series([1, 2, 3], name='series_1')
series2 = pd.Series([1, 2, 3], name='series_2')
pdt.assert_series_equal(series1, series2)

AssertionError: Series are different

Attribute "name" are different
[left]:  series_1
[right]: series_2

In [18]:
# check_names=False skips the comparison of the `name` attribute, so the check passes
pdt.assert_series_equal(series1, series2, check_names=False)
print('Series are equal')

Series are equal


In [19]:
# Integers vs. strings: the Series dtypes differ (int64 vs object), so this fails
series1 = pd.Series([1, 2, 3], name='series_1')
series2 = pd.Series(['1', '2', '3'], name='series_1')
pdt.assert_series_equal(series1, series2)

AssertionError: Attributes of Series are different

Attribute "dtype" are different
[left]:  int64
[right]: object

In [20]:
# check_dtype=False only skips the comparison of the Series dtypes. The values
# themselves are still compared, and the integer 1 is not equal to the string '1',
# so the assertion still fails and the print below is never reached.
pdt.assert_series_equal(series1, series2, check_dtype=False)
print('Series are equal')

AssertionError: Series are different

Series values are different (100.0 %)
[index]: [0, 1, 2]
[left]:  [1, 2, 3]
[right]: [1, 2, 3]

In [21]:
# Reload the first 1000 trips with both timestamp columns parsed as datetimes
data = pd.read_csv(train_csv, parse_dates=['pickup_datetime', 'dropoff_datetime'], nrows=1000)

# Compare pickup and dropoff calendar dates as Series; fails for trips that
# end on a different date than they started
series1 = data['pickup_datetime'].dt.date
series2 = data['dropoff_datetime'].dt.date
pdt.assert_series_equal(series1, series2, check_names=False)

AssertionError: Series are different

Series values are different (0.6 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30, 2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10, 2016-05-10, 2016-05-15, 2016-02-19, 2016-06-01, 2016-05-27, 2016-05-16, 2016-04-11, 2016-04-14, 2016-06-27, 2016-06-05, 2016-02-28, 2016-04-01, 2016-04-09, 2016-06-25, 2016-06-03, 2016-02-14, 2016-02-27, 2016-06-20, 2016-06-13, 2016-03-23, 2016-05-17, 2016-05-21, 2016-05-10, 2016-04-17, 2016-01-09, 2016-03-11, 2016-01-05, 2016-06-23, 2016-05-27, 2016-05-10, 2016-04-21, 2016-04-14, 2016-03-14, 2016-03-19, 2016-02-12, 2016-06-13, 2016-01-03, 2016-02-26, 2016-05-18, 2016-04-10, 2016-01-19, 2016-05-07, 2016-02-15, 2016-04-13, 2016-05-15, 2016-04-19, 2016-01-02, 2016-04-22, 2016-03-14, 2016-03-19, 2016-02-29, 2016-01-07, 2016-05-13, 2016-05-24, 2016-04-10, 2016-05-21, 2016-02-29, 2016-06-25, 2016-03-18, 2016-02-03, 2016-06-07, 2016-03-16, 2016-02-20, 2016-02-20, 2016-03-24, 2016-03-20, 2016-05-16, 2016-06-17, 2016-03-23, 2016-04-12, 2016-06-11, 2016-02-25, 2016-01-07, 2016-06-05, 2016-06-13, 2016-01-16, 2016-04-06, 2016-05-13, 2016-01-28, 2016-06-24, 2016-05-27, 2016-05-22, 2016-05-18, 2016-03-31, 2016-02-13, 2016-01-20, 2016-06-02, 2016-02-04, 2016-03-30, 2016-04-29, ...]
[right]: [2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30, 2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10, 2016-05-10, 2016-05-15, 2016-02-19, 2016-06-01, 2016-05-27, 2016-05-16, 2016-04-11, 2016-04-14, 2016-06-27, 2016-06-05, 2016-02-28, 2016-04-01, 2016-04-09, 2016-06-25, 2016-06-03, 2016-02-14, 2016-02-27, 2016-06-20, 2016-06-13, 2016-03-23, 2016-05-17, 2016-05-21, 2016-05-10, 2016-04-17, 2016-01-09, 2016-03-11, 2016-01-05, 2016-06-23, 2016-05-27, 2016-05-10, 2016-04-21, 2016-04-14, 2016-03-14, 2016-03-19, 2016-02-12, 2016-06-13, 2016-01-03, 2016-02-26, 2016-05-18, 2016-04-10, 2016-01-19, 2016-05-07, 2016-02-15, 2016-04-13, 2016-05-15, 2016-04-19, 2016-01-02, 2016-04-22, 2016-03-14, 2016-03-19, 2016-02-29, 2016-01-07, 2016-05-13, 2016-05-24, 2016-04-10, 2016-05-21, 2016-02-29, 2016-06-25, 2016-03-18, 2016-02-03, 2016-06-07, 2016-03-16, 2016-02-20, 2016-02-20, 2016-03-24, 2016-03-20, 2016-05-16, 2016-06-17, 2016-03-23, 2016-04-12, 2016-06-11, 2016-02-25, 2016-01-07, 2016-06-05, 2016-06-13, 2016-01-16, 2016-04-07, 2016-05-13, 2016-01-28, 2016-06-24, 2016-05-27, 2016-05-22, 2016-05-18, 2016-03-31, 2016-02-13, 2016-01-20, 2016-06-02, 2016-02-04, 2016-03-30, 2016-04-29, ...]
At positional index 86, first diff: 2016-04-06 != 2016-04-07

In [22]:
# Number of rows where the pickup and dropoff dates differ
(series1 != series2).sum()

np.int64(6)

In [23]:
# Clean up the data: keep only trips that start and end on the same date.
# The mask is recomputed from `data` itself rather than reusing series1/series2
# from an earlier cell, so the cell can be re-run without an indexing error.
series1 = data['pickup_datetime'].dt.date
series2 = data['dropoff_datetime'].dt.date
data = data[series1 == series2].reset_index(drop=True)

# Now the assertion is valid
series1 = data['pickup_datetime'].dt.date
series2 = data['dropoff_datetime'].dt.date
pdt.assert_series_equal(series1, series2, check_names=False)
print('Series are equal')

Series are equal


## DataFrame Equality

In [24]:
# Column B differs in its first row (3 vs 4), so the frames are reported as different
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [1, 2], 'B': [4, 4]})
pdt.assert_frame_equal(df1, df2)

AssertionError: DataFrame.iloc[:, 1] (column name="B") are different

DataFrame.iloc[:, 1] (column name="B") values are different (50.0 %)
[index]: [0, 1]
[left]:  [3, 4]
[right]: [4, 4]

In [25]:
# Same data, but the columns are declared in a different order; the default
# comparison takes column order into account, so this fails
df1 = pd.DataFrame({'B': [3, 3, 3], 'A': [1, 1, 1]})
df2 = pd.DataFrame({'A': [1, 1, 1], 'B': [3, 3, 3]})
pdt.assert_frame_equal(df1, df2)

AssertionError: DataFrame.columns are different

DataFrame.columns values are different (100.0 %)
[left]:  Index(['B', 'A'], dtype='object')
[right]: Index(['A', 'B'], dtype='object')
At positional index 0, first diff: B != A

In [26]:
# check_like=True makes the comparison ignore the order of index and columns
pdt.assert_frame_equal(df1, df2, check_like=True)
print('Data frames are equivalent')

Data frames are equivalent


In [27]:
# Reload the first 1000 trips with both timestamp columns parsed as datetimes
data = pd.read_csv(train_csv, nrows=1000, parse_dates=['pickup_datetime', 'dropoff_datetime'])

# df1: a copy of the trips with an extra `date` column taken from the pickup timestamp
df1 = data.assign(date=data['pickup_datetime'].dt.date)
df1[['pickup_datetime', 'date']]

Unnamed: 0,pickup_datetime,date
0,2016-03-14 17:24:55,2016-03-14
1,2016-06-12 00:43:35,2016-06-12
2,2016-01-19 11:35:24,2016-01-19
3,2016-04-06 19:32:31,2016-04-06
4,2016-03-26 13:30:55,2016-03-26
...,...,...
995,2016-05-20 10:53:52,2016-05-20
996,2016-01-26 02:01:09,2016-01-26
997,2016-03-17 11:51:09,2016-03-17
998,2016-02-24 16:25:29,2016-02-24


In [28]:
# df2: a copy of the same trips, but `date` is taken from the dropoff timestamp
df2 = data.assign(date=data['dropoff_datetime'].dt.date)
df2[['dropoff_datetime', 'date']]

Unnamed: 0,dropoff_datetime,date
0,2016-03-14 17:32:30,2016-03-14
1,2016-06-12 00:54:38,2016-06-12
2,2016-01-19 12:10:48,2016-01-19
3,2016-04-06 19:39:40,2016-04-06
4,2016-03-26 13:38:10,2016-03-26
...,...,...
995,2016-05-20 11:15:35,2016-05-20
996,2016-01-26 02:24:31,2016-01-26
997,2016-03-17 12:09:37,2016-03-17
998,2016-02-24 16:30:04,2016-02-24


In [29]:
# The frames differ in the `date` column for trips that end on another date, so this fails
pdt.assert_frame_equal(df1, df2, check_names=False)

AssertionError: DataFrame.iloc[:, 11] (column name="date") are different

DataFrame.iloc[:, 11] (column name="date") values are different (0.6 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30, 2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10, 2016-05-10, 2016-05-15, 2016-02-19, 2016-06-01, 2016-05-27, 2016-05-16, 2016-04-11, 2016-04-14, 2016-06-27, 2016-06-05, 2016-02-28, 2016-04-01, 2016-04-09, 2016-06-25, 2016-06-03, 2016-02-14, 2016-02-27, 2016-06-20, 2016-06-13, 2016-03-23, 2016-05-17, 2016-05-21, 2016-05-10, 2016-04-17, 2016-01-09, 2016-03-11, 2016-01-05, 2016-06-23, 2016-05-27, 2016-05-10, 2016-04-21, 2016-04-14, 2016-03-14, 2016-03-19, 2016-02-12, 2016-06-13, 2016-01-03, 2016-02-26, 2016-05-18, 2016-04-10, 2016-01-19, 2016-05-07, 2016-02-15, 2016-04-13, 2016-05-15, 2016-04-19, 2016-01-02, 2016-04-22, 2016-03-14, 2016-03-19, 2016-02-29, 2016-01-07, 2016-05-13, 2016-05-24, 2016-04-10, 2016-05-21, 2016-02-29, 2016-06-25, 2016-03-18, 2016-02-03, 2016-06-07, 2016-03-16, 2016-02-20, 2016-02-20, 2016-03-24, 2016-03-20, 2016-05-16, 2016-06-17, 2016-03-23, 2016-04-12, 2016-06-11, 2016-02-25, 2016-01-07, 2016-06-05, 2016-06-13, 2016-01-16, 2016-04-06, 2016-05-13, 2016-01-28, 2016-06-24, 2016-05-27, 2016-05-22, 2016-05-18, 2016-03-31, 2016-02-13, 2016-01-20, 2016-06-02, 2016-02-04, 2016-03-30, 2016-04-29, ...]
[right]: [2016-03-14, 2016-06-12, 2016-01-19, 2016-04-06, 2016-03-26, 2016-01-30, 2016-06-17, 2016-05-21, 2016-05-27, 2016-03-10, 2016-05-10, 2016-05-15, 2016-02-19, 2016-06-01, 2016-05-27, 2016-05-16, 2016-04-11, 2016-04-14, 2016-06-27, 2016-06-05, 2016-02-28, 2016-04-01, 2016-04-09, 2016-06-25, 2016-06-03, 2016-02-14, 2016-02-27, 2016-06-20, 2016-06-13, 2016-03-23, 2016-05-17, 2016-05-21, 2016-05-10, 2016-04-17, 2016-01-09, 2016-03-11, 2016-01-05, 2016-06-23, 2016-05-27, 2016-05-10, 2016-04-21, 2016-04-14, 2016-03-14, 2016-03-19, 2016-02-12, 2016-06-13, 2016-01-03, 2016-02-26, 2016-05-18, 2016-04-10, 2016-01-19, 2016-05-07, 2016-02-15, 2016-04-13, 2016-05-15, 2016-04-19, 2016-01-02, 2016-04-22, 2016-03-14, 2016-03-19, 2016-02-29, 2016-01-07, 2016-05-13, 2016-05-24, 2016-04-10, 2016-05-21, 2016-02-29, 2016-06-25, 2016-03-18, 2016-02-03, 2016-06-07, 2016-03-16, 2016-02-20, 2016-02-20, 2016-03-24, 2016-03-20, 2016-05-16, 2016-06-17, 2016-03-23, 2016-04-12, 2016-06-11, 2016-02-25, 2016-01-07, 2016-06-05, 2016-06-13, 2016-01-16, 2016-04-07, 2016-05-13, 2016-01-28, 2016-06-24, 2016-05-27, 2016-05-22, 2016-05-18, 2016-03-31, 2016-02-13, 2016-01-20, 2016-06-02, 2016-02-04, 2016-03-30, 2016-04-29, ...]
At positional index 86, first diff: 2016-04-06 != 2016-04-07

In [30]:
# Number of rows whose `date` differs between the two frames
(df1['date'] != df2['date']).sum()

np.int64(6)

In [31]:
# Clean up the data: keep only the rows on which both frames agree about `date`
clean_rows = df1['date'] == df2['date']
df1, df2 = (frame[clean_rows].reset_index(drop=True) for frame in (df1, df2))

# With the mismatching rows dropped from both frames, the comparison passes
pdt.assert_frame_equal(df1, df2, check_names=False)
print('Data frames are equal')

Data frames are equal


## Numpy Arrays Equality

In [32]:
import numpy as np
import numpy.testing as npt

# The arrays differ in their last element (3 vs 4), so the check raises on purpose
a = np.array([1, 2, 3])
b = np.array([1, 2, 4])
npt.assert_array_equal(a, b)

AssertionError: 
Arrays are not equal

Mismatched elements: 1 / 3 (33.3%)
Max absolute difference among violations: 1
Max relative difference among violations: 0.25
 ACTUAL: array([1, 2, 3])
 DESIRED: array([1, 2, 4])

In [33]:
# The two strings differ only in letter case, which is enough for the check to fail
npt.assert_string_equal('hello world', 'HELLO WORLD')

AssertionError: Differences in strings:
- hello world+ HELLO WORLD

In [34]:
# Each element of `b` is 1% above its counterpart in `a`, which the relative
# tolerance of 0.01 accepts
a, b = np.array([1.0, 2.0, 3.0]), np.array([1.01, 2.02, 3.03])
npt.assert_allclose(a, b, rtol=0.01)
print('Arrays are close')

Arrays are close


In [35]:
a = np.array([1, 2, 3])
b = np.array([5, 5, 5])
# assert_array_less(x, y) checks x < y element-wise, so this passes because
# every item of `a` is smaller than its counterpart in `b`
npt.assert_array_less(a, b)
print("All elements in 'a' are less than those in 'b'")

All elements in 'b'' are less than those in 'a'


In [36]:
# With the arguments swapped the check fails: no element of `b` is smaller than its counterpart in `a`
npt.assert_array_less(b, a)

AssertionError: 
Arrays are not strictly ordered `x < y`

Mismatched elements: 3 / 3 (100%)
Max absolute difference among violations: 4
Max relative difference among violations: 4.
 x: array([5, 5, 5])
 y: array([1, 2, 3])

## Quantitative Data Tests

In [37]:
import pandas as pd
import numpy as np
import math

def test_for_missing_data(df):
  assert df.isnull().sum().sum() == 0, 'Data contains missing values'
  return True

def test_for_numeric_data_types(df, columns):
  for col in columns:
    assert df[col].dtype == 'int64' or df[col].dtype == 'float64', f'{col} has non-numerical data type'
  return True

def test_for_out_of_range_values(df, columns):
  for col in columns:
    assert df[col].max() < math.inf, f'{col} contains infinite values'
    assert df[col].min() > -math.inf, f'{col} contains infinite values'
    assert not np.isnan(df[col]).any(), f'{col} contains NaN values'
    assert not np.isinf(df[col]).any(), f'{col} contains infinite values'
  return True

# Load the extended data set and run every quantitative check against it
data = pd.read_csv(train_extended_csv)
numeric_columns = [
  'passenger_count',
  'pickup_longitude',
  'pickup_latitude',
  'dropoff_longitude',
  'dropoff_latitude',
  'trip_duration',
  'pickup_geonumber',
  'dropoff_geonumber',
  'pickup_hour',
]

# Each helper asserts internally and returns True on success
assert test_for_missing_data(data)
assert test_for_numeric_data_types(data, numeric_columns)
assert test_for_out_of_range_values(data, numeric_columns)

print('Data is valid')

Data is valid


## Logical Data Tests

In [38]:
import pandas.testing as pdt

def trips_with_same_pickup_and_dropoff(data):
  """Return the ids of trips whose pickup and dropoff coordinates are identical.

  The result is a one-column DataFrame (`id`) holding the matching rows of `data`.
  """
  same_longitude = data['pickup_longitude'] == data['dropoff_longitude']
  same_latitude = data['pickup_latitude'] == data['dropoff_latitude']
  return data[same_longitude & same_latitude][['id']]

def test_for_logical_errors(df):
  assert all(df['dropoff_datetime'] > df['pickup_datetime']), 'Pickup time is before dropoff time'
  assert (df['trip_duration'] >= 0).all(), 'Negative trip duration detected'
  # These assertions won't pass
  # assert len(trips_with_same_pickup_and_dropoff(df)) == 0, f'There are trips with the same pickup and dropoff locations'
  # assert all(df['passenger_count'] > 0), 'There are trips without passengers'
  return True

# Load the extended data set and run the logical checks against it.
# The helper asserts internally and returns True on success.
data = pd.read_csv(train_extended_csv)
assert test_for_logical_errors(data)
print('Data is valid')

Data is valid
