### Big data course project
<strong>T2: Invalid data points analysis</strong>

Jovana Videnovic & Haris Kupinic

In [58]:
# Shell escape: run `hostnamectl`; its output shows the host this notebook is executing on.
!hostnamectl

In [59]:
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os

In [None]:
# Root directory of the partitioned parquet data; the per-service sub-directory
# (part_data_path / service_type) is what gets read later in the notebook.
part_data_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data")
# Service to analyse. It selects the input sub-directory, the output table directory,
# the checker function and the entry of `start_dates` that is used.
service_type = "fhvhv"

In [61]:
# Output directory for the per-check CSV tables of the selected service.
# Created (including missing parents) if it does not exist yet; an existing directory is fine.
tables_path = Path("/d/hpc/home/jv8043/BD/project/T2/T2_tables", service_type)
tables_path.mkdir(parents=True, exist_ok=True)

In [62]:
# Local Dask cluster with 4 workers, one thread each.
# NOTE(review): per the Dask docs `memory_limit` applies per worker, i.e. up to
# 4 x 64GB in total -- confirm the node/job allocation allows that.
cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='64GB')
# Client connected to that cluster; it is shut down in the last cell.
client = Client(cluster)

In [63]:
# Bare expression: display the client summary as the cell output.
client

In [64]:
def check_for_outliers_fhvhv(df, start_date, end_date):
    """Count suspicious fhvhv trips per pickup year.

    Every check is a boolean row filter (map: evaluated independently on each
    partition) followed by ``groupby('year').size()`` (reduce: aggregated
    across all partitions).

    Parameters
    ----------
    df : DataFrame
        Trip records with datetime columns ``pickup_datetime`` and
        ``dropoff_datetime`` (the notebook passes a Dask DataFrame).
        The caller's frame is NOT modified: the ``year`` grouping key is
        added to a derived frame only.
    start_date : pd.Timestamp
        Pickups strictly before this timestamp are counted as invalid.
    end_date : pd.Timestamp
        Pickups / dropoffs strictly after this timestamp are counted as invalid.

    Returns
    -------
    dict
        Maps check name to the computed Series of row counts indexed by
        pickup year. Years without any matching row are absent.
    """
    # assign() returns a new frame, so the caller's df does not silently gain a 'year' column
    df = df.assign(year=df['pickup_datetime'].dt.year)

    pickup = df['pickup_datetime']
    dropoff = df['dropoff_datetime']
    duration = dropoff - pickup
    one_day = pd.Timedelta(days=1)

    # map: one boolean mask per check (insertion order defines the order of the result keys)
    masks = {
        # dropoff is equal to pickup
        'dropoff_equals_pickup': dropoff == pickup,
        # dropoff is before pickup
        'dropoff_before_pickup': dropoff < pickup,
        # dropoff year differs from pickup year; trips of at most one day are
        # tolerated (New Year's Eve to New Year's Day)
        'dropoff_year_diff': (dropoff.dt.year != pickup.dt.year) & (duration > one_day),
        # pickup before start_date
        'pickup_before_start_date': pickup < start_date,
        # pickup after end_date
        'pickup_after_end_date': pickup > end_date,
        # dropoff after end_date
        'dropoff_after_end_date': dropoff > end_date,
        # trips longer than 24 hours
        'long_trips': duration > one_day,
    }

    # reduce: count the flagged rows per pickup year.
    # NOTE: each check still triggers its own .compute(), i.e. one pass over the data per check.
    return {
        name: df[mask].groupby('year').size().compute()
        for name, mask in masks.items()
    }

In [65]:
def check_for_outliers(df, start_date, end_date):
    """Count suspicious trips per pickup year (checker used for every service
    other than fhv / fhvhv).

    Every check is a boolean row filter (map: evaluated independently on each
    partition) followed by ``groupby('year').size()`` (reduce: aggregated
    across all partitions).

    Parameters
    ----------
    df : DataFrame
        Trip records with datetime columns ``pickup_datetime`` and
        ``dropoff_datetime`` plus numeric ``trip_distance`` and
        ``passenger_count`` (the notebook passes a Dask DataFrame).
        The caller's frame is NOT modified: the ``year`` grouping key is
        added to a derived frame only.
    start_date : pd.Timestamp
        Pickups strictly before this timestamp are counted as invalid.
    end_date : pd.Timestamp
        Pickups / dropoffs strictly after this timestamp are counted as invalid.

    Returns
    -------
    dict
        Maps check name to the computed Series of row counts indexed by
        pickup year. Years without any matching row are absent.
        Note that ``trip_distance_zero`` and ``passenger_count_zero`` count
        values ``<= 0``, so negative distances are included there as well as
        in ``trip_distance_negative``.
    """
    # assign() returns a new frame, so the caller's df does not silently gain a 'year' column
    df = df.assign(year=df['pickup_datetime'].dt.year)

    pickup = df['pickup_datetime']
    dropoff = df['dropoff_datetime']
    duration = dropoff - pickup
    one_day = pd.Timedelta(days=1)

    # map: one boolean mask per check (insertion order defines the order of the result keys)
    masks = {
        # dropoff is equal to pickup
        'dropoff_equals_pickup': dropoff == pickup,
        # dropoff is before pickup
        'dropoff_before_pickup': dropoff < pickup,
        # dropoff year differs from pickup year; trips of at most one day are
        # tolerated (New Year's Eve to New Year's Day)
        'dropoff_year_diff': (dropoff.dt.year != pickup.dt.year) & (duration > one_day),
        # trip distance should be greater than 0
        'trip_distance_zero': df['trip_distance'] <= 0,
        # passenger count should be greater than 0
        'passenger_count_zero': df['passenger_count'] <= 0,
        # strictly negative trip distance
        'trip_distance_negative': df['trip_distance'] < 0,
        # pickup before start_date
        'pickup_before_start_date': pickup < start_date,
        # pickup after end_date
        'pickup_after_end_date': pickup > end_date,
        # dropoff after end_date
        'dropoff_after_end_date': dropoff > end_date,
        # trips longer than 24 hours
        'long_trips': duration > one_day,
    }

    # reduce: count the flagged rows per pickup year.
    # NOTE: each check still triggers its own .compute(), i.e. one pass over the data per check.
    return {
        name: df[mask].groupby('year').size().compute()
        for name, mask in masks.items()
    }

In [66]:
def check_for_outliers_fhv(df, start_date, end_date):
    """Count suspicious fhv trips per pickup year.

    Only the pickup / dropoff timestamps are inspected. Every check is a
    boolean row filter (map: evaluated independently on each partition)
    followed by ``groupby('year').size()`` (reduce: aggregated across all
    partitions).

    Parameters
    ----------
    df : DataFrame
        Trip records with datetime columns ``pickup_datetime`` and
        ``dropoff_datetime`` (the notebook passes a Dask DataFrame).
        The caller's frame is NOT modified: the ``year`` grouping key is
        added to a derived frame only.
    start_date : pd.Timestamp
        Pickups strictly before this timestamp are counted as invalid.
    end_date : pd.Timestamp
        Pickups / dropoffs strictly after this timestamp are counted as invalid.

    Returns
    -------
    dict
        Maps check name to the computed Series of row counts indexed by
        pickup year. Years without any matching row are absent.
    """
    # assign() returns a new frame, so the caller's df does not silently gain a 'year' column
    df = df.assign(year=df['pickup_datetime'].dt.year)

    pickup = df['pickup_datetime']
    dropoff = df['dropoff_datetime']
    duration = dropoff - pickup
    one_day = pd.Timedelta(days=1)

    # map: (result key, boolean mask) pairs; the list order defines the order of the result keys
    checks = [
        # dropoff is equal to pickup
        ('dropoff_equals_pickup', dropoff == pickup),
        # dropoff is before pickup
        ('dropoff_before_pickup', dropoff < pickup),
        # dropoff year differs from pickup year; trips of at most one day are
        # tolerated (New Year's Eve to New Year's Day)
        ('dropoff_year_diff', (dropoff.dt.year != pickup.dt.year) & (duration > one_day)),
        # pickup before start_date
        ('pickup_before_start_date', pickup < start_date),
        # pickup after end_date
        ('pickup_after_end_date', pickup > end_date),
        # dropoff after end_date
        ('dropoff_after_end_date', dropoff > end_date),
        # trips longer than 24 hours
        ('long_trips', duration > one_day),
    ]

    # reduce: count the flagged rows per pickup year.
    # NOTE: each check still triggers its own .compute(), i.e. one pass over the data per check.
    results = {}
    for name, mask in checks:
        results[name] = df[mask].groupby('year').size().compute()
    return results

In [67]:
# Earliest accepted pickup timestamp per service; pickups strictly before it are
# counted under 'pickup_before_start_date' by the checker functions.
start_dates = {
    "yellow": pd.Timestamp("2012-01-01"),
    "green": pd.Timestamp("2014-01-01"),
    "fhv": pd.Timestamp("2015-01-01"),
    "fhvhv": pd.Timestamp("2019-02-01"),
}
# Upper bound shared by all services; pickups / dropoffs strictly after it are counted as invalid.
end_date = pd.Timestamp("2025-02-01")

In [68]:
# Open the partitioned parquet data of the selected service
df = dd.read_parquet(part_data_path / service_type, engine="pyarrow", assume_missing=True)

# Services with a dedicated checker; every other service uses the generic one
checkers = {
    "fhv": check_for_outliers_fhv,
    "fhvhv": check_for_outliers_fhvhv,
}
run_checks = checkers.get(service_type, check_for_outliers)
result = run_checks(df, start_date=start_dates[service_type], end_date=end_date)

# Write one CSV per check: the per-year counts with the count column named 'count'
for check_name, yearly_counts in result.items():
    yearly_counts.reset_index(name='count').to_csv(tables_path / f"{check_name}.csv", index=False)

In [69]:
# Shut down the Dask client created earlier in the notebook, then close the local cluster,
# so that nothing is left running once the analysis has finished.
client.shutdown()
cluster.close()