In [None]:
import os
import sys
from typing import List, Optional
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask import delayed, compute
import matplotlib.pyplot as plt
import glob

# Input root: each dataset lives in BASE_DIR/<dataset>/*.parquet.
BASE_DIR = "/d/hpc/projects/FRI/bigdata/students/in7357/optimized_parquet"
# Dataset sub-folders to process.
DATASETS = ["FHVHV"]

# Output roots: rows without issues, rows with at least one issue, and the
# CSV/PNG summaries, respectively.
CLEANED_BASE = "/d/hpc/projects/FRI/bigdata/students/in7357/cleaned_parquet"
QUARANTINE_BASE = "/d/hpc/projects/FRI/bigdata/students/in7357/quarantine_parquet"
OUT_DIR = "/d/hpc/projects/FRI/bigdata/students/in7357/dq_results_task2"

# Create every output root up front so later writes cannot fail on a missing
# parent directory.
for _output_root in (CLEANED_BASE, QUARANTINE_BASE, OUT_DIR):
    os.makedirs(_output_root, exist_ok=True)

# Inclusive range of acceptable values for the `year` column; rows with a
# known year outside [MIN_YEAR, MAX_YEAR] are flagged `_year_out_of_range`.
MIN_YEAR = 2019
MAX_YEAR = 2026
# Rows whose distance / duration-in-hours exceeds this value are flagged
# `_suspicious_speed`.
MAX_SUSPECT_SPEED_MPH = 120.0
# NOTE(review): not referenced anywhere in the visible code -- presumably an
# upper bound on trip duration that was never wired in; confirm or remove.
MAX_TRIP_HOURS = 24.0
# When False, flagged rows are written to the quarantine folder; when True
# they are simply not written anywhere.
DROP_BAD_ROWS = False

# Known pickup-timestamp column names, in priority order (first match wins).
PICKUP_CANDIDATES = [
    "tpep_pickup_datetime",
    "lpep_pickup_datetime",
    "pickup_datetime",
    "pickup_time",
    "pickup_ts",
    "pickup",
    "request_datetime",
    "on_scene_datetime",
]

# Known dropoff-timestamp column names, in priority order (first match wins).
DROPOFF_CANDIDATES = [
    "tpep_dropoff_datetime",
    "lpep_dropoff_datetime",
    "dropoff_datetime",
    "dropoff_time",
    "dropoff_ts",
    "dropoff",
]

def detect_col(cols: List[str], candidates: List[str]) -> Optional[str]:
    """Pick the column in *cols* that best matches one of *candidates*.

    Matching is case-insensitive and happens in two passes:

    1. Exact match against each candidate, in the order given.
    2. Fuzzy fallback: the first column whose name contains ``date`` or
       ``time`` together with the role keyword (``pickup`` / ``dropoff``)
       that the candidates themselves mention.

    The fallback is restricted to the role found in *candidates* so that a
    dropoff lookup can never return a pickup column (or vice versa).  If the
    candidates mention neither keyword, both roles are accepted.

    Args:
        cols: Available column names.
        candidates: Preferred column names, highest priority first.

    Returns:
        The matching column name with its original casing, or ``None`` when
        nothing matches.
    """
    cols_map = {c.lower(): c for c in cols}

    # Pass 1: exact (case-insensitive) candidate match, in priority order.
    for cand in candidates:
        exact = cols_map.get(cand.lower())
        if exact is not None:
            return exact

    # Work out which role(s) the caller is looking for from the candidates.
    lowered_candidates = [cand.lower() for cand in candidates]
    roles = [
        role
        for role in ('pickup', 'dropoff')
        if any(role in cand for cand in lowered_candidates)
    ]
    if not roles:
        # No hint available: accept either role.
        roles = ['pickup', 'dropoff']

    # Pass 2: fuzzy match limited to the requested role(s).
    for lc, original in cols_map.items():
        looks_temporal = 'date' in lc or 'time' in lc
        if looks_temporal and any(role in lc for role in roles):
            return original
    return None

def has_meaningful_values(ddf, col, min_nonnull=1):
    """Tell whether column *col* of *ddf* holds at least *min_nonnull* non-null values.

    The non-null count is obtained by calling ``.compute()`` on the reduction,
    so this triggers evaluation of the column.  The check is best-effort:
    a missing column, ``col=None`` or any error raised while counting all
    yield ``False``.
    """
    column_present = col is not None and col in ddf.columns
    if not column_present:
        return False

    try:
        populated = ddf[col].notna().sum().compute()
        enough = int(populated) >= min_nonnull
    except Exception:
        # Deliberately lenient: an uncountable column is treated as empty.
        return False
    return enough

def _ensure_datetime_column(df, col, fallback_name):
    """Return ``(df, name)`` such that ``df[name]`` is a ``datetime64[us]`` column.

    When *col* is ``None`` an all-NaT column called *fallback_name* is added.
    The NaT scalar is broadcast lazily to every partition, so no data is
    materialised and no partition alignment is required.
    """
    if col is None:
        df[fallback_name] = pd.NaT
        col = fallback_name
    # Unparseable values become NaT instead of raising.
    df[col] = dd.to_datetime(df[col], errors='coerce').astype('datetime64[us]')
    return df, col


@delayed
def process_file(file_path, dataset):
    """Flag data-quality issues in one parquet file and summarise them by year.

    Column names are lower-cased, pickup/dropoff timestamps are detected and
    coerced to ``datetime64[us]``, and one boolean ``_<issue>`` column is added
    per check (missing pickup, year out of range, zero/negative duration,
    zero distance, suspicious speed, invalid passenger count, negative
    fare/tolls/total).  Checks whose source column is absent or entirely
    null are recorded as ``False`` for every row.

    Args:
        file_path: Path passed to ``dd.read_parquet``.
        dataset: Dataset name; currently unused inside the function.

    Returns:
        A tuple ``(summary, df, issue_mask)``:

        * ``summary`` -- pandas DataFrame with one row per ``pickup_year``
          (rows with an unknown year are kept under a null key), the count of
          each issue, ``total_rows`` and a ``<issue>_pct`` percentage column.
        * ``df`` -- the lazy Dask DataFrame including all flag columns.
        * ``issue_mask`` -- lazy boolean Series, True where any flag is set.
    """
    # NOTE(review): `infer_divisions` is kept as in the original call; confirm
    # the installed dask version still accepts it.
    df = dd.read_parquet(file_path, engine="pyarrow", infer_divisions=False)
    df = df.rename(columns={c: c.lower() for c in df.columns})

    columns = list(df.columns)
    df, pickup_col = _ensure_datetime_column(
        df, detect_col(columns, PICKUP_CANDIDATES), 'pickup_datetime'
    )
    df, dropoff_col = _ensure_datetime_column(
        df, detect_col(columns, DROPOFF_CANDIDATES), 'dropoff_datetime'
    )

    if 'year' not in df.columns:
        df['year'] = df[pickup_col].dt.year.astype('Int64')
    df['year'] = dd.to_numeric(df['year'], errors="coerce")
    df['_duration_s'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds()

    # Optional measure columns; each may be absent depending on the dataset.
    dist_col = next((c for c in ['trip_distance', 'trip_miles', 'trip_mile'] if c in df.columns), None)
    pcol = next((c for c in ['passenger_count', 'passengers'] if c in df.columns), None)
    fare_col = next((c for c in df.columns if 'fare' in c), None)
    total_col = 'total_amount' if 'total_amount' in df.columns else None
    tolls_col = next((c for c in df.columns if 'toll' in c), None)

    # --- timestamp based checks -------------------------------------------
    pickup_known = ~df[pickup_col].isna()
    dropoff_known = ~df[dropoff_col].isna()
    df['_is_missing_pickup'] = df[pickup_col].isna()
    df['_year_out_of_range'] = (~df['year'].isna()) & ((df['year'] < MIN_YEAR) | (df['year'] > MAX_YEAR))
    df['_pickup_eq_dropoff'] = pickup_known & (df[dropoff_col] == df[pickup_col])
    df['_dropoff_before_pickup'] = pickup_known & dropoff_known & (df[dropoff_col] < df[pickup_col])
    df['_trip_duration_zero'] = df['_duration_s'] == 0
    df['_trip_duration_negative'] = df['_duration_s'] < 0

    # --- distance / speed checks ------------------------------------------
    if dist_col and has_meaningful_values(df, dist_col):
        distance = dd.to_numeric(df[dist_col], errors='coerce')
        hours = df['_duration_s'] / 3600.0
        # A missing distance counts as zero distance.
        df['_trip_distance_zero'] = distance.fillna(0) == 0
        # `hours > 0` guards the division: zero/negative durations never
        # count as suspicious speed (they have their own flags).
        df['_suspicious_speed'] = (hours > 0) & ((distance / hours) > MAX_SUSPECT_SPEED_MPH)
    else:
        df['_trip_distance_zero'] = False
        df['_suspicious_speed'] = False

    # --- passenger count check --------------------------------------------
    if pcol and has_meaningful_values(df, pcol):
        pc_num = dd.to_numeric(df[pcol], errors='coerce')
        df['_passenger_count_invalid'] = pc_num.isna() | (pc_num < 0) | (pc_num > 10)
    else:
        df['_passenger_count_invalid'] = False

    # --- negative monetary amounts ----------------------------------------
    amount_checks = (
        ('_negative_fare', fare_col),
        ('_negative_tolls', tolls_col),
        ('_negative_total', total_col),
    )
    for flag_name, amount_col in amount_checks:
        if amount_col and has_meaningful_values(df, amount_col):
            df[flag_name] = dd.to_numeric(df[amount_col], errors='coerce') < 0
        else:
            df[flag_name] = False

    agg_cols = [
        '_is_missing_pickup', '_year_out_of_range', '_pickup_eq_dropoff', '_dropoff_before_pickup',
        '_trip_distance_zero', '_trip_duration_zero', '_trip_duration_negative',
        '_passenger_count_invalid', '_negative_fare', '_negative_tolls', '_negative_total', '_suspicious_speed'
    ]

    # A row is "bad" as soon as any single check fires.
    issue_mask = df[agg_cols[0]]
    for flag_name in agg_cols[1:]:
        issue_mask = issue_mask | df[flag_name]

    # dropna=False keeps rows whose year is unknown (e.g. missing pickup);
    # otherwise they would vanish from the summary and `_is_missing_pickup`
    # would be undercounted.  Both aggregates are evaluated in one pass.
    by_year = df.groupby('year', dropna=False)
    grouped, total_by_year = compute(by_year[agg_cols].sum(), by_year.size())
    total_by_year = total_by_year.rename('total_rows')
    summary = grouped.join(total_by_year).reset_index().rename(columns={'year': 'pickup_year'})
    for col in agg_cols:
        summary[col + "_pct"] = (summary[col] / summary['total_rows']).fillna(0) * 100.0
    return summary, df, issue_mask

# Driver: for every dataset, run the per-file checks, publish a per-year
# summary (CSV + stacked bar chart) and write clean / quarantined rows.
for dataset in DATASETS:
    src_dir = os.path.join(BASE_DIR, dataset)
    # Sorted so that output ordering is reproducible between runs.
    files = sorted(glob.glob(os.path.join(src_dir, "*.parquet")))
    if not files:
        # Without this guard `zip(*computed)` below fails on an empty result.
        print(f"[WARN] no parquet files found in {src_dir}; skipping {dataset}", file=sys.stderr)
        continue

    delayed_results = [process_file(f, dataset) for f in files]
    computed = compute(*delayed_results)
    all_summaries, dfs, masks = zip(*computed)

    # Each file contributes its own per-year rows; several files can cover the
    # same year, so sum the counts per year and recompute the percentages to
    # get exactly one row per pickup year.
    per_file_summary = pd.concat(all_summaries, ignore_index=True)
    count_cols = [
        c for c in per_file_summary.columns
        if c != 'pickup_year' and not c.endswith('_pct')
    ]
    combined_summary = (
        per_file_summary
        .groupby('pickup_year', dropna=False)[count_cols]
        .sum()
        .reset_index()
    )
    for c in count_cols:
        if c == 'total_rows':
            continue
        combined_summary[c + "_pct"] = (
            combined_summary[c] / combined_summary['total_rows']
        ).fillna(0) * 100.0

    csv_out = os.path.join(OUT_DIR, f"{dataset}_dq_summary_by_year.csv")
    combined_summary.to_csv(csv_out, index=False)

    # --- stacked bar chart: one bar per year, one segment per issue --------
    summary_sorted = combined_summary.sort_values('pickup_year')
    pct_cols = [c for c in summary_sorted.columns if c.endswith('_pct') and c.startswith('_')]
    summary_sorted['year_label'] = summary_sorted['pickup_year'].apply(lambda x: 'unknown' if pd.isna(x) else str(int(x)))
    fig, ax = plt.subplots(figsize=(12, 6))
    bottom = np.zeros(len(summary_sorted))
    x = np.arange(len(summary_sorted))
    for c in pct_cols:
        vals = summary_sorted[c].to_numpy()
        ax.bar(x, vals, bottom=bottom, label=c.replace('_pct', '').lstrip('_'))
        bottom += vals
    ax.set_xticks(x)
    ax.set_xticklabels(summary_sorted['year_label'], rotation=45)
    ax.set_ylabel("Percent of rows with issue (%)")
    ax.set_title(f"{dataset} — Data Quality issues by pickup year")
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    png_out = os.path.join(OUT_DIR, f"{dataset}_dq_summary_by_year.png")
    fig.savefig(png_out, dpi=150)
    plt.close(fig)

    # --- write clean / quarantined rows -----------------------------------
    # Filter each file with its own mask before concatenating: frame and mask
    # come from the same graph, so no cross-collection alignment is needed.
    cleaned_out = os.path.join(CLEANED_BASE, dataset)
    quarantine_out = os.path.join(QUARANTINE_BASE, dataset)
    os.makedirs(cleaned_out, exist_ok=True)
    os.makedirs(quarantine_out, exist_ok=True)

    clean_df_to_write = dd.concat([part[~mask] for part, mask in zip(dfs, masks)])
    clean_df_to_write.to_parquet(cleaned_out, engine="pyarrow", write_index=False, partition_on=['year'], compression="snappy", overwrite=True)
    if not DROP_BAD_ROWS:
        bad_df_to_write = dd.concat([part[mask] for part, mask in zip(dfs, masks)])
        bad_df_to_write.to_parquet(quarantine_out, engine="pyarrow", write_index=False, partition_on=['year'], compression="snappy", overwrite=True)
