# Data quality analysis

In [27]:
import pandas as pd
from scipy.stats import zscore

### Understanding the fields in the yellow taxi data

The important fields

- `tpep_pickup_datetime`: Date time when meter was turned on
- `tpep_dropoff_datetime`: Date time when meter was turned off
- `passenger_count`: number of passengers
- `trip_distance`: distance in miles captured by the meter
- `payment_type`:
  - 1: credit card
  - 2: cash
  - 3: no charge
  - 4: dispute
  - 5: unknown
  - 6: voided trip
- `total_amount`: the total fare charged to customers

A `total_seconds` column is probably appropriate to add here, so that regressions can be run on it later.

### Outlier data for the quantitative data

In [16]:
# Load one month of yellow-taxi trip records from the local parquet file
# (presumably January 2021, going by the file name).
yellow_taxi_2021_01 = pd.read_parquet('../data/monthly_data/yellow_taxi_2021-01.parquet')

In [32]:
# Numeric columns that are screened for outliers further down.
quantitative_fields = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'tip_amount',
    'tolls_amount',
    'total_amount',
]

# Store passenger_count as float (re-running this cell is harmless).
# NOTE(review): presumably needed so the column is a plain float with NaN for
# missing counts before z-scoring — confirm the dtype stored in the parquet file.
yellow_taxi_2021_01['passenger_count'] = yellow_taxi_2021_01['passenger_count'].astype(float)

In [30]:
def detect_outliers(df, columns, threshold=3):
    """
    Return the rows of `df` that are outliers in at least one of `columns`.

    A row counts as an outlier when the absolute z-score of any listed column
    is strictly greater than `threshold`. Z-scores are computed per column with
    `scipy.stats.zscore(..., nan_policy='omit')`, so missing values are ignored
    in the calculation; a missing value ends up with a NaN z-score, which never
    exceeds the threshold.

    Parameters:
    - df (pd.DataFrame): The input DataFrame; it is not modified.
    - columns (list): Names of the columns to score.
    - threshold (float): Cut-off on the absolute z-score (default is 3).

    Returns:
    - pd.DataFrame: The outlier rows, with every original column plus one
      `<column>_zscore` column per entry in `columns`.
    """
    scored = df.copy()  # work on a copy so the caller's frame is left untouched

    # Append one "<column>_zscore" column per requested column, remembering the
    # generated names for the threshold check below.
    score_names = []
    for name in columns:
        score_name = f"{name}_zscore"
        scored[score_name] = zscore(scored[name], nan_policy='omit')
        score_names.append(score_name)

    # A row is kept as soon as a single z-score magnitude is above the threshold.
    exceeds_threshold = scored[score_names].abs() > threshold
    is_outlier = exceeds_threshold.any(axis=1)

    return scored.loc[is_outlier]

In [33]:
# Display the trips whose z-score magnitude exceeds the default threshold (3)
# in at least one of the quantitative fields.
detect_outliers(df=yellow_taxi_2021_01, columns=quantitative_fields)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,passenger_count_zscore,trip_distance_zscore,fare_amount_zscore,tip_amount_zscore,tolls_amount_zscore,total_amount_zscore
12,1,2021-01-01 00:10:46,2021-01-01 00:32:58,2.0,7.40,1.0,N,138,166,2,...,0.3,33.92,0.0,,0.555270,0.007027,0.960506,-0.738539,3.510516,1.119249
13,2,2021-01-01 00:31:06,2021-01-01 00:38:52,5.0,1.70,1.0,N,142,50,1,...,0.3,14.16,2.5,,3.385911,-0.007443,-0.317239,0.170149,-0.148107,-0.225571
14,2,2021-01-01 00:42:11,2021-01-01 00:44:24,5.0,0.81,1.0,N,50,142,2,...,0.3,8.30,2.5,,3.385911,-0.009703,-0.588276,-0.738539,-0.148107,-0.624389
23,1,2021-01-01 00:37:59,2021-01-01 01:01:37,0.0,3.60,1.0,N,107,163,1,...,0.3,30.80,2.5,,-1.331824,-0.002620,0.379713,3.111835,-0.148107,0.906909
24,1,2021-01-01 00:37:40,2021-01-01 01:04:46,2.0,19.10,1.0,N,132,3,2,...,0.3,59.42,0.0,,0.555270,0.036730,3.090082,-0.738539,3.510516,2.854720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369747,2,2021-01-31 23:31:00,2021-01-31 23:49:00,,7.94,,,78,263,0,...,0.3,32.20,,,,0.008398,0.827311,-0.738539,3.510516,1.002190
1369753,2,2021-01-31 23:56:00,2021-02-01 00:38:00,,19.61,,,136,10,0,...,0.3,68.10,,,,0.038025,3.588016,-0.738539,3.510516,3.445461
1369757,2,2021-01-31 23:19:00,2021-02-01 00:22:00,,23.28,,,155,246,0,...,0.3,57.30,,,,0.047342,2.751673,-0.738539,3.510516,2.710438
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,...,0.3,39.50,,,,0.007103,1.586214,-0.738539,3.510516,1.499011
