In [1]:
import yellowcab
from yellowcab.io import get_data_path, read_all_files, filter_borough, add_duration, add_location, add_weekend, add_namings, add_datetime_columns
import os
import datetime as dt

In [82]:
# Run the packaged cleaning pipeline and persist the result (save=True); the
# function's return value is not captured here.
# NOTE(review): this cell carries execution count 82 but sits above cells 3 and 2,
# so the notebook was run out of order -- confirm it survives Restart & Run All.
yellowcab.io.create_clean_trip_dataset(save=True)

Reading the raw data
Filtering the borough
Adding duration for filtering
Filtering the extreme values
Augmenting the raw data



  geo_data["longitude"] = gdf.centroid.x

  geo_data["latitude"] = gdf.centroid.y


Trips with location_id above 263 were dropped, since there is no matching geo data.
Resetting index and saving
Done. File is at:D:\OneDrive - Universität zu Köln\Uni\PDS\Code\notebooks\1_exploration\..\..\data\input\trip_data\clean_data.parquet


In [3]:
# Load the parquet trip data through the package reader. `raw` is not passed here,
# unlike the `read_all_files('parquet', raw=True)` call inside the notebook-local
# create_clean_trip_dataset -- presumably this reads the cleaned file; TODO confirm.
# NOTE(review): `df` is not referenced by any other cell visible in this notebook.
df = yellowcab.io.read_all_files("parquet")

In [2]:
def _log_filter_step(rows_before, rows_after, label):
    """Print the statistics of one filter step and return the new row count.

    Parameters
    ----------
    rows_before : int
        Number of rows before the filter was applied.
    rows_after : int
        Number of rows remaining after the filter.
    label : str
        Name of the filter, appended to the 'long before filtering' message.

    Returns
    -------
    int
        ``rows_after``, so the caller can carry it into the next step.
    """
    removed = rows_before - rows_after
    # Guard the percentage: an empty input would otherwise raise ZeroDivisionError.
    share = (removed / rows_before) * 100 if rows_before else 0.0
    print('Was ' + str(rows_before) + ' long before filtering ' + label)
    print('Filtered ' + str(removed) + ' values out, or ' + str(share))
    print('')
    return rows_after


def create_clean_trip_dataset(soft_duration_cutoff=13000, borough='Brooklyn', save=False, ret=False):
    """Build the cleaned trip dataset and print how many rows each filter removes.

    The raw parquet data is read, restricted via ``filter_borough``, given a
    ``duration`` column via ``add_duration``, passed through a fixed sequence of
    plausibility filters, and finally augmented by ``add_datetime_columns``,
    ``add_weekend``, ``add_location`` and ``add_namings``. After every filter the
    row count before, the number of removed rows and the removed percentage are
    printed.

    Parameters
    ----------
    soft_duration_cutoff : int or float, default 13000
        Trips whose ``duration`` is at or above this value are dropped when their
        pickup or dropoff time of day is exactly 00:00:00. Same unit as the
        ``duration`` column (the hard cut-off of 57600 is annotated as 16h, so
        presumably seconds).
    borough : str, default 'Brooklyn'
        Forwarded unchanged to ``filter_borough``.
    save : bool, default False
        If true, write the result to
        ``<get_data_path()>/input/trip_data/clean_data.parquet``.
    ret : bool, default False
        If true, return the cleaned frame; otherwise return ``None``.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame with a fresh 0..n-1 index when ``ret`` is true.
    """
    print('Reading the raw data')
    df = read_all_files('parquet', raw=True)

    rows = len(df)
    df = filter_borough(df, borough)
    rows = _log_filter_step(rows, len(df), 'borough')

    print('Adding duration for filtering')
    df = add_duration(df)

    print('Filtering the extreme values')
    year_start, year_end = '2020-01-01 00:00:00', '2020-12-31 23:59:59'
    midnight = dt.time(0, 0, 0)

    # Ordered (label, mask builder) pairs. Each builder receives the frame as it
    # stands after the previous step, so the printed counts stay sequential.
    # `between` keeps both bounds (pandas default).
    filter_steps = [
        ('pickup in 2020', lambda d: d['tpep_pickup_datetime'].between(year_start, year_end)),
        ('dropoff in 2020', lambda d: d['tpep_dropoff_datetime'].between(year_start, year_end)),
        ('passengers', lambda d: d['passenger_count'] > 0),
        ('distance', lambda d: d['trip_distance'].between(0.01, 1000)),
        ('fare', lambda d: d['fare_amount'].between(0.01, 7000)),
        ('tip', lambda d: d['tip_amount'] >= 0),
        ('tolls', lambda d: d['tolls_amount'] >= 0),
        ('total', lambda d: d['total_amount'].between(0, 7000)),
        ('congestion', lambda d: d['congestion_surcharge'] >= 0),
        ('duration hard', lambda d: d['duration'].between(1, 57600)),  # cut off at 16h
        # Soft cut-off: a long trip is only kept when neither its pickup nor its
        # dropoff time of day is exactly midnight.
        ('duration soft', lambda d: (d['duration'] < soft_duration_cutoff) | (
            (d['tpep_pickup_datetime'].dt.time != midnight) &
            (d['tpep_dropoff_datetime'].dt.time != midnight))),
    ]
    for label, build_mask in filter_steps:
        df = df[build_mask(df)]
        rows = _log_filter_step(rows, len(df), label)

    print('Augmenting the raw data')
    df = add_datetime_columns(df)
    df = add_weekend(df)
    df = add_location(df)
    df = add_namings(df)

    print('Resetting index and saving')
    df = df.reset_index(drop=True)
    if save:
        # Build the target path once and reuse it for writing and reporting.
        target = os.path.join(get_data_path(), "input", "trip_data", "clean_data.parquet")
        df.to_parquet(target)
        print('Done. File is at:' + target)

    return df if ret else None

# Run the notebook-local definition with its defaults (save=False, ret=False):
# nothing is written to disk and None is returned, so this call only prints the
# per-filter row statistics.
create_clean_trip_dataset()

Reading the raw data
Was 23838931 long before filtering borough
Filtered 16218357 values out, or 68.03307161717947

Adding duration for filtering
Filtering the extreme values
Was 7620574 long before filtering pickup in 2020
Filtered 92 values out, or 0.001207258140922193

Was 7620482 long before filtering dropoff in 2020
Filtered 77 values out, or 0.0010104347730235436

Was 7620405 long before filtering passengers
Filtered 154638 values out, or 2.0292622242518608

Was 7465767 long before filtering distance
Filtered 91196 values out, or 1.2215221825165452

Was 7374571 long before filtering fare
Filtered 29047 values out, or 0.3938805389493165

Was 7345524 long before filtering tip
Filtered 0 values out, or 0.0

Was 7345524 long before filtering tolls
Filtered 0 values out, or 0.0

Was 7345524 long before filtering total
Filtered 0 values out, or 0.0

Was 7345524 long before filtering congestion
Filtered 0 values out, or 0.0

Was 7345524 long before filtering duration hard
Filtered 16068


  geo_data["longitude"] = gdf.centroid.x

  geo_data["latitude"] = gdf.centroid.y


Trips with location_id above 263 were dropped, since there is no matching geo data.
Resetting index and saving
