In [1]:
from dask.distributed import Client

client = Client(n_workers=8)


In [2]:
import csv
import os
import dask
import dask.dataframe as dd
import dask.array as da
import fastparquet
import pandas as pd
import altair as alt
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from time import time

In [67]:
df = dd.read_csv(os.path.join('data', 'yellow_tripdata_2010-*.csv'),
                 parse_dates=['pickup_datetime','dropoff_datetime'],
                 quoting=csv.QUOTE_NONE, encoding='utf-8', error_bad_lines=False,
                 dtype={'trip_distance':'float64', 'store_and_fwd_flag':'object'})

In [4]:
# https://towardsdatascience.com/heres-how-to-calculate-distance-between-2-geolocations-in-python-93ecab5bbba4
def haversine_distance(row):
    
    lat1 = row['pickup_latitude']
    lon1 = row['pickup_longitude']
    lat2 = row['dropoff_latitude']
    lon2 = row['dropoff_longitude']
    
    # https://stackoverflow.com/questions/19252588/how-do-i-test-for-null-list-entry-in-python-list
    if not all(x for x in [lat1, lon1, lat2, lon2]):
        return row['trip_distance']
    
    if not all(isinstance(x, float) for x in [lat1, lon1, lat2, lon2]):
        return row['trip_distance']
    
    if len([*filter(lambda x: (x < 39.0) | (x > 42.0) , [lat1, lat2])]) > 0:
        return row['trip_distance']
    
    if len([*filter(lambda x: (x < -77.0) | (x > -70.0) , [lon1, lon2])]) > 0:
        return row['trip_distance']
    
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))    
    
    
    if (row['trip_distance'] < .15) | (row['trip_distance'] > 80):
        res = res
    else:
        res = row['trip_distance']   
    
    
    return np.round(res, 2)

In [5]:
def make_trip_distance_bins(row):
    
    dist = row['comp_trip_distance']
    
    if dist == 0:
        res = 0
    elif dist <= .25:
        res = .25
    elif dist <= .5:
        res = .5
    elif dist <= 1:
        res = 1
    elif dist <= 2:
        res = 2
    elif dist <= 4:
        res = 4
    elif dist <= 8:
        res = 8
    elif dist <= 16:
        res = 16
    elif dist <= 32:
        res = 32
    else:
        res = 33
    
    
    return res

In [6]:
def make_post_treatment_time_dummy(row):
    
    if (dt.time(9, 0, 0) <= row.pickup_time < dt.time(12, 0, 0)):
        res = 0
    elif (dt.time(12, 0, 0) <= row.pickup_time < dt.time(15, 0, 0)):
        res = 1    
    else:
        res = -1
    return res

In [7]:
df.isnull().sum().compute()

vendor_id                 0
pickup_datetime           0
dropoff_datetime          0
passenger_count           0
trip_distance             0
pickup_longitude          0
pickup_latitude           0
rate_code                 0
store_and_fwd_flag    91028
dropoff_longitude         0
dropoff_latitude          0
payment_type              0
fare_amount               0
surcharge                 0
mta_tax                   0
tip_amount                0
tolls_amount              0
total_amount              0
dtype: int64

In [10]:
df.dtypes

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                      int64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [68]:
lstToDrop = ['vendor_id', 'dropoff_datetime', 'passenger_count', 'payment_type', 'rate_code', 'store_and_fwd_flag', 'fare_amount', 'surcharge' ,'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount']

ddf = df.drop(lstToDrop, axis=1)
ddf['pickup_date'] = ddf['pickup_datetime'].dt.date
ddf['pickup_time'] = ddf['pickup_datetime'].dt.time

# https://stackoverflow.com/questions/62460171/pandas-between-time-equivalent-for-dask-dataframe
#ddf['time_str'] = dd.to_datetime(ddf["pickup_datetime"].dt.time.astype(str))

ddf['comp_trip_distance'] = ddf.apply(lambda row: haversine_distance(row), axis=1, meta=(None, 'float64'))

ddf['comp_dist_bins'] = ddf.apply(lambda row: make_trip_distance_bins(row), axis=1, meta=(None, 'float64'))

lstToDrop = ['pickup_datetime', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'comp_trip_distance']

ddf = ddf.drop(lstToDrop, axis=1)

ddf['post_treatment_time_dummy'] = ddf.apply(lambda row: make_post_treatment_time_dummy(row), axis=1, meta=(None, 'int64'))
ddf = ddf[(ddf.post_treatment_time_dummy > -1)]

ddf = ddf.groupby(['pickup_date', 'comp_dist_bins', 'post_treatment_time_dummy']).count().reset_index()
ddf.columns = ['pickup_date', 'comp_dist_bins', 'post_treatment_time_dummy', 'RidesCount']

ddf['pickup_date'] = dd.to_datetime(ddf['pickup_date'])

# monday = 0, sunday=6
ddf['pickup_day_of_week'] = ddf['pickup_date'].dt.weekday




In [69]:
print("Start Time =", dt.datetime.now().strftime("%H:%M:%S"))

t1 = time()
ddf = ddf.compute()
ddf = ddf.join(pd.get_dummies(ddf['pickup_day_of_week']))
# https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
ddf.rename(columns={0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}, inplace=True)


ddf = ddf[ddf['pickup_date'].dt.year == 2010]

ddf.to_csv('data/afternoon_treatment_dist_bins_2010_dow.csv', index=False )
t2 = time()
elapsed_s = (t2 - t1)

print("Stop Time =", dt.datetime.now().strftime("%H:%M:%S"))
print('Elapsed time is %f seconds.' % elapsed)

Start Time = 12:39:22
Stop Time = 13:57:55
Elapsed time is 0.144816 seconds.


Stop Time = 12:33:56
Elapsed time is 0.144816 minutes.


In [151]:
ddf.dtypes

pickup_date                  datetime64[ns]
comp_dist_bins                      float64
post_treatment_time_dummy             int64
Count                                 int64
dtype: object

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e5c38169-dd81-4fdb-9628-d1699354129b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>