In [72]:
import numpy as np
import csv
%matplotlib inline
import pandas as pd
import re

from datetime import datetime

In [91]:
# Lookup table: location ID (the values compared against PULocationID /
# DOLocationID) -> region number.  Entries are grouped by region and kept in
# their original order.
id_to_region = {
    # region 1
    12: 1, 88: 1, 87: 1, 209: 1, 45: 1, 231: 1, 261: 1, 13: 1,
    # region 2
    158: 2, 249: 2, 113: 2, 114: 2, 79: 2, 4: 2, 232: 2, 148: 2, 144: 2,
    211: 2, 125: 2,
    # region 3
    246: 3, 50: 3, 48: 3, 68: 3, 90: 3, 186: 3, 100: 3, 230: 3, 163: 3,
    161: 3, 164: 3, 234: 3, 107: 3, 170: 3, 162: 3, 229: 3, 233: 3, 137: 3,
    224: 3,
    # region 4
    143: 4, 142: 4, 239: 4, 238: 4, 151: 4, 24: 4,
    # region 5
    75: 5, 236: 5, 263: 5, 262: 5, 140: 5, 141: 5, 237: 5,
    # region 6
    166: 6, 41: 6, 74: 6, 42: 6, 152: 6, 116: 6,
    # region 7
    244: 7, 120: 7, 243: 7, 127: 7, 128: 7,
}
# Live view over every location ID that has a region assigned.
valid_ids = id_to_region.keys()
# Column names used to initialise the empty summary frame.
columns = [
    "weekday",
    "hour",
    "region",
    "avg_min_duration",
    "avg_mean_duration",
    "avg_max_duration",
    "avg_pickups",
    "avg_ride_cost",
    "count",
]

In [144]:
def filter_pickup(data_pickup, region_by_id=None):
    """Keep intra-location trips in mapped locations and tag them with a region.

    A row is kept when its ``PULocationID`` is one of the mapped location IDs
    *and* is equal to its ``DOLocationID``.  Kept rows receive a ``region``
    column looked up from the mapping; rows containing a missing value in any
    column are then dropped and the index is renumbered from 0.

    The input frame is not modified.  The function works on the frame it is
    given (not on a notebook global) and does not depend on the frame's index
    labels, so chunks whose index does not start at 0 are handled correctly.

    Parameters
    ----------
    data_pickup : pandas.DataFrame
        Trip records with at least ``PULocationID`` and ``DOLocationID``.
    region_by_id : mapping, optional
        Location ID -> region number.  When omitted, the module-level
        ``id_to_region`` mapping and ``valid_ids`` collection are used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, a ``region`` column and a fresh
        0..n-1 index.
    """
    if region_by_id is None:
        region_by_id = id_to_region
        allowed_ids = valid_ids
    else:
        allowed_ids = region_by_id.keys()

    pickup_ids = data_pickup["PULocationID"]
    # Same two conditions as the row-by-row check, evaluated for all rows at once.
    keep = pickup_ids.isin(list(allowed_ids)) & (pickup_ids == data_pickup["DOLocationID"])

    # .copy() so that adding the column never writes through to the caller's frame.
    kept = data_pickup.loc[keep].copy()
    kept["region"] = kept["PULocationID"].map(region_by_id)

    # Discard incomplete records, then renumber the rows.
    return kept.dropna(how="any").reset_index(drop=True)


def process_datetime(data_datetime):
    """Add ``weekday``, ``hour``, ``duration`` and ``count`` columns to a trip frame.

    ``weekday`` (Monday == 0) and ``hour`` are taken from
    ``tpep_pickup_datetime``; ``duration`` is the number of whole seconds
    between pickup and drop-off (truncated toward zero); ``count`` is 1 for
    every row.

    The columns are added to ``data_datetime`` itself and that same object is
    returned.  Values are computed column-wise and written positionally, so
    the result does not depend on the frame's index labels.

    Parameters
    ----------
    data_datetime : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        in a form ``pandas.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        ``data_datetime``, with the four columns added.
    """
    # Parse each column once instead of once per row.
    # NOTE: whole-column parsing expects one consistent timestamp format per column.
    pickup = pd.to_datetime(data_datetime["tpep_pickup_datetime"])
    dropoff = pd.to_datetime(data_datetime["tpep_dropoff_datetime"])

    # .to_numpy() makes the assignment positional; int64 matches the dtype a
    # list of Python ints would produce.
    data_datetime["weekday"] = pickup.dt.weekday.astype("int64").to_numpy()
    data_datetime["hour"] = pickup.dt.hour.astype("int64").to_numpy()
    data_datetime["duration"] = (
        (dropoff - pickup).dt.total_seconds().astype("int64").to_numpy()
    )
    data_datetime["count"] = 1
    return data_datetime


def compute_average(to_compute):
    """Summarise trips per (weekday, hour, region).

    For every combination of ``weekday``, ``hour`` and ``region`` the result
    holds the number of rows (``count``), the mean ``total_amount`` and the
    minimum, mean and maximum ``duration``.

    Parameters
    ----------
    to_compute : pandas.DataFrame
        Trip records carrying ``weekday``, ``hour``, ``region``, ``count``,
        ``total_amount`` and ``duration``.  The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per group.  Because ``duration`` is aggregated three ways the
        column labels are two-level, e.g. ``("duration", "mean")`` and
        ``("count", "count")``.
    """
    # Raw trip fields that play no part in the summary.
    unused_columns = [
        "PULocationID",
        "DOLocationID",
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "VendorID",
        "passenger_count",
        "RatecodeID",
        "store_and_fwd_flag",
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
    ]
    # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
    # which pandas 2.0 no longer accepts.  errors="ignore" because none of
    # these fields feed the aggregation, so a missing one is harmless.
    trimmed = to_compute.drop(columns=unused_columns, errors="ignore")

    return trimmed.groupby(["weekday", "hour", "region"], as_index=False).agg(
        {"count": "count", "total_amount": "mean", "duration": ["min", "mean", "max"]}
    )


def process_pipeline(df, chunk):
    """Summarise one raw chunk and add the result to the running summary.

    The chunk goes through ``filter_pickup``, ``process_datetime`` and
    ``compute_average``; the resulting rows are appended to ``df`` rather
    than replacing it, so summaries from earlier chunks are kept.

    Rows are per-chunk summaries: when the same (weekday, hour, region)
    occurs in several chunks it appears once per chunk in the result.  The
    per-chunk rows are not merged into a single row here.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Summary accumulated so far.  ``None`` or an empty frame (for example
        a column-only placeholder) means "nothing accumulated yet".
    chunk : pandas.DataFrame
        Raw trip records to process.

    Returns
    -------
    pandas.DataFrame
        The accumulated summary including this chunk.
    """
    chunk = filter_pickup(chunk)
    chunk = process_datetime(chunk)
    chunk = compute_average(chunk)

    # First contribution: hand back the chunk summary as-is so its column
    # labels define the accumulated frame (a placeholder may be labelled differently).
    if df is None or df.empty:
        return chunk
    # Nothing survived filtering in this chunk: keep what we already have.
    if chunk.empty:
        return df
    return pd.concat([df, chunk], ignore_index=True)

In [145]:
# Number of CSV rows handed to the pipeline per iteration.
chunksize = 10 ** 5
filename = "data/2018_Yellow_Taxi_Trip_Data_small.csv"
# Running total of rows read from the file so far.
current_size = 0
# Start from an empty frame that only carries the summary column names.
df = pd.DataFrame(columns=columns)

# Read the file chunk by chunk instead of loading it in one go; `df` is
# rebound to whatever process_pipeline returns on each pass.
for chunk in pd.read_csv(filename, chunksize=chunksize):
    current_size += len(chunk)
    df = process_pipeline(df, chunk)
    
# Display the first ten rows of the result.
df.head(10)

Unnamed: 0_level_0,weekday,hour,region,count,total_amount,duration,duration,duration
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,min,mean,max
0,2,22,5,1,6.96,232,232.0,232
1,3,5,3,1,3.3,10,10.0,10
2,3,5,4,1,4.8,92,92.0,92
3,3,5,6,1,3.8,71,71.0,71
4,3,6,1,3,6.533333,128,226.0,330
5,3,6,2,2,38.015,11,85.5,160
6,3,6,3,33,9.01303,3,197.393939,424
7,3,6,4,5,27.412,5,67.2,175
8,3,6,5,29,7.972759,35,188.965517,497
9,3,6,6,6,5.75,93,246.5,475
