In [21]:
import pyspark
from pyspark.sql import SparkSession
from datetime import datetime
from collections import namedtuple

# Obtain the session for this notebook: master "local[*]", application name 'test'.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [22]:
# Load every parquet file matching the two-level glob under data/pq/green.
df_green = spark.read.format('parquet').load('data/pq/green/*/*')

In [23]:
# Project down to the three columns the functions below read
# (pickup time, pickup location, total amount), then switch to the RDD API.
rdd = (
    df_green
    .select('lpep_pickup_datetime', 'PULocationID', 'total_amount')
    .rdd
)

In [24]:
# Print rows one field per line; n and truncate are the method defaults, spelled out.
df_green.show(n=20, truncate=True, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 2                   
 lpep_pickup_datetime  | 2020-01-23 13:10:15 
 lpep_dropoff_datetime | 2020-01-23 13:38:16 
 store_and_fwd_flag    | N                   
 RatecodeID            | 1                   
 PULocationID          | 74                  
 DOLocationID          | 130                 
 passenger_count       | 1                   
 trip_distance         | 12.77               
 fare_amount           | 36.0                
 extra                 | 0.0                 
 mta_tax               | 0.5                 
 tip_amount            | 2.05                
 tolls_amount          | 6.12                
 ehail_fee             | null                
 improvement_surcharge | 0.3                 
 total_amount          | 44.97               
 payment_type          | 1                   
 trip_type             | 1                   
 congestion_surcharge  | 0.0                 
-RECORD 1-------------------------

In [25]:
# Earliest pickup timestamp that is kept; rows before it are filtered out.
start = datetime(year=2020, month=1, day=1)

def filter_outliers(row):
    """Return True when the row's pickup time is on or after ``start``.

    Args:
        row: object exposing a ``lpep_pickup_datetime`` attribute holding a
            ``datetime`` or ``None``.

    Returns:
        bool: False for rows picked up before ``start`` and for rows whose
        pickup time is missing; True otherwise.
    """
    pickup = row.lpep_pickup_datetime
    # Comparing None with a datetime raises TypeError, so reject missing
    # timestamps explicitly instead of letting the predicate blow up.
    return pickup is not None and pickup >= start

def prepare_for_grouping(row):
    """Turn a trip row into a ``((hour, zone), (amount, 1))`` pair.

    The key combines the pickup timestamp truncated to the start of its hour
    with the pickup location ID; the value carries the trip's total amount
    and a count of one.
    """
    pickup_hour = row.lpep_pickup_datetime.replace(minute=0, second=0, microsecond=0)
    return (pickup_hour, row.PULocationID), (row.total_amount, 1)

def calculate_revenue(left_value, right_value):
    """Combine two ``(amount, count)`` pairs by adding them element-wise.

    Returns a new ``(amount, count)`` tuple holding the summed amounts and
    the summed counts.
    """
    (amount_a, count_a), (amount_b, count_b) = left_value, right_value
    return amount_a + amount_b, count_a + count_b

In [26]:
# Flat record for one aggregated result: hour, zone, revenue and count.
RevenueRow = namedtuple('RevenueRow', 'hour zone revenue count')

In [27]:
from pyspark.sql import types

def unwrap(row):
    """Flatten a ``((hour, zone), (revenue, count))`` pair into a ``RevenueRow``."""
    key, value = row[0], row[1]
    return RevenueRow(key[0], key[1], value[0], value[1])

In [28]:
# Explicit schema matching the RevenueRow fields, in order; every column is nullable.
result_schema = (
    types.StructType()
    .add('hour', types.TimestampType(), True)
    .add('zone', types.IntegerType(), True)
    .add('revenue', types.DoubleType(), True)
    .add('count', types.IntegerType(), True)
)