In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the Spark session for this notebook: local master,
# application name 'test_groupby', Spark UI bound to port 4042.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test_groupby')
    .config("spark.ui.port", "4042")
    .getOrCreate()
)


In [61]:
# Load the raw trip CSVs for both services. The glob matches every file two
# directory levels below each service folder. The first row of each file is
# the header, and column types are inferred from the data.
df_green = spark.read.options(header=True, inferSchema=True).csv('data/raw/green/*/*')
df_yellow = spark.read.options(header=True, inferSchema=True).csv('data/raw/yellow/*/*')

In [3]:
# Expose the green trips to Spark SQL under the table name "green".
df_green.createOrReplaceTempView(name="green")

In [6]:
# Spark SQL version of the hourly revenue report for green taxis.
# Reads the "green" temp view, keeps pickups on or after 2020-01-01, and
# groups by (pickup time truncated to the hour, PULocationID). For each
# group it returns the summed total_amount ("amount") and the row count
# ("number_records"). `GROUP BY 1, 2` refers to the first two SELECT
# expressions by position. Note the location column is aliased to "zone" here.
df_green_revenue = spark.sql("""
SELECT 
    date_trunc('hour', lpep_pickup_datetime) AS hour, 
    PULocationID AS zone,

    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    green
WHERE
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2  
""")

In [7]:
# Print a preview of the aggregated result (show() with its default arguments).
df_green_revenue.show()

+-------------------+----+------------------+--------------+
|               hour|zone|            amount|number_records|
+-------------------+----+------------------+--------------+
|2020-01-01 00:00:00|  65|            199.49|            10|
|2020-01-01 02:00:00|  17|            200.03|            11|
|2020-01-01 10:00:00|  70|              30.8|             1|
|2020-01-01 11:00:00| 181|             49.64|             5|
|2020-01-02 03:00:00|   7|               9.8|             1|
|2020-01-02 06:00:00|   7|              53.1|             2|
|2020-01-02 06:00:00| 169|            104.09|             3|
|2020-01-02 07:00:00|  42|327.32000000000005|            21|
|2020-01-02 08:00:00| 174|50.309999999999995|             3|
|2020-01-02 10:00:00|  66|             97.67|             5|
|2020-01-02 12:00:00|  26|            280.47|            13|
|2020-01-02 13:00:00|  89|            121.85|             6|
|2020-01-02 14:00:00|  74|1029.3399999999995|            68|
|2020-01-02 16:00:00| 23

In [8]:
# DataFrame-API version of the hourly revenue report for green taxis.
# Keeps pickups on or after 2020-01-01, truncates the pickup timestamp to the
# hour, and aggregates per (hour, PULocationID).
#
# The aggregates are named explicitly with .alias() rather than renaming
# Spark's auto-generated "sum(total_amount)" / "count(1)" columns afterwards:
# withColumnRenamed is a silent no-op when the source name does not match, so
# the old pattern could leave mis-named columns without raising. Explicit
# expressions also fix the output column order:
# hour, PULocationID, amount, number_records.
df_green_revenue = (
    df_green
    .filter("lpep_pickup_datetime >= '2020-01-01 00:00:00'")
    .withColumn("hour", F.date_trunc("hour", "lpep_pickup_datetime"))
    .groupBy("hour", "PULocationID")
    .agg(
        F.sum("total_amount").alias("amount"),
        F.count("*").alias("number_records"),
    )
    .orderBy("hour", "PULocationID")
)



In [75]:
# Persist the green hourly revenue report as parquet, replacing any existing output.
df_green_revenue.write.mode("overwrite").parquet('data/report/revenue/green')

In [66]:
# Inspect the yellow dataset's column names. Its pickup timestamp column is
# tpep_pickup_datetime, whereas the green queries use lpep_pickup_datetime.
df_yellow.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge']

In [71]:
# Hourly revenue report for yellow taxis.
# Keeps pickups on or after 2020-01-01, truncates the pickup timestamp to the
# hour, and aggregates per (hour, PULocationID).
#
# The aggregates are named explicitly with .alias() rather than renaming
# Spark's auto-generated "sum(total_amount)" / "count(1)" columns afterwards:
# withColumnRenamed is a silent no-op when the source name does not match, so
# the old pattern could leave mis-named columns without raising. Explicit
# expressions also fix the output column order:
# hour, PULocationID, amount, number_records.
df_yellow_revenue = (
    df_yellow
    .filter("tpep_pickup_datetime >= '2020-01-01 00:00:00'")
    .withColumn("hour", F.date_trunc("hour", "tpep_pickup_datetime"))
    .groupBy("hour", "PULocationID")
    .agg(
        F.sum("total_amount").alias("amount"),
        F.count("*").alias("number_records"),
    )
    .orderBy("hour", "PULocationID")
)

In [73]:
# Persist the yellow hourly revenue report as parquet, replacing any existing output.
df_yellow_revenue.write.mode("overwrite").parquet('data/report/revenue/yellow')

In [77]:
# Prefix the metric columns with the service name so the two reports can be
# joined on (hour, PULocationID) without their metric columns colliding.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed(existing='amount', new='green_amount')
    .withColumnRenamed(existing='number_records', new='green_number_records')
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed(existing='amount', new='yellow_amount')
    .withColumnRenamed(existing='number_records', new='yellow_number_records')
)

In [80]:
# Combine both services per (hour, PULocationID). The outer join keeps keys
# that appear in only one of the two reports.
df_join = df_green_revenue_tmp.join(
    df_yellow_revenue_tmp,
    on=['hour', 'PULocationID'],
    how='outer',
)

In [81]:
# Persist the combined green + yellow report as parquet, replacing any existing output.
df_join.write.mode('overwrite').parquet('data/report/revenue/total')