In [12]:
import pyspark
from pyspark.sql import SparkSession, types
import pandas as pd

In [2]:
spark = SparkSession.builder.master("local[*]").appName('test').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/28 19:49:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df_green = spark.read \
    .option("header","true") \
    .csv('data/raw/green/2021/01/')

                                                                                

In [5]:
df_green.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)



In [8]:
# we've loaded the data into a dataframe, but it has no schema, everything is in a string.
# A way to solve this is to use pandas to read the file and use the inferred schema from pandas

df_green_pd = pd.read_csv('data/raw/green/2021/01/green_tripdata_2021-01.csv.gz', nrows=1000)

df_green_pd.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1,43,151,1,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2,1,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1,166,239,1,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1,1,2.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1,41,42,1,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1,1,0.0
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1,168,75,1,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2,1,0.0
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2,265,265,3,0.0,-52.0,0.0,-0.5,0.0,0.0,,-0.3,-52.8,3,1,0.0


In [10]:
# Let's turn this pandas dataframe into a spark dataframe and get the schema from it. 
spark.createDataFrame(df_green_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('lpep_pickup_datetime', StringType(), True), StructField('lpep_dropoff_datetime', StringType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('RatecodeID', LongType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('ehail_fee', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('payment_type', LongType(), True), StructField('trip_type', LongType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [16]:
# Take that output into a vscode file and edit it to get it in the right format, as well as using the "types" library


green_schema = types.StructType([
    types.StructField('VendorID', types.IntegerType(), True),
    types.StructField('lpep_pickup_datetime', types.TimestampType(), True),
    types.StructField('lpep_dropoff_datetime', types.TimestampType(), True),
    types.StructField('store_and_fwd_flag', types.StringType(), True),
    types.StructField('RatecodeID', types.IntegerType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('passenger_count', types.IntegerType(), True),
    types.StructField('trip_distance', types.DoubleType(), True),
    types.StructField('fare_amount', types.DoubleType(), True),
    types.StructField('extra', types.DoubleType(), True),
    types.StructField('mta_tax', types.DoubleType(), True),
    types.StructField('tip_amount', types.DoubleType(), True),
    types.StructField('tolls_amount', types.DoubleType(), True),
    types.StructField('ehail_fee', types.DoubleType(), True),
    types.StructField('improvement_surcharge', types.DoubleType(), True),
    types.StructField('total_amount', types.DoubleType(), True),
    types.StructField('payment_type', types.IntegerType(), True),
    types.StructField('trip_type', types.IntegerType(), True),
    types.StructField('congestion_surcharge', types.DoubleType(), True)
])

In [17]:
# now let's read the spark dataframe again but using a schema

df_green = spark.read \
    .option("header","true") \
    .schema(green_schema) \
    .csv('data/raw/green/2021/01/')

In [18]:
df_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [19]:
# Let's do the same thing for yellow data

df_yellow_pd = pd.read_csv('data/raw/yellow/2021/01/yellow_tripdata_2021-01.csv.gz', nrows=1000)

In [20]:
spark.createDataFrame(df_yellow_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', StringType(), True), StructField('tpep_dropoff_datetime', StringType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [21]:
# now that it's edited in vscode and the correct types added, let's save it

yellow_schema = types.StructType([
    types.StructField('VendorID', types.IntegerType(), True), 
    types.StructField('tpep_pickup_datetime', types.TimestampType(), True), 
    types.StructField('tpep_dropoff_datetime', types.TimestampType(), True), 
    types.StructField('passenger_count', types.IntegerType(), True), 
    types.StructField('trip_distance', types.DoubleType(), True), 
    types.StructField('RatecodeID', types.IntegerType(), True), 
    types.StructField('store_and_fwd_flag', types.StringType(), True), 
    types.StructField('PULocationID', types.IntegerType(), True), 
    types.StructField('DOLocationID', types.IntegerType(), True), 
    types.StructField('payment_type', types.IntegerType(), True), 
    types.StructField('fare_amount', types.DoubleType(), True), 
    types.StructField('extra', types.DoubleType(), True), 
    types.StructField('mta_tax', types.DoubleType(), True), 
    types.StructField('tip_amount', types.DoubleType(), True), 
    types.StructField('tolls_amount', types.DoubleType(), True), 
    types.StructField('improvement_surcharge', types.DoubleType(), True), 
    types.StructField('total_amount', types.DoubleType(), True), 
    types.StructField('congestion_surcharge', types.DoubleType(), True)
])

In [23]:
df_yellow = spark.read \
            .option("header", "true") \
            .schema(yellow_schema) \
            .csv('data/raw/yellow/2021/01/')


In [25]:
# Now let's work in spark, starting with green

year = 2020
taxi_type = 'green'

for month in range (1,13):
    print(f'processing data for {taxi_type} {year}/{month}')
    input_path = f'data/raw/{taxi_type}/{year}/{month:02d}/'
    output_path = f'data/pq/{taxi_type}/{year}/{month:02d}/'

    # Remember to run the prepared green schema
    df_green = spark.read \
        .option("header","true") \
        .schema(green_schema) \
        .csv(input_path)
    
    # Let's repartition to 4 partitions to take advantage of the 4 cores on the machine
    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for green 2020/1


                                                                                

processing data for green 2020/2


                                                                                

processing data for green 2020/3


                                                                                

processing data for green 2020/4
processing data for green 2020/5
processing data for green 2020/6
processing data for green 2020/7
processing data for green 2020/8
processing data for green 2020/9
processing data for green 2020/10
processing data for green 2020/11
processing data for green 2020/12


In [30]:
year = 2021
taxi_type = 'yellow'

for month in range (1,13):
    print(f'processing data for {taxi_type} {year}/{month}')
    input_path = f'data/raw/{taxi_type}/{year}/{month:02d}/'
    output_path = f'data/pq/{taxi_type}/{year}/{month:02d}/'

    # Remember to run the prepared yellow schema
    df_green = spark.read \
        .option("header","true") \
        .schema(yellow_schema) \
        .csv(input_path)
    
    # Let's repartition to 4 partitions to take advantage of the 4 cores on the machine
    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for yellow 2021/1


                                                                                

processing data for yellow 2021/2


                                                                                

processing data for yellow 2021/3


                                                                                

processing data for yellow 2021/4


                                                                                

processing data for yellow 2021/5


                                                                                

processing data for yellow 2021/6


                                                                                

processing data for yellow 2021/7


[Stage 116:>                                                        (0 + 4) / 4]

processing data for yellow 2021/8


                                                                                

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/leo/leo_data_engineering/05-batch/code/data/raw/yellow/2021/08.