### Run Spark

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session, or reuse one that is already running.
# 'local[*]' runs Spark locally; the Arrow setting enables Arrow-based
# transfers between Spark and pandas.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

### Read Parquet File

In [None]:
# Relative path of the folder containing the Parquet files to load.
input_folderpath = 'fhvhv/2024/01'

# Generic reader form; equivalent to spark.read.parquet(input_folderpath).
df = spark.read.format('parquet').load(input_folderpath)

In [None]:
# Print the column names and types of the DataFrame that was read above.
df.printSchema()

### Select

* `select` and `filter` are not executed right away. (**Transformations**)
    * No job is executed, so nothing new appears in the Spark UI.

In [None]:
# Builds a query only: no action is called here, so Spark computes nothing.
(
    df
    .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')
    .filter(df['hvfhs_license_num'] == 'HV0003')
)

* When you call `show`, the query is executed right away. (**Action**)
    * A new job appears in the Spark UI.

In [None]:
# Same query as before, but the trailing show() makes Spark run it and
# print the first rows.
(
    df
    .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show()
)

### Transformations vs Actions

1. **Transformations**: Lazy (Not Executed Immediately)
- `select`
- `filter`
- `join`
- `groupBy`

2. **Actions**: Eager (Executed Immediately)
- `show`
- `take`
- `head`
- `write`

### Functions Available in Spark

In [None]:
from pyspark.sql import functions as F

In [None]:
# Derive date-only columns from the two timestamp columns with the built-in
# to_date function, then display them next to the location IDs.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

### User-defined Functions

In [None]:
def convert_to_hex(base_num):
    """Turn a base number string into a short, prefixed hexadecimal code.

    The first character of ``base_num`` is dropped and the remainder is
    parsed as a decimal integer. The result is that integer in lowercase
    hexadecimal, zero-padded to at least three digits, behind a one-letter
    prefix chosen by divisibility.

    Args:
        base_num: String such as ``'B02764'``, or ``None`` for a null value.

    Returns:
        ``'s/<hex>'`` when the number is divisible by 7, ``'a/<hex>'`` when
        it is divisible by 3 (but not 7), ``'e/<hex>'`` otherwise. ``None``
        when ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    # Null column values reach a Python UDF as None. Pass the null through
    # instead of raising TypeError on the slice, which would fail the job.
    if base_num is None:
        return None

    num = int(base_num[1:])

    # Divisibility by 7 takes precedence over divisibility by 3.
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'

    return f'{prefix}/{num:03x}'

from pyspark.sql import types

# Wrap the plain Python function as a Spark UDF that returns a string column.
convert_to_hex_udf = F.udf(
    f=convert_to_hex,
    returnType=types.StringType(),
)

# Add the date-only columns and a 'base_id' column computed by the UDF from
# dispatching_base_num, then display the selected columns.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .withColumn('base_id', convert_to_hex_udf(df['dispatching_base_num']))
    .select('pickup_date', 'dropoff_date', 'base_id', 'PULocationID', 'DOLocationID')
    .show()
)