### Run Spark

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) a Spark session running locally under the app name 'test'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

### Download "High Volume For-Hire Vehicle Trip Records".

* https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
! wget https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet

* The file is in Parquet format, so I need to convert it to CSV first. (This step is not covered in the course video.)
    * It will take a few minutes...

In [None]:
import pandas as pd

# Base name shared by the downloaded Parquet file and the CSV derived from it.
fname = 'fhvhv_tripdata_2024-01'
fpath_parquet = f'{fname}.parquet'
fpath_csv = f'{fname}.csv'

# Load the whole Parquet file with pandas, then write it out as CSV
# without the pandas index column.
df = pd.read_parquet(fpath_parquet)
df.to_csv(fpath_csv, index=False)

* Number of lines in the CSV file (data rows plus one header line)

In [None]:
# Count the lines of the converted CSV; the total includes the header line.
! wc -l {fpath_csv}

### Read the File with Spark

* Open the Spark UI and you'll notice that a job named `csv at NativeMethodAccessorImpl.java:0` has been created.

In [None]:
# Read the CSV with Spark, treating the first line as column names.
# No schema is supplied here.
df = (
    spark.read
    .option('header', 'true')
    .csv(fpath_csv)
)

* All the columns are typed as strings by default.

In [None]:
# Run each.
df.schema
# df.show(5)
# df.head(5)

* Create a small sample file from the original file.

In [None]:
# Keep the first 101 lines: the header line plus the first 100 data rows.
! head -n 101 {fpath_csv} > head.csv

In [None]:
# Peek at the first five lines of the sample file.
! head -n 5 head.csv

In [None]:
# Confirm how many lines the sample file has.
! wc -l head.csv

In [None]:
import pandas as pd

# Load the small sample file with pandas so its inferred dtypes can be inspected.
fpath_head_csv = 'head.csv'
df_pandas = pd.read_csv(fpath_head_csv)

* By the way, pandas infers integer and float types for the numeric columns (unlike Spark, which types every column as a string by default).

In [None]:
# Show the dtype pandas inferred for each column of the sample.
df_pandas.dtypes

### Enforce a Custom Schema using Spark

* Define a custom schema.

In [None]:
from pyspark.sql import types

In [None]:
# (column name, Spark type) pairs for the trip records; every field is nullable.
_fhvhv_columns = [
    ('hvfhs_license_num', types.StringType()),
    ('dispatching_base_num', types.StringType()),
    ('originating_base_num', types.StringType()),
    ('request_datetime', types.TimestampType()),
    ('on_scene_datetime', types.TimestampType()),
    ('pickup_datetime', types.TimestampType()),
    ('dropoff_datetime', types.TimestampType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('trip_miles', types.FloatType()),
    ('trip_time', types.IntegerType()),
    ('base_passenger_fare', types.FloatType()),
    ('tolls', types.FloatType()),
    ('bcf', types.FloatType()),
    ('sales_tax', types.FloatType()),
    ('congestion_surcharge', types.FloatType()),
    ('airport_fee', types.FloatType()),
    ('tips', types.FloatType()),
    ('driver_pay', types.FloatType()),
    ('shared_request_flag', types.StringType()),
    ('shared_match_flag', types.StringType()),
    ('access_a_ride_flag', types.StringType()),
    ('wav_request_flag', types.StringType()),
    ('wav_match_flag', types.StringType()),
]

# Build the StructType from the pairs above, keeping their order.
schema = types.StructType(
    [
        types.StructField(col_name, col_type, True)
        for col_name, col_type in _fhvhv_columns
    ]
)

In [None]:
# Re-read the CSV, this time applying the explicit schema to the columns.
df = (
    spark.read
    .option('header', 'true')
    .schema(schema)
    .csv(fpath_csv)
)

* Now each column has a more appropriate data type.

In [None]:
# Fetch the first two rows to check the parsed values.
df.head(2)

### Save as a Parquet File (using Partitions)

* `repartition` is lazy: it returns a new DataFrame and does not shuffle anything yet. The data is actually repartitioned only when an action runs, such as saving the data.

In [None]:
# `repartition` returns a new DataFrame instead of modifying `df` in place,
# so the result must be assigned back for the later write to use 24 partitions.
df = df.repartition(24)

* Save data as a parquet file.

In [None]:
# Write the DataFrame as Parquet, replacing any existing output at this path.
df.write.mode('overwrite').parquet('fhvhv/2024/01/')

* Open a terminal and go to `fhvhv/2024/01/`, then you'll find 24 partitions have been created.