### Import Libraries

In [8]:
import os
import shutil
from pyspark.sql import types
from pyspark.sql import SparkSession

### Start Spark Session

In [10]:
# Build (or reuse, via getOrCreate) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [25]:
# Display the version string of the running Spark session
# (recorded here so the results can be tied to a specific Spark release).
spark.version

'3.3.2'

### Repartition the June 2021 FHVHV Data into 12 partitions and save it to Parquet. What is the average size of the Parquet Files?

In [27]:
# Download the FHVHV 2021-06 data from here: (https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz).

if not os.path.exists('./data/raw/fhvhv_tripdata_2021-06.csv.gz'):
    os.makedirs('./data/raw')

    # Download data.
    !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz

    # Move data.
    shutil.move('./fhvhv_tripdata_2021-06.csv.gz', './data/raw/')

--2023-03-06 00:06:02--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e-a6da-4923-ad6f-35ff02446a51?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230306%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230306T000602Z&X-Amz-Expires=300&X-Amz-Signature=4085d8fe71201e29d3816a221ba1e2986106a121be44e402a9115a6f459a793e&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhvhv_tripdata_2021-06.csv.gz&response-content-type=application%2Foctet-stream [following]
--2023-03-06 00:06:02--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e

In [29]:
# Load the gzipped CSV, treating the first line as column names.
# No schema is supplied for this first read.
df = (
    spark.read
    .option("header", "true")
    .csv('./data/raw/fhvhv_tripdata_2021-06.csv.gz')
)

In [30]:
# Inspect the schema as a StructType object.
# The CSV was read without an explicit schema, and every column shows up as a string.
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', StringType(), True), StructField('DOLocationID', StringType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [31]:
# Print the same schema as a tree, which is easier to read than the StructType repr.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [32]:
# Explicit schema for the FHVHV CSV: pickup/dropoff columns become timestamps,
# the two location IDs become integers, everything else stays a string.
# All columns are nullable.
schema = (
    types.StructType()
    .add('dispatching_base_num', types.StringType(), True)
    .add('pickup_datetime', types.TimestampType(), True)
    .add('dropoff_datetime', types.TimestampType(), True)
    .add('PULocationID', types.IntegerType(), True)
    .add('DOLocationID', types.IntegerType(), True)
    .add('SR_Flag', types.StringType(), True)
    .add('Affiliated_base_number', types.StringType(), True)
)

In [34]:
# Re-read the CSV, this time applying the explicit schema so columns get
# their declared types.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('./data/raw/fhvhv_tripdata_2021-06.csv.gz')
)

In [35]:
# Confirm the explicit schema was applied: the datetime columns should now be
# timestamps and the location IDs integers.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [36]:
# Redistribute the rows into 12 partitions before writing.
df = df.repartition(numPartitions=12)

In [37]:
# Write the repartitioned DataFrame as Parquet files under the output directory.
# mode('overwrite') replaces any existing output; the default save mode raises
# an error when the path already exists, which breaks re-running this cell.
df.write.mode('overwrite').parquet('./data/processed/fhvhv/2021/06/')

                                                                                