### Prepare data

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

# Exploratory load of the raw FHV CSV files: the first line of each file is
# treated as a header and no explicit schema is supplied, so the column types
# Spark picks by default can be inspected before a schema is written by hand.
# The path points at the FHV directory (it previously pointed at the yellow
# taxi data, which does not match the `df_fhv` name or the other FHV paths
# used throughout this notebook).
df_fhv = (
    spark.read
    .option("header", "true")
    .csv('./data/raw/fhv/')
)

In [None]:
# Print the first five rows as a quick sanity check of the load.
df_fhv.show(n=5)

In [None]:
# Display the schema Spark assigned to the DataFrame loaded without an
# explicit schema; used as a starting point for the hand-written one.
df_fhv.schema

In [None]:
import pandas as pd

# Let pandas infer column dtypes from the first 100 rows of the October 2019
# FHV file; only the resulting dtypes are displayed.
(
    pd.read_csv(
        "./data/raw/fhv/fhv_tripdata_2019-10.csv.gz",
        nrows=100,
    )
    .dtypes
)

In [None]:
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Explicit schema for the FHV trip files. Every column is nullable; the two
# datetime columns are timestamps, the location ids are integers and the
# remaining columns are kept as strings.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('dispatching_base_num', StringType()),
        ('pickup_datetime', TimestampType()),
        ('dropOff_datetime', TimestampType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('SR_Flag', StringType()),
        ('Affiliated_base_number', StringType()),
    )
])

In [None]:
# Reload the raw FHV CSV files, this time applying the hand-written schema
# instead of relying on Spark's default column types.
df_fhv = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('./data/raw/fhv/')
)

# Confirm that the columns now carry the intended types.
df_fhv.printSchema()

In [None]:
# Fetch the first five rows as Row objects to eyeball the typed values.
df_fhv.head(n=5)

### HW questions

In [None]:
# Q1: display the version string of the active Spark session.
spark.version

In [None]:
# Q2: redistribute the data into 6 partitions and persist it as Parquet
# (one output file per partition).
df_fhv = df_fhv.repartition(6)
# mode='overwrite' lets this cell be re-run: with the default save mode the
# write raises an error as soon as the output directory already exists.
df_fhv.write.parquet('./data/parquet/fhv/', mode='overwrite')

In [None]:
# IPython shell escape: list the Parquet output directory with
# human-readable file sizes.
!ls -lh ./data/parquet/fhv/

In [None]:
#Q3
from pyspark.sql import functions as F

# Count the trips whose pickup timestamp, truncated to a calendar date,
# equals 2019-10-15.
df_fhv.where(
    F.to_date("pickup_datetime") == "2019-10-15"
).count()

In [None]:
# Show the three longest trips. The duration is the difference between the
# drop-off and pickup timestamps, each cast to long (whole seconds), divided
# by 3600 to express it in hours.
(
    df_fhv
    .select(
        "pickup_datetime",
        "dropOff_datetime",
        (
            (
                F.col("dropOff_datetime").cast("long")
                - F.col("pickup_datetime").cast("long")
            )
            / 3600
        ).alias("trip_duration"),
    )
    .orderBy(F.desc("trip_duration"))
    .show(3)
)