In [0]:
import getpass
# Login name of the user running the notebook; the read cells below use it
# to build the /user/<username>/retail_db_parquet/orders path.
username = getpass.getuser()

In [0]:
# Preview the orders data set; no schema is supplied, so Spark determines it
# from the Parquet files.
spark.read. \
    parquet(f'/user/{username}/retail_db_parquet/orders'). \
    show()

In [0]:
# Show the (column name, type) pairs Spark reports when no schema is supplied.
spark.read. \
    parquet(f'/user/{username}/retail_db_parquet/orders'). \
    dtypes

In [0]:
# First attempt at an explicit schema, written as a DDL-style string:
# both id columns are declared INT and order_date is declared TIMESTAMP.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Supplying the schema up front means no data has to be read to infer it.
# This read is expected to fail: order_id and order_customer_id cannot be
# converted to INT. The error is caught and summarized so that the
# demonstration does not stop a "Restart & Run All".
try:
    spark.read.schema(schema).parquet(f'/user/{username}/retail_db_parquet/orders').show()
except Exception as exc:  # concrete exception class depends on the PySpark version
    print(f'Read failed with {type(exc).__name__}:')
    # Only the first few lines: the full text may carry a long JVM stack trace.
    print('\n'.join(str(exc).splitlines()[:5]))

In [0]:
# Second attempt: the id columns are widened to BIGINT;
# order_date is still declared TIMESTAMP.
schema = """
    order_id BIGINT,
    order_date TIMESTAMP,
    order_customer_id BIGINT,
    order_status STRING
"""

In [0]:
# This read is expected to fail: order_date cannot be type cast to TIMESTAMP
# because it is represented as a string in the files. The error is caught and
# summarized so that the demonstration does not stop a "Restart & Run All".
try:
    spark.read.schema(schema).parquet(f'/user/{username}/retail_db_parquet/orders').show()
except Exception as exc:  # concrete exception class depends on the PySpark version
    print(f'Read failed with {type(exc).__name__}:')
    # Only the first few lines: the full text may carry a long JVM stack trace.
    print('\n'.join(str(exc).splitlines()[:5]))

In [0]:
# Third attempt, following the two failed reads above:
# ids are BIGINT and order_date is kept as STRING.
schema = """
    order_id BIGINT,
    order_date STRING,
    order_customer_id BIGINT,
    order_status STRING
"""

In [0]:
# Pass the schema as a keyword argument. DataFrameReader.parquet() takes only
# paths plus reader options and has no schema parameter, so a schema handed to
# it is not applied; load() does accept one, so the read goes through load().
spark.read.load(
    f'/user/{username}/retail_db_parquet/orders',
    format='parquet',
    schema=schema
).show()

In [0]:
# Builder form: attach the schema to the reader first, then read the Parquet files.
spark.read. \
    schema(schema). \
    parquet(f'/user/{username}/retail_db_parquet/orders'). \
    show()

In [0]:
from pyspark.sql.types import StructType, StructField, LongType, StringType

In [0]:
# The same four columns as the last DDL-string schema, expressed with Spark
# type objects. nullable=True is the StructField default and is spelled out
# here only to make it visible.
schema = StructType([
    StructField('order_id', LongType(), nullable=True),
    StructField('order_date', StringType(), nullable=True),
    StructField('order_customer_id', LongType(), nullable=True),
    StructField('order_status', StringType(), nullable=True),
])

In [0]:
# Read the orders again, this time with the schema currently bound to `schema`.
spark.read. \
    schema(schema). \
    parquet(f'/user/{username}/retail_db_parquet/orders'). \
    show()

In [0]:
# Keep the DataFrame in a variable so the following cells can reuse it.
orders = spark.read. \
    schema(schema). \
    parquet(f'/user/{username}/retail_db_parquet/orders')

In [0]:
# truncate=False prints column values in full instead of shortening long strings.
orders.show(truncate=False)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Columns read as strings can be converted afterwards with cast();
# here order_date is replaced by its timestamp version and the resulting
# column types are listed.
(
    orders
    .withColumn('order_date', col('order_date').cast('timestamp'))
    .dtypes
)

In [0]:
# Same cast as in the previous cell, this time displaying the converted rows.
(
    orders
    .withColumn('order_date', col('order_date').cast('timestamp'))
    .show()
)