In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

Start by running the cell below to create a Spark DataFrame with complex data types.

In [0]:
# --- Schema -----------------------------------------------------------------
# Each entry of the `items` array is a struct describing one purchased product.
item_struct = StructType([
    StructField("product", StringType(), nullable=False),
    StructField("quantity", IntegerType(), nullable=False),
    StructField("unit_price", FloatType(), nullable=False),
])

# Delivery details are grouped in a single struct column.
delivery_struct = StructType([
    StructField("city", StringType(), nullable=False),
    StructField("postal_code", StringType(), nullable=False),
    StructField("status", StringType(), nullable=False),
])

schema = StructType([
    StructField("order_id", IntegerType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("items", ArrayType(item_struct), nullable=False),
    # `delivery` keeps the default nullability (nullable=True).
    StructField("delivery", delivery_struct),
    StructField("ratings", MapType(StringType(), IntegerType()), nullable=False),
])

# --- Rows -------------------------------------------------------------------
# One tuple per order: (order_id, customer_name, items, delivery, ratings).
data = [
    (
        101,
        "John Doe",
        [
            {"product": "Laptop", "quantity": 1, "unit_price": 1200.0},
            {"product": "Mouse", "quantity": 2, "unit_price": 25.0},
        ],
        {"city": "New York", "postal_code": "10001", "status": "Delivered"},
        {"service": 5, "product_quality": 3},
    ),
    (
        102,
        "Jane Smith",
        [
            {"product": "Monitor", "quantity": 2, "unit_price": 300.0},
            {"product": "Keyboard", "quantity": 1, "unit_price": 50.0},
        ],
        {"city": "San Francisco", "postal_code": "94105", "status": "Shipped"},
        {"service": 4, "product_quality": 4},
    ),
    (
        103,
        "Sam Brown",
        [
            {"product": "Headphones", "quantity": 3, "unit_price": 80.0},
        ],
        {"city": "Los Angeles", "postal_code": "90001", "status": "Pending"},
        {"service": 3, "product_quality": 3},
    ),
    (
        104,
        "Alicia Jones",
        [
            {"product": "Mobile Phone", "quantity": 1, "unit_price": 780.0},
        ],
        {"city": "New York", "postal_code": "10003", "status": "Shipped"},
        {"service": 4, "product_quality": 4},
    ),
    (
        105,
        "Michael Mayer",
        [
            {"product": "Charging Cable", "quantity": 2, "unit_price": 7.5},
            {"product": "Webcam", "quantity": 1, "unit_price": 170.0},
            {"product": "Headphones", "quantity": 1, "unit_price": 80.0},
        ],
        {"city": "New York", "postal_code": "10114", "status": "Delivered"},
        {"service": 5, "product_quality": 4},
    ),
]

# --- DataFrame --------------------------------------------------------------
df = spark.createDataFrame(data, schema)

df.display()

In [0]:
# Print the schema tree to confirm the nested array / struct / map column types.
df.printSchema()

This DataFrame has one ArrayType column (`items`), one StructType column (`delivery`) and one MapType column (`ratings`). Each element in the ArrayType column is a StructType.

Use the functions we've seen in the lesson to answer the questions. You may also have to use some other functions that we haven't seen in the lesson. You should be able to find all the information you need in the [Spark Functions API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html).

1. Extract the delivery city for each order.

In [0]:
# Pull the `city` field out of the `delivery` struct using bracket access.
df.select(
    f.col('order_id'),
    f.col('delivery')['city'].alias('delivery_city'),
).display()

2. Get the name of the first product in each order.

In [0]:
# element_at is 1-based, so index 1 is the first struct in the `items` array;
# the product name is then read from that struct.
df.select(
    f.col('order_id'),
    f.element_at(f.col('items'), 1)["product"].alias('first_product'),
).display()

3. Collapse the `items`, `delivery` and `ratings` columns into a single Struct column called `order_info`.

This new struct column should have one struct field for each original column. The keys of the struct fields should be the original column names.

In [0]:
# Pack the three complex columns into one struct; each struct field keeps the
# name of the column it was built from.
df.select(
    f.col('order_id'),
    f.col('customer_name'),
    f.struct(
        f.col('items'),
        f.col('delivery'),
        f.col('ratings'),
    ).alias('order_info'),
).display()

4. Create a new column with the service rating for each order. Call this new column `service_rating`.

In [0]:
# Look up the `service` key of the `ratings` map to get the service rating.
# (The map also holds `product_quality`, which is a different rating and must
# not be used for this column.)
(
    df
    .select(
        'order_id',
        f.col('ratings').getItem('service').alias('service_rating')
    )
).display()

5. Create a new boolean column called `perfect_service` that is True when the service rating is 5 and False otherwise.

In [0]:
# Compare the `service` entry of the ratings map with the top score (5) and
# store the boolean result alongside the existing columns.
df.withColumn(
    'perfect_service',
    f.col('ratings')['service'] == 5,
).display()

**BONUS: This question is a bit harder to answer!**

6. Increase the product quality rating by 1 for each order.

This change should be reflected in a new column, with the exact same content as the `ratings` column, except that the product quality rating is increased by 1.

In [0]:
# Rebuild the ratings map value-by-value: the `product_quality` entry is bumped
# by one, every other entry is passed through unchanged.
df.select(
    f.col('order_id'),
    f.transform_values(
        f.col("ratings"),
        lambda key, rating: (
            f.when(key == "product_quality", rating + 1)
            .otherwise(rating)
        ),
    ).alias('updated_ratings'),
).display()