In [None]:
#1] For the following data and schema, create a DataFrame and perform the following operations:
#a) Change the data type of expenses to Integer
#b) Rename dob to DateOfBirth
#c) Create a column whose value is expenses * 5

#Solution:

# Install PySpark
!pip install pyspark

# Import necessary classes and functions
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col

# Obtain the Spark session for this exercise (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("DataFrameOperations")
    .getOrCreate()
)

# Schema: `name` is a struct of three nullable strings; every other column
# (including `expenses`) starts out as a nullable string.
schema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('dob', StringType(), True)
    .add('gender', StringType(), True)
    .add('expenses', StringType(), True)
)

# Build one Row per person from flat tuples of
# (firstname, middlename, lastname, dob, gender, expenses).
# Note: The semicolon in "James;" is included as per the provided data
data = [
    Row(name=Row(first, middle, last), dob=dob, gender=gender, expenses=expenses)
    for first, middle, last, dob, gender, expenses in [
        ("James;", "", "Smith", "36636", "M", "20000"),
        ("Michael", "Rose", "", "40288", "M", "40000"),
        ("Robert", "", "Williams", "42114", "M", "10000"),
        ("Maria", "Anne", "Jones", "39192", "F", "45000"),
        ("Jen", "Mary", "Brown", "", "F", "-1"),
    ]
]

# Materialise the rows as a DataFrame with the explicit schema above
df = spark.createDataFrame(data, schema=schema)

print("--- Initial DataFrame and Schema ---")
df.show()
df.printSchema()

# Apply the three requested operations in a single chain:
#   a) cast `expenses` from string to integer (replaces the column in place)
#   b) rename `dob` to `DateOfBirth`
#   c) add `expense_multiplied` = expenses * 5; at this point `expenses`
#      already refers to the integer column produced by step a)
final_df = (
    df
    .withColumn("expenses", col("expenses").cast(IntegerType()))
    .withColumnRenamed("dob", "DateOfBirth")
    .withColumn("expense_multiplied", col("expenses") * 5)
)

print("\n--- Final DataFrame and Schema after Operations ---")
final_df.show(truncate=False)
final_df.printSchema()

# Release the Spark resources held by this session
spark.stop()

--- Initial DataFrame and Schema ---
+--------------------+-----+------+--------+
|                name|  dob|gender|expenses|
+--------------------+-----+------+--------+
|   {James;, , Smith}|36636|     M|   20000|
|   {Michael, Rose, }|40288|     M|   40000|
|{Robert, , Williams}|42114|     M|   10000|
|{Maria, Anne, Jones}|39192|     F|   45000|
|  {Jen, Mary, Brown}|     |     F|      -1|
+--------------------+-----+------+--------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- expenses: string (nullable = true)


--- Final DataFrame and Schema after Operations ---
+--------------------+-----------+------+--------+------------------+
|name                |DateOfBirth|gender|expenses|expense_multiplied|
+--------------------+-----------+------+--------+------------------+
|

In [None]:
#2] Create a DataFrame with a nested array column. Perform the following operations:
#a) Flatten the nested array
#b) Explode the nested array
#c) Convert an array of strings to a string column.

#Solution:

# Install PySpark
!pip install pyspark

# Import necessary functions and types
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, flatten, explode, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Obtain the Spark session for this exercise (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("NestedArrayOperations")
    .getOrCreate()
)


def _report(title, frame):
    """Print a section title, then the frame's rows (untruncated) and its schema."""
    print(title)
    frame.show(truncate=False)
    frame.printSchema()


# Each person carries a nested array of [subject, score] string pairs
data = [
    ("Alice", [["Math", "90"], ["Science", "85"]]),
    ("Bob", [["History", "88"], ["English", "92"]]),
    ("Charlie", [["Art", "95"]])
]

# Schema: a string `name` plus `subjects`, an array of arrays of strings
schema = (
    StructType()
    .add("name", StringType())
    .add("subjects", ArrayType(ArrayType(StringType())))
)

# Materialise the rows as a DataFrame with the explicit schema above
df = spark.createDataFrame(data, schema=schema)

_report("--- Initial DataFrame with Nested Array ---", df)

# a) Flatten: merge the inner arrays of `subjects` into a single array column
flattened_df = df.withColumn(
    "flat_subjects",
    flatten(col("subjects")),
)

_report("--- a) DataFrame with Flattened Array ---", flattened_df)

# b) Explode: emit one output row per inner array of `subjects`
exploded_df = df.withColumn(
    "exploded_subjects",
    explode(col("subjects")),
)

_report("--- b) DataFrame with Exploded Array ---", exploded_df)

# c) Array -> string: start from the flattened frame of step a) and join the
#    elements of `flat_subjects` with ", " into one string column
string_df = flattened_df.withColumn(
    "subjects_string",
    concat_ws(", ", col("flat_subjects")),
)

_report("--- c) DataFrame with Array Converted to String ---", string_df)

# Release the Spark resources held by this session
spark.stop()

--- Initial DataFrame with Nested Array ---
+-------+------------------------------+
|name   |subjects                      |
+-------+------------------------------+
|Alice  |[[Math, 90], [Science, 85]]   |
|Bob    |[[History, 88], [English, 92]]|
|Charlie|[[Art, 95]]                   |
+-------+------------------------------+

root
 |-- name: string (nullable = true)
 |-- subjects: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)

--- a) DataFrame with Flattened Array ---
+-------+------------------------------+--------------------------+
|name   |subjects                      |flat_subjects             |
+-------+------------------------------+--------------------------+
|Alice  |[[Math, 90], [Science, 85]]   |[Math, 90, Science, 85]   |
|Bob    |[[History, 88], [English, 92]]|[History, 88, English, 92]|
|Charlie|[[Art, 95]]                   |[Art, 95]                 |
+-------+----------------------------