In [None]:
#1] a) Create a data frame with today’s date and timestamp
#   b) Display the hours, minutes and seconds from the timestamp

#Solution:

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, current_timestamp, hour, minute, second, col


# Create (or reuse) the SparkSession used by this exercise
spark = SparkSession.builder.appName("DateTimeOperations").getOrCreate()

# a) Create a DataFrame with today's date and timestamp.
# current_date() / current_timestamp() are evaluated afresh each time an
# action runs, so showing the frame in (a) and a derived frame in (b) would
# report two different timestamps. Collect the single row once and rebuild the
# DataFrame from it (same schema) so both parts work on the same instant.
live_df = spark.range(1).select(
    "id",
    current_date().alias("today_date"),
    current_timestamp().alias("current_ts"),
)
df = spark.createDataFrame(live_df.collect(), schema=live_df.schema)

print("--- DataFrame with Date and Timestamp ---")
df.show(truncate=False)
df.printSchema()


# b) Display the hours, minutes, and seconds from the timestamp.
# Keep every existing column and append the three extracted components.
time_df = df.select(
    "*",
    hour("current_ts").alias("hour"),
    minute("current_ts").alias("minute"),
    second("current_ts").alias("second"),
)

print("\n--- DataFrame with Extracted Time Components ---")
time_df.show(truncate=False)
time_df.printSchema()


# Stop the SparkSession to release its resources
spark.stop()

--- DataFrame with Date and Timestamp ---
+---+----------+--------------------------+
|id |today_date|current_ts                |
+---+----------+--------------------------+
|0  |2025-10-03|2025-10-03 05:42:21.410844|
+---+----------+--------------------------+

root
 |-- id: long (nullable = false)
 |-- today_date: date (nullable = false)
 |-- current_ts: timestamp (nullable = false)


--- DataFrame with Extracted Time Components ---
+---+----------+--------------------------+----+------+------+
|id |today_date|current_ts                |hour|minute|second|
+---+----------+--------------------------+----+------+------+
|0  |2025-10-03|2025-10-03 05:42:25.508282|5   |42    |25    |
+---+----------+--------------------------+----+------+------+

root
 |-- id: long (nullable = false)
 |-- today_date: date (nullable = false)
 |-- current_ts: timestamp (nullable = false)
 |-- hour: integer (nullable = false)
 |-- minute: integer (nullable = false)
 |-- second: integer (nullable = false)



In [None]:
#2] For the employee data below (name, department and salary), perform the given operations:

#a) Create a data frame for the employee data
#b) Display the average salary
#c) Display the number of unique departments
#d) Display the number of employees with unique salary

#Solution:

# ==============================================================================
# 1. SETUP: Install PySpark and import all necessary modules
# ==============================================================================
!pip install pyspark

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, flatten, explode, concat_ws, current_date, current_timestamp, hour, minute, second, avg, countDistinct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# ==============================================================================
# 2. INITIALIZE SPARK SESSION
# ==============================================================================
# Build (or fetch the already-running) session under a single application name.
spark = (
    SparkSession.builder
    .appName("AllInOnePySpark")
    .getOrCreate()
)

# Echo the application name, followed by a separator line.
print("spark.sparkContext.appName:", spark.sparkContext.appName)
print("-" * 60)


# ==============================================================================
# PROBLEM 1: DataFrame Joins (Employee and Department)
# ==============================================================================
print("\nPROBLEM 1: DataFrame Joins")

# Employee rows: (emp_id, emp_name, dept_id). David has no department and Eve
# references dept_id 40, which does not appear in the department rows below.
emp_columns = ["emp_id", "emp_name", "dept_id"]
emp_data = [
    (1, "Alice", 10),
    (2, "Bob", 20),
    (3, "Charlie", 10),
    (4, "David", None),
    (5, "Eve", 40),
]
empDF = spark.createDataFrame(emp_data, emp_columns)

# Department rows: (dept_id, dept_name). Finance (30) has no employees above.
dept_columns = ["dept_id", "dept_name"]
dept_data = [
    (10, "Engineering"),
    (20, "Marketing"),
    (30, "Finance"),
]
deptDF = spark.createDataFrame(dept_data, dept_columns)

# a) left outer, b) full outer, c) inner — all joined on the shared dept_id key.
join_demos = [
    ("\na) Left Outer Join:", "left"),
    ("\nb) Full Outer Join:", "full"),
    ("\nc) Inner Join:", "inner"),
]
for heading, join_type in join_demos:
    print(heading)
    empDF.join(deptDF, on="dept_id", how=join_type).show()
print("-" * 60)


# ==============================================================================
# PROBLEM 2: Operations on Nested Schema DataFrame
# ==============================================================================
print("\nPROBLEM 2: Operations on Nested Schema DataFrame")

# Schema: `name` is a struct of three strings; every other field is a string.
name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])
schema_nested = StructType([
    StructField('name', name_struct),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('expenses', StringType(), True),
])

# Raw records as ((first, middle, last), dob, gender, expenses).
# NOTE(review): the trailing ';' in "James;" looks like a typo — confirm
# against the assignment data before changing it.
raw_people = [
    (("James;", "", "Smith"), "36636", "M", "20000"),
    (("Michael", "Rose", ""), "40288", "M", "40000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]
data_nested = [
    Row(name=Row(*full_name), dob=dob, gender=gender, expenses=expenses)
    for full_name, dob, gender, expenses in raw_people
]
df_nested = spark.createDataFrame(data_nested, schema_nested)

# Transformations, in order:
#   1. cast `expenses` from string to integer,
#   2. rename `dob` to `DateOfBirth`,
#   3. add `expense_multiplied` = (integer) expenses * 5.
final_df_nested = (
    df_nested
    .withColumn("expenses", col("expenses").cast(IntegerType()))
    .withColumnRenamed("dob", "DateOfBirth")
    .withColumn("expense_multiplied", col("expenses") * 5)
)

print("\nFinal DataFrame after transformations:")
final_df_nested.show(truncate=False)
final_df_nested.printSchema()
print("-" * 60)


# ==============================================================================
# PROBLEM 3: Operations on Nested Array DataFrame
# ==============================================================================
print("\nPROBLEM 3: Operations on Nested Array DataFrame")

# Each row is a name plus an array of [subject, score] string pairs.
schema_array = StructType([
    StructField("name", StringType()),
    StructField("subjects", ArrayType(ArrayType(StringType()))),
])
data_array = [
    ("Alice", [["Math", "90"], ["Science", "85"]]),
    ("Bob", [["History", "88"], ["English", "92"]]),
]
df_array = spark.createDataFrame(data_array, schema_array)
print("\nInitial Nested Array DataFrame:")
df_array.show(truncate=False)

# a) Flatten: collapse the array-of-arrays into one flat array per row.
flattened_df = df_array.select("*", flatten("subjects").alias("flat_subjects"))
print("\na) Flattened Array:")
flattened_df.show(truncate=False)

# b) Explode: one output row per inner [subject, score] array.
print("\nb) Exploded Array:")
exploded_view = df_array.select("*", explode("subjects").alias("exploded_subjects"))
exploded_view.show(truncate=False)

# c) Array to string: join the flattened elements with ", ".
print("\nc) Array converted to String:")
stringified_view = flattened_df.select(
    "*",
    concat_ws(", ", "flat_subjects").alias("subjects_string"),
)
stringified_view.show(truncate=False)
print("-" * 60)


# ==============================================================================
# PROBLEM 4: Date and Timestamp Operations
# ==============================================================================
print("\nPROBLEM 4: Date and Timestamp Operations")

# a) Create a DataFrame with today's date and timestamp.
# current_date() / current_timestamp() are evaluated afresh each time an
# action runs, so the two .show() calls below would otherwise print different
# timestamps. Collect the single row once and rebuild the DataFrame from it
# (same schema) so parts (a) and (b) describe the same instant.
live_time_df = spark.range(1).select(
    "id",
    current_date().alias("today_date"),
    current_timestamp().alias("current_ts"),
)
df_time = spark.createDataFrame(live_time_df.collect(), schema=live_time_df.schema)
print("\na) DataFrame with Current Date and Timestamp:")
df_time.show(truncate=False)

# b) Display the hours, minutes and seconds from the timestamp
print("\nb) Extracted Time Components:")
df_time.select(
    "*",
    hour("current_ts").alias("hour"),
    minute("current_ts").alias("minute"),
    second("current_ts").alias("second"),
).show()
print("-" * 60)


# ==============================================================================
# PROBLEM 5: DataFrame Aggregations
# ==============================================================================
print("\nPROBLEM 5: DataFrame Aggregations")

# a) Create DataFrame from (name, department, salary) records.
# The ("James", "Sales", 3000) record appears twice in the input.
schema_agg = ["name", "department", "salary"]
data_agg = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
    ("Jason", "Sales", 9000),
    ("Alice", "Finance", 3700),
    ("Jenniffer", "Finance", 8900),
    ("Jenson", "Marketing", 9000),
]
df_agg = spark.createDataFrame(data_agg, schema_agg)
print("\nInitial DataFrame for Aggregation:")
df_agg.show(5)  # preview only the first five rows

# b) Average salary over all rows
print("\nb) Average Salary:")
df_agg.agg(avg("salary").alias("average_salary")).show()

# c) Number of distinct department names
print("\nc) Unique Department Count:")
df_agg.agg(countDistinct("department").alias("unique_department_count")).show()

# d) Number of distinct salary values.
# NOTE(review): this counts distinct salary amounts, not employees whose
# salary is shared with nobody else — confirm which reading the task intends.
print("\nd) Unique Salary Count:")
df_agg.agg(countDistinct("salary").alias("unique_salary_count")).show()
print("-" * 60)


# ==============================================================================
# 6. STOP THE SPARK SESSION
# ==============================================================================
# Shut the session down once every problem above has run, then confirm it.
spark.stop()
print("\nSpark session stopped.")

spark.sparkContext.appName: AllInOnePySpark
------------------------------------------------------------

PROBLEM 1: DataFrame Joins

a) Left Outer Join:
+-------+------+--------+-----------+
|dept_id|emp_id|emp_name|  dept_name|
+-------+------+--------+-----------+
|     10|     1|   Alice|Engineering|
|     20|     2|     Bob|  Marketing|
|   NULL|     4|   David|       NULL|
|     10|     3| Charlie|Engineering|
|     40|     5|     Eve|       NULL|
+-------+------+--------+-----------+


b) Full Outer Join:
+-------+------+--------+-----------+
|dept_id|emp_id|emp_name|  dept_name|
+-------+------+--------+-----------+
|   NULL|     4|   David|       NULL|
|     10|     1|   Alice|Engineering|
|     10|     3| Charlie|Engineering|
|     20|     2|     Bob|  Marketing|
|     30|  NULL|    NULL|    Finance|
|     40|     5|     Eve|       NULL|
+-------+------+--------+-----------+


c) Inner Join:
+-------+------+--------+-----------+
|dept_id|emp_id|emp_name|  dept_name|
+-------+