In [6]:
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -xzf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [7]:
import os
import findspark

# Point findspark at the Java and Spark installations before initialising it.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
})
findspark.init()

# pyspark is imported only after findspark.init() has run.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Practical5_PySpark")
spark = session_builder.getOrCreate()
print("✅ Spark version:", spark.version)

✅ Spark version: 3.5.1


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper, lower, length, avg, sum, array, struct, rank
from pyspark.sql.window import Window
import json
from datetime import datetime, date

In [18]:
# Practical 5: Spark SQL Joins, Spark SQL Schema, StructType & SQL Functions
# Aim A) Demonstrate the following operations: Spark SQL join, joining multiple DataFrames,
# inner join of two tables/DataFrames, self join, joining tables on multiple columns, converting a case class to a schema,
# creating an array-of-struct column, and flattening a nested column.


In [9]:
# Spark SQL Join
# Build two small DataFrames that share an "id" column, then join them on it.

employee_rows = [(1, "Alice"), (2, "Bob")]
department_rows = [(1, "HR"), (2, "Finance")]
df1 = spark.createDataFrame(employee_rows, schema=["id", "name"])
df2 = spark.createDataFrame(department_rows, schema=["id", "department"])

print("Spark SQL Join:")
df1.join(df2, on="id").show()

Spark SQL Join:
+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|   Finance|
+---+-----+----------+



In [11]:
# Join Multiple DataFrames
# Chain two joins on "id": df1 with df2 first, then the result with df3.

city_rows = [(1, "NY"), (2, "LA")]
df3 = spark.createDataFrame(city_rows, schema=["id", "city"])

print("Join Multiple DataFrames:")
name_with_department = df1.join(df2, on="id")
name_with_department.join(df3, on="id").show()

Join Multiple DataFrames:
+---+-----+----------+----+
| id| name|department|city|
+---+-----+----------+----+
|  1|Alice|        HR|  NY|
|  2|  Bob|   Finance|  LA|
+---+-----+----------+----+



In [12]:
# Inner Join
# Explicit inner join of df1 and df2 on the shared "id" column.

print("Inner Join:")
df1.join(other=df2, on=["id"], how="inner").show()

Inner Join:
+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|   Finance|
+---+-----+----------+



In [13]:
# Inner Join
# NOTE(review): this cell runs the same inner join as the cell before it. The
# aim comment lists a "Self join" that no cell shown here demonstrates;
# confirm whether this cell was meant to cover it.
print("Inner Join:")
inner_joined = df1.join(df2, on=["id"], how="inner")
inner_joined.show()

Inner Join:
+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|   Finance|
+---+-----+----------+



In [14]:
 #Join on Multiple Columns

# Two DataFrames that share both "id" and "department"; joining on the pair
# of columns keeps a single copy of each key column in the result.
staff_rows = [(1, "Alice", "HR"), (2, "Bob", "Finance")]
office_rows = [(1, "HR", "NY"), (2, "Finance", "LA")]
df4 = spark.createDataFrame(staff_rows, schema=["id", "name", "department"])
df5 = spark.createDataFrame(office_rows, schema=["id", "department", "city"])

print("Join on Multiple Columns:")
df4.join(df5, on=["id", "department"]).show()

Join on Multiple Columns:
+---+----------+-----+----+
| id|department| name|city|
+---+----------+-----+----+
|  1|        HR|Alice|  NY|
|  2|   Finance|  Bob|  LA|
+---+----------+-----+----+



In [15]:
# Schema / StructType equivalent
# Only column names are supplied here, so Spark infers the column types;
# printSchema() shows the resulting schema.

people_rows = [(1, "Alice", 30), (2, "Bob", 25)]
people = spark.createDataFrame(people_rows, schema=["id", "name", "age"])

print("Schema of DataFrame:")
people.printSchema()
people.show()

Schema of DataFrame:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 30|
|  2|  Bob| 25|
+---+-----+---+



In [16]:
# Array of Struct Column
# Pack each row's (id, name) pair into a struct and wrap it in a one-element
# array, stored in a new "details" column.

arrayStructDF = df1.withColumn("details", array(struct(col("id"), col("name"))))
# Label fixed: the original printed a stray leading "7".
print("Array of Struct Column:")
# truncate=False keeps the nested values fully visible in the printed table.
arrayStructDF.show(truncate=False)

7Array of Struct Column:
+---+-----+------------+
|id |name |details     |
+---+-----+------------+
|1  |Alice|[{1, Alice}]|
|2  |Bob  |[{2, Bob}]  |
+---+-----+------------+



In [17]:
# Flatten Nested Column
# Take the first struct out of the "details" array, then expand its fields
# into top-level columns.

first_detail = col("details").getItem(0).alias("detail")
nestedDF = arrayStructDF.select(first_detail)
flattenedDF = nestedDF.select("detail.*")

print("Flatten Nested Column:")
flattenedDF.show()

Flatten Nested Column:
+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



In [19]:
# Aim B) Demonstrate the following functions: date and time functions, string functions, array functions,
# map functions, aggregate functions, window functions, sort functions, and JSON functions.

In [28]:
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -xzf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [29]:
import os
import findspark

# Point findspark at the Java and Spark installations before initialising it.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
})
findspark.init()

# pyspark is imported only after findspark.init() has run.
from pyspark.sql import SparkSession

part_b_builder = SparkSession.builder.appName("Practical5_PartB")
spark = part_b_builder.getOrCreate()
print("Spark version:", spark.version)



Spark version: 3.5.1


In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum, rank
from pyspark.sql.window import Window
import json, builtins  # builtins gives access to normal Python sum/max


In [31]:
# Date and Time Functions
# The Part B import cell does not import `date` or `datetime`, so import them
# here; otherwise this cell only runs when the Part A imports were executed
# earlier in the same kernel.
from datetime import date, datetime

# These values come from Python's standard datetime module, evaluated locally.
print(" Date and Time Functions:")
print("Current Date:", date.today())
print("Current DateTime:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

 Date and Time Functions:
Current Date: 2025-10-04
Current DateTime: 2025-10-04 15:58:54


In [32]:
# String Functions
# The Part B import cell only imports avg, sum and rank, so the column helpers
# used below are imported here; otherwise this cell only runs when the Part A
# imports were executed earlier in the same kernel.
from pyspark.sql.functions import col, length, lower, upper

df_text = spark.createDataFrame([(1,"Hello World"),(2,"Apache Spark")], ["id","text"])
print("String Functions:")
# Show each text value next to its length and its upper/lower-cased forms.
df_text.select(
    col("text"),
    length(col("text")).alias("length"),
    upper(col("text")).alias("upper_case"),
    lower(col("text")).alias("lower_case")
).show()

String Functions:
+------------+------+------------+------------+
|        text|length|  upper_case|  lower_case|
+------------+------+------------+------------+
| Hello World|    11| HELLO WORLD| hello world|
|Apache Spark|    12|APACHE SPARK|apache spark|
+------------+------+------------+------------+



In [42]:
# Array Functions
# Plain Python list operations. `builtins.sum` is used explicitly because the
# name `sum` is rebound to the pyspark column function by the import cell.
arr = list(range(1, 6))
print(
    "Sum:", builtins.sum(arr),
    "Max:", builtins.max(arr),
    "Sorted:", sorted(arr),
)

Sum: 15 Max: 5 Sorted: [1, 2, 3, 4, 5]


In [41]:
# Map Functions
# Plain Python dict operations: build a mapping, add one entry, then list its
# keys and values.

m = dict(a=1, b=2, c=3)
m.update(d=4)
print("Keys:", [*m.keys()], "Values:", [*m.values()], "Updated Map:", m)

Keys: ['a', 'b', 'c', 'd'] Values: [1, 2, 3, 4] Updated Map: {'a': 1, 'b': 2, 'c': 3, 'd': 4}


In [40]:
# Aggregate Functions (Spark DataFrame)
# `avg` and `sum` here are the pyspark column functions from the import cell.

age_records = [("Alice", 34), ("Bob", 45), ("Alice", 29)]
df_age = spark.createDataFrame(age_records, schema=["name", "age"])

# One output row per name, with the mean and the total of that name's ages.
ages_by_name = df_age.groupBy("name")
ages_by_name.agg(
    avg("age").alias("average_age"),
    sum("age").alias("total_age"),
).show()

+-----+-----------+---------+
| name|average_age|total_age|
+-----+-----------+---------+
|Alice|       31.5|       63|
|  Bob|       45.0|       45|
+-----+-----------+---------+



In [38]:
# Window Functions
# Rank rows by age within each group of rows that share a name.

from pyspark.sql.functions import col

windowSpec = Window.partitionBy("name").orderBy(col("age"))
age_rank = rank().over(windowSpec)
df_age.withColumn("rank", age_rank).show()

+-----+---+----+
| name|age|rank|
+-----+---+----+
|Alice| 29|   1|
|Alice| 34|   2|
|  Bob| 45|   1|
+-----+---+----+



In [39]:
# Sort Functions
# Sort a copy so the original list keeps its order.
lst = [3, 1, 4, 1, 5, 9]
ascending_copy = lst.copy()
ascending_copy.sort()
print("Sorted List:", ascending_copy)

Sorted List: [1, 1, 3, 4, 5, 9]


In [None]:
# JSON Functions
# Parse a JSON object string with the standard json module and read two keys.
json_text = '{"name":"John","age":30}'
data = json.loads(json_text)
person_name, person_age = data["name"], data["age"]
print("Name:", person_name, "Age:", person_age)