In [4]:
!apt-get install openjdk-11-jdk -qq > /dev/null

# Download Spark 3.5.1 built with Hadoop 3
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz

# Extract Spark
!tar -xzf spark-3.5.1-bin-hadoop3.tgz

# Install findspark helper
!pip install -q findspark

In [5]:
import os

# Tell downstream tooling where Java and Spark live.
# SPARK_HOME assumes the archive was extracted under /content — adjust if the
# working directory differs.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
    }
)

In [6]:
import findspark

# Run findspark before importing pyspark so the import below can resolve
# (presumably it locates the install via the SPARK_HOME variable set earlier).
findspark.init()

from pyspark.sql import SparkSession

# Build the session in two steps: configure the builder, then get or create it
session_builder = SparkSession.builder.appName("ColabSparkSetup")
spark = session_builder.getOrCreate()

# Sanity check that the session is live
print("✅ Spark version:", spark.version)

✅ Spark version: 3.5.1


In [8]:
# Practical 4: Spark Aggregate Operations

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, countDistinct, row_number
from pyspark.sql.window import Window

In [9]:
# Get (or create) the Spark session used by this practical
spark = (
    SparkSession.builder
    .appName("SparkAggregations")
    .getOrCreate()
)

# Sample records, one (name, subject, score) tuple per row
columns = ["name", "subject", "score"]
data = [
    ("Alice", "Math", 85),
    ("Bob", "Math", 90),
    ("Alice", "Physics", 95),
    ("Bob", "Physics", 80),
]

# Build the DataFrame with the column names above
df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

Original DataFrame:
+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Alice|   Math|   85|
|  Bob|   Math|   90|
|Alice|Physics|   95|
|  Bob|Physics|   80|
+-----+-------+-----+



In [10]:
# Average score per name

# Group the rows by name, then reduce each group to its mean score
rows_by_name = df.groupBy("name")
mean_score = avg("score").alias("average_score")
groupedDF = rows_by_name.agg(mean_score)

print("Grouped by name with average score:")
groupedDF.show()

Grouped by name with average score:
+-----+-------------+
| name|average_score|
+-----+-------------+
|  Bob|         85.0|
|Alice|         90.0|
+-----+-------------+



In [11]:
# Count how many distinct names appear in the DataFrame

# Name the aggregate expression first, then select it into a one-row result
distinct_names = countDistinct("name").alias("count_distinct_name")
distinctCountDF = df.select(distinct_names)

print("Count of distinct names:")
distinctCountDF.show()

Count of distinct names:
+-------------------+
|count_distinct_name|
+-------------------+
|                  2|
+-------------------+



In [12]:
# Add Row Number to DataFrame

# Order by (name, subject) rather than name alone: several rows share a name, and
# row_number() numbers tied rows in an arbitrary order, so the result could differ
# between runs. The extra key makes the ordering total for this data.
# NOTE: a window with no partitionBy moves every row into a single partition —
# acceptable for this small demo frame, costly on large data.
windowSpec = Window.orderBy("name", "subject")

# Attach a 1-based sequential number following the window ordering
dfWithRowNumber = df.withColumn("row_number", row_number().over(windowSpec))
print("DataFrame with row numbers:")
dfWithRowNumber.show()

DataFrame with row numbers:
+-----+-------+-----+----------+
| name|subject|score|row_number|
+-----+-------+-----+----------+
|Alice|   Math|   85|         1|
|Alice|Physics|   95|         2|
|  Bob|   Math|   90|         3|
|  Bob|Physics|   80|         4|
+-----+-------+-----+----------+



In [13]:
# Keep one row per name: the first row when each name's rows are sorted by score

# Number the rows within each name by ascending score
windowSpecFirst = Window.partitionBy("name").orderBy("score")
numbered_rows = df.withColumn("row_number", row_number().over(windowSpecFirst))

# Row 1 of each partition is the lowest-scoring row; the helper column is then removed
first_rows_only = numbered_rows.filter("row_number == 1")
firstRowDF = first_rows_only.drop("row_number")

print("First row of each group:")
firstRowDF.show()

First row of each group:
+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Alice|   Math|   85|
|  Bob|Physics|   80|
+-----+-------+-----+

