In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Unit2Labs")
    .getOrCreate()
)

from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [None]:
# Load the numbers dataset; the first row is the header and column types
# are inferred from the data.
nums = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/numbers.csv")
)

In [None]:
# Split the numbers into even and odd values based on the remainder mod 2.
even = nums.where(nums["Value"] % 2 == 0)
odd = nums.where(nums["Value"] % 2 != 0)
print("Evens:")
even.show()
print("Odds:")
odd.show()

# Summary statistics over the whole column: max, min, sum and the average
# rounded to two decimals. An empty groupBy() aggregates over every row.
stats = nums.groupBy().agg(
    F.max("Value").alias("max_val"),
    F.min("Value").alias("min_val"),
    F.sum("Value").alias("sum_val"),
    F.round(F.avg("Value"), 2).alias("avg_val"),
)
stats.show()


Evens:
+-----+
|Value|
+-----+
|    2|
|    4|
|   20|
+-----+

Odds:
+-----+
|Value|
+-----+
|    3|
|   17|
|  121|
+-----+

+-------+-------+-------+-------+
|max_val|min_val|sum_val|avg_val|
+-------+-------+-------+-------+
|    121|      2|    167|  27.83|
+-------+-------+-------+-------+



In [None]:

# prime number check via UDF (suitable for teaching; not best for huge datasets)
from pyspark.sql.types import BooleanType
def is_prime(n):
    """Return True when ``n`` is a prime number, otherwise False.

    Primality is checked by trial division with odd candidates up to sqrt(n).

    Args:
        n: The value to test. ``None``, values below 2, NaN / infinite floats
            and non-integral floats (e.g. ``2.5``) are reported as not prime.
            An integral float such as ``17.0`` is treated like the int ``17``.

    Returns:
        bool: True if ``n`` is prime, False otherwise.
    """
    if n is None:
        return False
    # A column inferred as double reaches the UDF as a Python float. Without
    # this guard, 2.5 or NaN would slip past every divisibility check and be
    # reported as prime, and +inf would never leave the loop below.
    if isinstance(n, float):
        if not n.is_integer():  # is_integer() is False for NaN and +/-inf too
            return False
        n = int(n)
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # Only odd divisors up to sqrt(n) need to be tried.
    i = 3
    while i * i <= n:
        if n % i == 0:
            return False
        i += 2
    return True

# Wrap the Python predicate as a Spark UDF that yields a boolean column,
# then keep only the rows whose Value passes it.
is_prime_udf = F.udf(is_prime, returnType=BooleanType())
primes = nums.where(is_prime_udf(nums["Value"]))
print("Primes:")
primes.show()


Primes:
+-----+
|Value|
+-----+
|    2|
|    3|
|   17|
+-----+



In [None]:
# Same sum and even-count, computed through the low-level RDD API.
# Each Row is reduced to its bare "Value" field first.
rdd = nums.rdd.map(lambda record: record["Value"])
total_sum = rdd.reduce(lambda left, right: left + right)
even_count = rdd.filter(lambda value: (value % 2) == 0).count()
print("sum=", total_sum, "even_count=", even_count)



sum= 167 even_count= 3


In [None]:
# Load the people dataset; the first row is the header and column types
# are inferred from the data.
people = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/people.csv")
)

In [None]:
# 1. Categorize each person by age band: Minor (<18), Adult (18-59), Senior (>59).
# `when` clauses are evaluated in order, so the "Adult" clause is only reached
# once Age < 18 has failed. The Senior band is spelled out explicitly instead of
# using `.otherwise("Senior")`: a null Age fails every comparison, so with
# `otherwise` such rows would be mislabelled "Senior". Without it they receive a
# null Category.
people_cat = people.withColumn(
    "Category",
    F.when(F.col("Age") < 18, "Minor")
     .when(F.col("Age") <= 59, "Adult")
     .when(F.col("Age") > 59, "Senior")
)
people_cat.show()


+-----+---+--------+
| Name|Age|Category|
+-----+---+--------+
|Dhana| 29|   Adult|
| Ravi| 16|   Minor|
|  Anu| 64|  Senior|
|Meena| 45|   Adult|
|Kumar| 12|   Minor|
+-----+---+--------+



In [None]:
# 2. Count per category: number of people in each Category value.
people_cat.groupBy("Category").agg(F.count("*").alias("count")).show()


+--------+-----+
|Category|count|
+--------+-----+
|  Senior|    1|
|   Minor|    2|
|   Adult|    2|
+--------+-----+



In [None]:

# 3. Oldest and youngest person
# Sort by Age and keep the first row. A descending sort already places null
# ages last, but an ascending sort places them first by default, which would
# report a person with a missing Age as the youngest. asc_nulls_last avoids that.
oldest = people.orderBy(F.desc("Age")).limit(1)
youngest = people.orderBy(F.asc_nulls_last("Age")).limit(1)
print("Oldest:"); oldest.show()
print("Youngest:"); youngest.show()


Oldest:
+----+---+
|Name|Age|
+----+---+
| Anu| 64|
+----+---+

Youngest:
+-----+---+
| Name|Age|
+-----+---+
|Kumar| 12|
+-----+---+



In [None]:

# Load the sales dataset; the first row is the header and column types
# are inferred from the data.
sales = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sales.csv")
)


In [None]:

# 1. Revenue per product
# Row-level revenue is Quantity * Price; revenue and quantity are then summed
# for each (Product, Category) pair and shown from highest revenue down.
sales_with_rev = sales.withColumn("Revenue", sales["Quantity"] * sales["Price"])
rev_per_product = (
    sales_with_rev
    .groupBy("Product", "Category")
    .agg(
        F.sum("Revenue").alias("total_revenue"),
        F.sum("Quantity").alias("total_qty"),
    )
)
rev_per_product.sort(F.col("total_revenue").desc()).show()


+--------+------------+-------------+---------+
| Product|    Category|total_revenue|total_qty|
+--------+------------+-------------+---------+
|    Rice|   Groceries|      20000.0|      500|
| Shampoo|PersonalCare|       9600.0|       80|
|   Sugar|   Groceries|       2700.0|       60|
|    Soap|PersonalCare|       2460.0|      120|
|Notebook|  Stationery|        750.0|       30|
+--------+------------+-------------+---------+



In [None]:
# 2. Best-selling product and category (by quantity)
# Product: first row of the per-product totals when sorted by quantity, descending.
best_product = rev_per_product.sort(F.col("total_qty").desc()).limit(1)
best_product.show()

# Category: sum quantities per category, then keep the first row of the same sort.
best_category = (
    sales
    .groupBy("Category")
    .agg(F.sum("Quantity").alias("category_qty"))
    .sort(F.col("category_qty").desc())
    .limit(1)
)
best_category.show()



+-------+---------+-------------+---------+
|Product| Category|total_revenue|total_qty|
+-------+---------+-------------+---------+
|   Rice|Groceries|      20000.0|      500|
+-------+---------+-------------+---------+

+---------+------------+
| Category|category_qty|
+---------+------------+
|Groceries|         560|
+---------+------------+



In [None]:
# 3. Products with sales below 100 units (total quantity across dataset)
# Filters the per-product totals, so the threshold applies to summed quantity.
low_sales_products = rev_per_product.where(rev_per_product["total_qty"] < 100)
low_sales_products.show()


+--------+------------+-------------+---------+
| Product|    Category|total_revenue|total_qty|
+--------+------------+-------------+---------+
|Notebook|  Stationery|        750.0|       30|
|   Sugar|   Groceries|       2700.0|       60|
| Shampoo|PersonalCare|       9600.0|       80|
+--------+------------+-------------+---------+

