In [1]:
# Echo the SparkSession as the cell output. It is not created in this notebook —
# presumably the notebook runtime injects it; TODO confirm for other environments.
spark

## Import libraries

In [2]:
import csv
import os

from pyspark.sql import Row

## RDD Operations

In [7]:
# Take the SparkContext from the session; every RDD in this notebook is created through it.
sc = spark.sparkContext
sc  # bare last expression so the context is shown as the cell output

#### RDD - 1. Load CSV into RDDs

In [8]:
# Locate the Contoso CSV files and open each one as an RDD of raw text lines.

contoso_dataset_path = "abfss://dlsfssynwformwtwfrctrl@dlsaccsynwformwtwfrctrl.dfs.core.windows.net/synapse/workspaces/synw-formwtw-frctrl/warehouse/dataset/contoso/"

# Resolve both file locations against the dataset folder in one pass.
product_path, sales_path = (
    os.path.join(contoso_dataset_path, csv_name)
    for csv_name in ("DimProduct.csv", "FactSales.csv")
)

# One RDD per file; the header line is still part of the data at this point.
rdd_products, rdd_sales = map(sc.textFile, (product_path, sales_path))

In [9]:
# Show the products RDD object itself as the cell output (not its rows).
rdd_products

In [10]:
# Show the sales RDD object itself as the cell output (not its rows).
rdd_sales

In [11]:
# Drop the header line of each file and parse every remaining line into a list of fields.
#
# Lines are parsed with the csv module rather than str.split(","), so a quoted
# field that contains a comma (typical for free-text columns) stays in one
# column and the positional indexes used in later cells are not shifted.
# NOTE(review): a quoted field that spans several physical lines is still cut
# apart by sc.textFile before it reaches this parser — confirm the files have none.
# NOTE(review): this cell rebinds rdd_products / rdd_sales, so it is not
# idempotent — re-run the load cell before running it a second time.

def parse_csv_line(line):
    """Split one line of CSV text into its fields, honouring double-quoted values.

    Args:
        line: a single line of CSV text without its line terminator.

    Returns:
        list of str: the field values in file order; an empty list for an empty line.
    """
    return next(csv.reader([line]), [])


header_p = rdd_products.first()
rdd_products = rdd_products.filter(lambda line: line != header_p).map(parse_csv_line)

header_s = rdd_sales.first()
rdd_sales = rdd_sales.filter(lambda line: line != header_s).map(parse_csv_line)

In [12]:
# Show the raw header line of the products file; use it to check the column positions used below.
header_p

In [13]:
# Show the raw header line of the sales file; use it to check the column positions used below.
header_s

#### RDD - 2. Inspect RDDs

In [14]:
# Fetch the first three parsed product records and print them.
product_preview = rdd_products.take(3)
print("Product sample:", product_preview)

In [15]:
# Count the product records (header already removed) and print the total.
n_products = rdd_products.count()
print("Total products:", n_products)

In [16]:
# Fetch the first three parsed sales records and print them.
sales_preview = rdd_sales.take(3)
print("Sales sample:", sales_preview)

In [17]:
# Count the sales records (header already removed) and print the total.
n_sales = rdd_sales.count()
print("Total sales records:", n_sales)

#### RDD - 3. Perform Map, Filter, Reduce Operations

In [18]:
# Project every product record down to a single column: its name.
# NOTE(review): position 2 is assumed to be the name column — confirm against header_p.
product_names = rdd_products.map(lambda record: record[2])
name_preview = product_names.take(5)
print("Sample names:", name_preview)

In [19]:
# Keep only the products priced above PRICE_THRESHOLD.
# NOTE(review): position 4 is assumed to be the price column — confirm against header_p.
PRICE_IDX = 4
PRICE_THRESHOLD = 1000.0


def is_expensive(fields, threshold=PRICE_THRESHOLD):
    """Tell whether a product record is priced above ``threshold``.

    Args:
        fields: one parsed product record (list of str).
        threshold: price a product must exceed to be kept.

    Returns:
        bool: True when the price parses as a number greater than ``threshold``.
        A record whose price is missing, blank or non-numeric yields False
        instead of raising, so one dirty row cannot fail the whole Spark job.
    """
    try:
        return float(fields[PRICE_IDX]) > threshold
    except (IndexError, ValueError):
        return False


expensive = rdd_products.filter(is_expensive)
# With the guarded predicate this action is safe to run, so the cell shows its result.
print("Expensive products:", expensive.take(5))

In [20]:
# Join products to sales on the product id and keep (product name, quantity) pairs.
# NOTE(review): the column positions are assumptions — confirm against header_p / header_s:
#   products: id at 0, name at 2    sales: product id at 2, quantity at 5
# keyBy(f) turns each record r into the pair (f(r), r), which is the shape join() needs.
prod_kv = rdd_products.keyBy(lambda f: f[0])
sales_kv = rdd_sales.keyBy(lambda f: f[2])

# join() yields (product_id, (product_record, sale_record)); reduce each match
# to (product_name, quantity converted to int).
prod_sales = prod_kv.join(sales_kv).map(lambda kv: (kv[1][0][2], int(kv[1][1][5])))
print("Product–Sales sample:", prod_sales.take(5))

In [21]:
# Add up the quantities of all pairs that share the same key (the product name).
total_per_prod = prod_sales.reduceByKey(lambda running, qty: running + qty)
totals_preview = total_per_prod.take(5)
print("Totals:", totals_preview)

#### RDD - 4. Advanced Transformation: Word Count from Product Descriptions

In [22]:
# Count how often each word occurs across all product descriptions (case-insensitive).
# NOTE(review): position 3 is assumed to be the description column — confirm against header_p.
# split() with no argument splits on any run of whitespace and never yields "",
# whereas split(" ") produced an empty token for every doubled, leading or
# trailing space — letting "" compete in the ranking below.
words = rdd_products.flatMap(lambda f: f[3].split())
word_counts = words.map(lambda w: (w.lower(), 1)).reduceByKey(lambda a, b: a + b)
# Negating the count makes takeOrdered return the ten most frequent words first.
print("Top words:", word_counts.takeOrdered(10, key=lambda kv: -kv[1]))

#### RDD - 5. Convert RDD to DataFrame & Use SQL

In [23]:
# Turn the per-product totals into a DataFrame and query it with Spark SQL.
# (Row is imported in the notebook's import cell.)

# Wrap each (name, total) pair in a Row so the DataFrame gets named columns.
row_rdd = total_per_prod.map(lambda kv: Row(product_name=kv[0], qty_sold=kv[1]))
df_totals = spark.createDataFrame(row_rdd)

# Register the DataFrame under a view name so it can be referenced from SQL,
# then list the ten products with the highest quantity sold.
df_totals.createOrReplaceTempView("prod_totals")
spark.sql("SELECT * FROM prod_totals ORDER BY qty_sold DESC").show(10)

#### ✅ Summary
- Loading raw CSVs into RDDs
- Transforming data with map, filter, reduceByKey, and flatMap
- Joining product and sales data via key-pairs
- Aggregating insights using RDD logic
- Converting to DataFrames for SQL or visualization

## FEATURES

#### 🧾 Feature Matrix


| Feature                            | 🟧 Databricks Serverless | 🟦 Databricks Standard Cluster | 🔷 Synapse Spark Pool | ⚫ Apache Spark (Standard Cluster) |
| ---------------------------------- | ------------------------ | ------------------------------ | --------------------- | --------------------------------- |
| **`sparkContext` access**          | ❌ Blocked                | ✅ Full Access                  | ✅ Full Access         | ✅ Full Access                     |
| **RDD creation (`parallelize`)**   | ❌ Not supported          | ✅ Yes                          | ✅ Yes                 | ✅ Yes                             |
| **JVM access (`_jvm`, broadcast)** | ❌ Not allowed            | ✅ Yes                          | ✅ Yes                 | ✅ Yes                             |
| **`df.cache()`** (memory caching)  | ❌ Ignored or fails       | ✅ Supported                    | ✅ Supported           | ✅ Supported                       |
| **`DataFrame` API**                | ✅ Recommended            | ✅ Recommended                  | ✅ Recommended         | ✅ Recommended                     |
| **Best use case**                  | Interactive SQL & BI     | Full Spark / ML workloads      | Data engineering + ML | Full control (custom Spark apps)  |
| **Resource control**               | Abstracted               | Full control                   | Semi-managed          | Full control                      |
| **Supports custom Spark configs**  | ❌ Very limited           | ✅ Fully customizable           | ⚠️ Limited            | ✅ Fully customizable              |
| **Cluster reuse across users**     | ✅ Auto scale & share     | ✅ Manual control               | ✅ Session-pool based  | ✅ Depends on setup                |
| **Interactive performance tuning** | ❌ Limited                | ✅ Yes                          | ⚠️ Limited            | ✅ Full control                    |
