pip install pyspark


In [12]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Key-Value Pair RDDs Exercise")
    .getOrCreate()
)

# The RDD operations below go through the session's SparkContext.
sc = spark.sparkContext


In [15]:
# Sales records as (product, amount) pairs; each product appears twice.
sales_data = [
    ("ProductA", 100), ("ProductB", 150), ("ProductA", 200),
    ("ProductC", 300), ("ProductB", 250), ("ProductC", 100),
]

# Distribute the records as a key-value pair RDD.
sales_rdd = sc.parallelize(sales_data)

# Preview the first three records only.
print("Sales Data RDD:", sales_rdd.take(3))


Sales Data RDD: [('ProductA', 100), ('ProductB', 150), ('ProductA', 200)]


In [17]:
# Gather all sale amounts that share the same product key.
grouped_rdd = sales_rdd.groupByKey()

# Show every product next to the list of its amounts; the grouped values
# are materialised with list() so they print as a plain list.
print("Grouped Data:")
for product, amounts in grouped_rdd.collect():
    print(f"{product}: {list(amounts)}")


Grouped Data:
ProductA: [100, 200]
ProductB: [150, 250]
ProductC: [300, 100]


In [18]:
# Add up the sale amounts of each product into one total per key.
total_sales_rdd = sales_rdd.reduceByKey(lambda left, right: left + right)

# Report one "<product>: <total>" line per product.
print("Total Sales by Product:")
for name, amount in total_sales_rdd.collect():
    print(f"{name}: {amount}")


Total Sales by Product:
ProductA: 300
ProductB: 400
ProductC: 400


In [22]:
# Rank products by total sales, highest first. The product name is used as a
# secondary key (ascending) so that products with equal totals always come
# out in the same order; sorting on the total alone does not pin down the
# relative order of ties.
sorted_sales_rdd = total_sales_rdd.sortBy(lambda x: (-x[1], x[0]))

# Printing the sorted list of products with their sales amounts
print("Sorted Products by Total Sales:")
for product, total_sales in sorted_sales_rdd.collect():
    print(f"{product}: {total_sales}")


Sorted Products by Total Sales:
ProductB: 400
ProductC: 400
ProductA: 300


In [23]:
# Keep only the products whose summed sales exceed 200.
high_sales_rdd = total_sales_rdd.filter(lambda record: 200 < record[1])

# List every product that passed the filter.
print("Products with Sales Greater Than 200:")
for name, amount in high_sales_rdd.collect():
    print(f"{name}: {amount}")


Products with Sales Greater Than 200:
ProductA: 300
ProductB: 400
ProductC: 400


In [24]:
# NOTE(review): this cell repeats the filter from the cell directly above and
# rebuilds the same high_sales_rdd from total_sales_rdd.
high_sales_rdd = total_sales_rdd.filter(lambda pair: pair[1] > 200)

# Print each product whose total is above 200.
print("Products with Sales Greater Than 200:")
for item, item_total in high_sales_rdd.collect():
    print(f"{item}: {item_total}")


Products with Sales Greater Than 200:
ProductA: 300
ProductB: 400
ProductC: 400


In [25]:
# Additional sales reported by a region, in the same (product, amount) layout.
regional_sales_data = [("ProductA", 50), ("ProductC", 150)]
regional_sales_rdd = sc.parallelize(regional_sales_data)

# Append the regional records to the original sales records ...
combined_rdd = sales_rdd.union(regional_sales_rdd)

# ... and total the combined records per product.
new_total_sales_rdd = combined_rdd.reduceByKey(lambda left, right: left + right)

# Report the per-product totals of the combined data.
print("Combined Sales Data:")
for name, amount in new_total_sales_rdd.collect():
    print(f"{name}: {amount}")


Combined Sales Data:
ProductA: 350
ProductC: 550
ProductB: 400


In [26]:
# Count how many different product names occur among the sales records:
# take the key of every pair, drop repeats, then count what is left.
distinct_products_count = sales_rdd.keys().distinct().count()

# Report the count.
print(f"Number of Distinct Products: {distinct_products_count}")


Number of Distinct Products: 3


In [29]:
# Product with the maximum total sales.
# Records are compared on (total, name), so when several products share the
# highest total the alphabetically last name wins. The previous pairwise
# reduce returned whichever tied record happened to be the right-hand operand,
# which made the winner depend on the order in which records were combined.
max_sales_product = total_sales_rdd.max(key=lambda x: (x[1], x[0]))

# Printing the product name and its total sales amount
print(f"Product with Maximum Sales: {max_sales_product[0]}, Total Sales: {max_sales_product[1]}")


Product with Maximum Sales: ProductC, Total Sales: 400


In [30]:
# Average sale amount per product.
# Step 1: turn every amount into (amount, 1) and add the pairs element-wise
# per product, giving (sum of amounts, number of sales) for each key.
product_count_rdd = (
    sales_rdd
    .mapValues(lambda amount: (amount, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Step 2: divide each product's sum by its number of sales.
average_sales_rdd = product_count_rdd.mapValues(
    lambda sum_and_count: sum_and_count[0] / sum_and_count[1]
)

# Report the averages with two decimal places.
print("Average Sales Per Product:")
for name, mean_amount in average_sales_rdd.collect():
    print(f"{name}: {mean_amount:.2f}")


Average Sales Per Product:
ProductA: 150.00
ProductB: 200.00
ProductC: 200.00
