In [0]:
# Sample data for categories and products
#Count the number of products in each category
from pyspark.sql.functions import count
data_category = [
    ("Electronics", "Laptop"),
    ("Electronics", "Phone"),
    ("Clothing", "T-Shirt"),
    ("Clothing", "Jeans"),
    ("Furniture", "Chair"),
]

columns_category = ["category", "product"]

# Creating the first DataFrame
df_category = spark.createDataFrame(data_category, columns_category)
df_category_c = df_category.groupby("category").agg(count("product").alias("product_count"))
df_category_c.show()

+-----------+-------------+
|   category|product_count|
+-----------+-------------+
|Electronics|            2|
|   Clothing|            2|
|  Furniture|            1|
+-----------+-------------+



In [0]:

#Calculate the minimum, maximum, and average price for each product
from pyspark.sql.functions import count,min,max,avg
data_price = [
    ("Laptop", 1000),
    ("Phone", 500),
    ("T-Shirt", 20),
    ("Jeans", 50),
    ("Chair", 150),
    ("Chair", 15)
]

columns_price = ["product", "price"]

# Creating the second DataFrame
df_price = spark.createDataFrame(data_price, columns_price)
df_price_agg = df_price.groupby("product").agg(min("price").alias("min_price")
                                          ,max("price").alias("mx_price")
                                          ,avg("price").alias("avg_price"))
df_price.show()
df_price_agg.show()

+-------+-----+
|product|price|
+-------+-----+
| Laptop| 1000|
|  Phone|  500|
|T-Shirt|   20|
|  Jeans|   50|
|  Chair|  150|
|  Chair|   15|
+-------+-----+

+-------+---------+--------+---------+
|product|min_price|mx_price|avg_price|
+-------+---------+--------+---------+
| Laptop|     1000|    1000|   1000.0|
|  Phone|      500|     500|    500.0|
|T-Shirt|       20|      20|     20.0|
|  Jeans|       50|      50|     50.0|
|  Chair|       15|     150|     82.5|
+-------+---------+--------+---------+



In [0]:

#Group sales by month and year, and calculate the total amount for each month-year
combination.
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, sum

# Initialize Spark session
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Sample data
sales_data = [
    ("2023-01-01", "New York", 100),
    ("2023-02-15", "London", 200),
    ("2023-03-10", "Paris", 300),
    ("2023-04-20", "Berlin", 400),
    ("2023-05-05", "Tokyo", 500),
]

columns = ["order_date", "city", "amount"]

# Create DataFrame
df_sales = spark.createDataFrame(sales_data, columns)
df_sales_month_year = df_sales.withColumn("Year",year("order_date")).withColumn("Month",month("order_date"))
df_sales_month_year_agg = df_sales_month_year.groupby("Year","Month").agg(sum("amount").alias("sum amounth"))
df_sales_month_year_agg.show()


+----+-----+-----------+
|Year|Month|sum amounth|
+----+-----+-----------+
|2023|    1|        100|
|2023|    2|        200|
|2023|    3|        300|
|2023|    4|        400|
|2023|    5|        500|
+----+-----+-----------+

+----------+--------+------+
|order_date|    city|amount|
+----------+--------+------+
|2023-01-01|New York|   100|
|2023-02-15|  London|   200|
|2023-03-10|   Paris|   300|
|2023-04-20|  Berlin|   400|
|2023-05-05|   Tokyo|   500|
+----------+--------+------+



In [0]:

#Find the top 5 products (by total quantity sold) across all orders
from pyspark.sql.functions import col, sum

# Sample data
product_data = [
    ("Laptop", "order_1", 2),
    ("Phone", "order_2", 1),
    ("T-Shirt", "order_1", 3),
    ("Jeans", "order_3", 4),
    ("Chair", "order_2", 2),
]

columns = ["product", "order_id", "quantity"]

# Create DataFrame
df_products = spark.createDataFrame(product_data, columns)
df_products_agg_top_3 = (
    df_products.groupBy("product")
    .agg(sum("quantity").alias("sum"))
    .orderBy(col("sum").desc())
    .limit(3)
)

# Show results
df_products_agg_top_3.show()

+-------+---+
|product|sum|
+-------+---+
|  Jeans|  4|
|T-Shirt|  3|
| Laptop|  2|
+-------+---+



In [0]:
# calculate the average rating given by each user
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Initialize Spark session
spark = SparkSession.builder.appName("AverageRatingPerUser").getOrCreate()

# Sample data
rating_data = [
    (1, 1, 4),
    (1, 2, 5),
    (2, 1, 3),
    (2, 3, 4),
    (3, 2, 5),
]

columns = ["user_id", "product_id", "rating"]

# Create DataFrame
df_ratings = spark.createDataFrame(rating_data, columns)
df_ratings_agg = df_ratings.groupBy("user_id").agg(avg("rating").alias("AVG R"))
df_ratings_agg.show()
df_ratings.show()

+-------+-----+
|user_id|AVG R|
+-------+-----+
|      1|  4.5|
|      2|  3.5|
|      3|  5.0|
+-------+-----+

+-------+----------+------+
|user_id|product_id|rating|
+-------+----------+------+
|      1|         1|     4|
|      1|         2|     5|
|      2|         1|     3|
|      2|         3|     4|
|      3|         2|     5|
+-------+----------+------+



In [0]:

#Group customers by country and calculate the total amount spent by customers in each country
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum

# Initialize Spark session
spark = SparkSession.builder.appName("TotalSpendByCountry").getOrCreate()

# Sample data
customer_data = [
    (1, "USA", "order_1", 100),
    (1, "USA", "order_2", 200),
    (2, "UK", "order_3", 150),
    (3, "France", "order_4", 250),
    (3, "France", "order_5", 300),
]

columns = ["customer_id", "country", "order_id", "amount"]

# Create DataFrame
df_customers = spark.createDataFrame(customer_data, columns)

# Group by country and calculate total amount spent
df_total_spend = df_customers.groupBy("country").agg(sum("amount").alias("total_spent"))

# Show results
df_total_spend.show()


+-------+-----------+
|country|total_spent|
+-------+-----------+
|    USA|        300|
|     UK|        150|
| France|        550|
+-------+-----------+



In [0]:
#Identify products that had no sales between 2023-02-01 and 2023-03-31.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("NoSalesProducts").getOrCreate()

# Sample data
sales_data = [
    ("Laptop", "2023-01-01"),
    ("Phone", "2023-02-15"),
    ("T-Shirt", "2023-03-10"),
    ("Jeans", "2023-04-20"),
]

columns = ["product", "order_date"]

# Create DataFrame
df_sales = spark.createDataFrame(sales_data, columns)

# Filter for sales within the date range
df_sales_in_range = df_sales.filter((col("order_date") >= "2023-02-01") & (col("order_date") <= "2023-03-31"))
df_sales_in_range.show()
# Find products with no sales in the specified range
df_no_sales = df_sales.join(df_sales_in_range, "product", "left_anti")
# Show results
df_no_sales.show()


+-------+----------+
|product|order_date|
+-------+----------+
|  Phone|2023-02-15|
|T-Shirt|2023-03-10|
+-------+----------+

+-------+----------+
|product|order_date|
+-------+----------+
| Laptop|2023-01-01|
|  Jeans|2023-04-20|
+-------+----------+



In [0]:

#Calculate Order Count per Customer and City
from pyspark.sql.functions import count

# Sample data
customer_data = [
    (1, "New York", "order_1"),
    (1, "New York", "order_2"),
    (2, "London", "order_3"),
    (3, "Paris", "order_4"),
]

columns = ["customer_id", "city", "order_id"]

# Create DataFrame
df_customers = spark.createDataFrame(customer_data, columns)

# Group by customer and city, and count the number of orders
df_order_count = df_customers.groupBy("customer_id", "city").agg(count("order_id").alias("order_count"))

# Show results
df_order_count.show()


+-----------+--------+-----------+
|customer_id|    city|order_count|
+-----------+--------+-----------+
|          1|New York|          2|
|          2|  London|          1|
|          3|   Paris|          1|
+-----------+--------+-----------+



In [0]:

#Group Orders by Weekday and Calculate Average Order Value (when-otherwise)
from pyspark.sql import SparkSession
from pyspark.sql.functions import dayofweek, when, avg, col

# Initialize Spark session
spark = SparkSession.builder.appName("WeekdayWeekendAverageOrderValue").getOrCreate()

# Sample data
order_data = [
    ("2023-04-10", 1, 100),  # Monday
    ("2023-04-11", 2, 200),  # Tuesday
    ("2023-04-12", 3, 300),  # Wednesday
    ("2023-04-13", 1, 400),  # Thursday
    ("2023-04-14", 2, 500),  # Friday
    ("2023-04-15", 3, 600),  # Saturday (Weekend)
    ("2023-04-16", 1, 700),  # Sunday (Weekend)
]

columns = ["order_date", "customer_id", "amount"]

# Create DataFrame
df_orders = spark.createDataFrame(order_data, columns)

# Add a weekday number column (1 = Sunday, ..., 7 = Saturday)
df_orders = df_orders.withColumn("weekday_number", dayofweek("order_date"))

# Use when-otherwise to classify as "Weekday" or "Weekend"
df_orders = df_orders.withColumn(
    "day_type",
    when(col("weekday_number").between(2, 6), "Weekday")  # Monday to Friday
    .otherwise("Weekend")  # Saturday and Sunday
)

# Group by "day_type" and calculate the average order value
df_avg_order_value = df_orders.groupBy("day_type").agg(avg("amount").alias("avg_order_value"))

# Show results
df_avg_order_value.show()


+--------+---------------+
|day_type|avg_order_value|
+--------+---------------+
| Weekday|          300.0|
| Weekend|          650.0|
+--------+---------------+



In [0]:
# Filter Products Starting with "T" and Group by Category with Average Price
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Initialize Spark session
spark = SparkSession.builder.appName("FilteredProductCategoryAveragePrice").getOrCreate()

# Sample data
product_data = [
    ("T-Shirt", "Clothing", 20),
    ("Table", "Furniture", 150),
    ("Jeans", "Clothing", 50),
    ("Chair", "Furniture", 100),
]

columns = ["product", "category", "price"]

# Create DataFrame
df_products = spark.createDataFrame(product_data, columns)

# Filter products starting with "T" and group by category
df_filtered_avg_price = df_products.filter(col("product").startswith("T")) \
                                   .groupBy("category") \
                                   .agg(avg("price").alias("avg_price"))

# Show results
df_filtered_avg_price.show()


+---------+---------+
| category|avg_price|
+---------+---------+
| Clothing|     20.0|
|Furniture|    150.0|
+---------+---------+



In [0]:
#Group customers by customer ID and calculate the total amount spent. Filter customers who spent more than $200 in total.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Initialize Spark session
spark = SparkSession.builder.appName("CustomerTotalSpend").getOrCreate()

# Sample data
order_data = [
    (1, "order_1", 100),
    (1, "order_2", 150),
    (2, "order_3", 250),
    (3, "order_4", 100),
    (3, "order_5", 120),
]

columns = ["customer_id", "order_id", "amount"]

# Create DataFrame
df_orders = spark.createDataFrame(order_data, columns)
total_200_g  = df_orders.groupBy("customer_id").agg(sum("amount").alias("total")).filter(col("total")>220)
total_200_g.show()

+-----------+-----+
|customer_id|total|
+-----------+-----+
|          1|  250|
|          2|  250|
+-----------+-----+



In [0]:
#Create a New Column with Order Status ("High" for > $100, "Low" Otherwise)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize Spark session
spark = SparkSession.builder.appName("OrderStatus").getOrCreate()

# Sample data
order_data = [
    ("order_1", 150),
    ("order_2", 80),
    ("order_3", 220),
    ("order_4", 50),
]

columns = ["order_id", "amount"]

# Create DataFrame
df_orders = spark.createDataFrame(order_data, columns)

# Create a new column "order_status" based on the "amount"
df_orders_with_status = df_orders.withColumn(
    "order_status",
    when(col("amount") > 100, "High").otherwise("Low")
)
# Show results
df_orders_with_status.show()


+--------+------+------------+
|order_id|amount|order_status|
+--------+------+------------+
| order_1|   150|        High|
| order_2|    80|         Low|
| order_3|   220|        High|
| order_4|    50|         Low|
+--------+------+------------+

