In [7]:
pip install pyspark



# **E-COMMERCE TRANSACTIONS**

In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, max, count, expr,min

# Initialize SparkSession
spark = SparkSession.builder.appName("E-commerce Transactions").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, 101, "Laptop", "Electronics", 1000, 1, 10, "2023-08-01"),
    (2, 102, "Smartphone", "Electronics", 700, 2, 5, "2023-08-01"),
    (3, 103, "Shirt", "Fashion", 40, 3, 0, "2023-08-02"),
    (4, 104, "Blender", "Home Appliance", 150, 1, 15, "2023-08-03"),
    (5, 101, "Headphones", "Electronics", 100, 2, 10, "2023-08-03"),
    (6, 105, "Shoes", "Fashion", 60, 1, 20, "2023-08-04"),
    (7, 106, "Refrigerator", "Home Appliance", 800, 1, 25, "2023-08-05"),
    (8, 107, "Book", "Books", 20, 4, 0, "2023-08-05"),
    (9, 108, "Toaster", "Home Appliance", 30, 1, 5, "2023-08-06"),
    (10, 102, "Tablet", "Electronics", 300, 2, 10, "2023-08-06")
]
columns = ["transaction_id", "customer_id", "product", "category", "price", "quantity", "discount_percentage", "transaction_date"]

# Create DataFrame
ecommerce_df = spark.createDataFrame(data, columns)
ecommerce_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|   

In [87]:
# 1. Calculate the Total Revenue per Category
total_revenue_per_category = ecommerce_df.withColumn(
    "revenue", col("price") * col("quantity") * (1 - col("discount_percentage") / 100)
).groupBy("category").agg(sum("revenue").alias("total_revenue"))
total_revenue_per_category.show()

+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|        168.0|
|   Electronics|       2950.0|
|Home Appliance|        756.0|
|         Books|         80.0|
+--------------+-------------+



In [88]:
# 2. Filter Transactions with a Discount Greater Than 10%
discount_greater_than_10 = ecommerce_df.filter(col("discount_percentage") > 10)
discount_greater_than_10.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+



In [89]:
# 3. Find the Most Expensive Product Sold
most_expensive_product = ecommerce_df.orderBy(col("price").desc()).limit(1)
most_expensive_product.show()

+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|product|   category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+
|             1|        101| Laptop|Electronics| 1000|       1|                 10|      2023-08-01|
+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+



In [90]:
# 4. Calculate the Average Quantity of Products Sold per Category
avg_quantity_per_category = ecommerce_df.groupBy("category").agg(avg("quantity").alias("avg_quantity"))
avg_quantity_per_category.show()

+--------------+------------+
|      category|avg_quantity|
+--------------+------------+
|       Fashion|         2.0|
|   Electronics|        1.75|
|Home Appliance|         1.0|
|         Books|         4.0|
+--------------+------------+



In [91]:
# 5. Identify Customers Who Purchased More Than One Product
multiple_products_customers = ecommerce_df.groupBy("customer_id", "transaction_id").agg(sum("quantity").alias("total_quantity")).filter(col("total_quantity") > 1)
multiple_products_customers.show()

+-----------+--------------+--------------+
|customer_id|transaction_id|total_quantity|
+-----------+--------------+--------------+
|        101|             5|             2|
|        102|             2|             2|
|        103|             3|             3|
|        107|             8|             4|
|        102|            10|             2|
+-----------+--------------+--------------+



In [92]:
# 6. Find the Top 3 Highest Revenue Transactions
top_3_revenue_transactions = ecommerce_df.withColumn(
    "revenue", col("price") * col("quantity") * (1 - col("discount_percentage") / 100)
).groupBy("transaction_id").agg(sum("revenue").alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(3)
top_3_revenue_transactions.show()

+--------------+-------------+
|transaction_id|total_revenue|
+--------------+-------------+
|             2|       1330.0|
|             1|        900.0|
|             7|        600.0|
+--------------+-------------+



In [93]:
# 7. Calculate the Total Number of Transactions per Day
total_transactions_per_day = ecommerce_df.groupBy("transaction_date").agg(count("transaction_id").alias("total_transactions"))
total_transactions_per_day.show()

+----------------+------------------+
|transaction_date|total_transactions|
+----------------+------------------+
|      2023-08-01|                 2|
|      2023-08-02|                 1|
|      2023-08-03|                 2|
|      2023-08-06|                 2|
|      2023-08-04|                 1|
|      2023-08-05|                 2|
+----------------+------------------+



In [94]:
# 8. Find the Customer Who Spent the Most Money
customer_spending = ecommerce_df.withColumn(
    "revenue", col("price") * col("quantity") * (1 - col("discount_percentage") / 100)
).groupBy("customer_id").agg(sum("revenue").alias("total_spent")).orderBy(col("total_spent").desc()).limit(1)
customer_spending.show()

+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|        102|     1870.0|
+-----------+-----------+



In [95]:
# 9. Calculate the Average Discount Given per Product Category
avg_discount_per_category = ecommerce_df.groupBy("category").agg(avg("discount_percentage").alias("avg_discount"))
avg_discount_per_category.show()

+--------------+------------+
|      category|avg_discount|
+--------------+------------+
|       Fashion|        10.0|
|   Electronics|        8.75|
|Home Appliance|        15.0|
|         Books|         0.0|
+--------------+------------+



In [96]:
# 10. Create a New Column for Final Price After Discount
ecommerce_df = ecommerce_df.withColumn(
    "final_price", col("price") * (1 - col("discount_percentage") / 100))
ecommerce_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|final_price|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|      900.0|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|      665.0|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|       40.0|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|      127.5|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      2023-08-03|       90.0|
|             6|        105|    

# **BANKING TRANSACTIONS**

In [9]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Banking Transactions").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, 201, "Deposit", 5000, "2023-09-01"),
    (2, 202, "Withdrawal", 2000, "2023-09-01"),
    (3, 203, "Deposit", 3000, "2023-09-02"),
    (4, 201, "Withdrawal", 1500, "2023-09-02"),
    (5, 204, "Deposit", 10000, "2023-09-03"),
    (6, 205, "Withdrawal", 500, "2023-09-03"),
    (7, 202, "Deposit", 2500, "2023-09-04"),
    (8, 206, "Withdrawal", 700, "2023-09-04"),
    (9, 203, "Deposit", 4000, "2023-09-05"),
    (10, 204, "Withdrawal", 3000, "2023-09-05")
]
columns = ["transaction_id", "customer_id", "transaction_type", "amount", "transaction_date"]

# Create DataFrame
banking_df = spark.createDataFrame(data, columns)
banking_df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             6|        205|      Withdrawal|   500|      2023-09-03|
|             7|        202|         Deposit|  2500|      2023-09-04|
|             8|        206|      Withdrawal|   700|      2023-09-04|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [10]:
# 1. Calculate the Total Deposit and Withdrawal Amounts
total_amounts = banking_df.groupBy("transaction_type").agg(sum("amount").alias("total_amount"))
total_amounts.show()

+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+



In [11]:
# 2. Filter Transactions Greater Than $3,000
transactions_gt_3000 = banking_df.filter(col("amount") > 3000)
transactions_gt_3000.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [12]:
# 3. Find the Largest Deposit Made
largest_deposit = banking_df.filter(col("transaction_type") == "Deposit").orderBy(col("amount").desc()).limit(1)
largest_deposit.show()


+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             5|        204|         Deposit| 10000|      2023-09-03|
+--------------+-----------+----------------+------+----------------+



In [13]:
# 4. Calculate the Average Transaction Amount for Each Transaction Type
avg_amount_per_type = banking_df.groupBy("transaction_type").agg(avg("amount").alias("avg_amount"))
avg_amount_per_type.show()

+----------------+----------+
|transaction_type|avg_amount|
+----------------+----------+
|         Deposit|    4900.0|
|      Withdrawal|    1540.0|
+----------------+----------+



In [14]:
# 5. Find Customers Who Made Both Deposits and Withdrawals
deposit_withdrawal_customers = banking_df.groupBy("customer_id").agg(
    sum(expr("CASE WHEN transaction_type = 'Deposit' THEN 1 ELSE 0 END")).alias("deposits"),
    sum(expr("CASE WHEN transaction_type = 'Withdrawal' THEN 1 ELSE 0 END")).alias("withdrawals")
).filter((col("deposits") > 0) & (col("withdrawals") > 0))
deposit_withdrawal_customers.show()

+-----------+--------+-----------+
|customer_id|deposits|withdrawals|
+-----------+--------+-----------+
|        202|       1|          1|
|        201|       1|          1|
|        204|       1|          1|
+-----------+--------+-----------+



In [15]:
# 6. Calculate the Total Amount of Transactions per Day
total_amount_per_day = banking_df.groupBy("transaction_date").agg(sum("amount").alias("total_amount"))
total_amount_per_day.show()

+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-01|        7000|
|      2023-09-02|        4500|
|      2023-09-03|       10500|
|      2023-09-05|        7000|
|      2023-09-04|        3200|
+----------------+------------+



In [16]:
# 7. Find the Customer with the Highest Total Withdrawal
highest_withdrawal_customer = banking_df.filter(col("transaction_type") == "Withdrawal").groupBy("customer_id").agg(sum("amount").alias("total_withdrawn")).orderBy(col("total_withdrawn").desc()).limit(1)
highest_withdrawal_customer.show()

+-----------+---------------+
|customer_id|total_withdrawn|
+-----------+---------------+
|        204|           3000|
+-----------+---------------+



In [17]:
# 8. Calculate the Number of Transactions for Each Customer
transactions_per_customer = banking_df.groupBy("customer_id").agg(count("transaction_id").alias("num_transactions"))
transactions_per_customer.show()

+-----------+----------------+
|customer_id|num_transactions|
+-----------+----------------+
|        202|               2|
|        201|               2|
|        203|               2|
|        204|               2|
|        205|               1|
|        206|               1|
+-----------+----------------+



In [25]:
# 9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater Than $1,000
withdrawal_gt_1000_dates = banking_df.filter((col("transaction_type") == "Withdrawal") & (col("amount") > 1000))
transactions_same_day = banking_df.join(withdrawal_gt_1000_dates, on="transaction_date")
transactions_same_day.show()

+----------------+--------------+-----------+----------------+------+--------------+-----------+----------------+------+
|transaction_date|transaction_id|customer_id|transaction_type|amount|transaction_id|customer_id|transaction_type|amount|
+----------------+--------------+-----------+----------------+------+--------------+-----------+----------------+------+
|      2023-09-01|             1|        201|         Deposit|  5000|             2|        202|      Withdrawal|  2000|
|      2023-09-01|             2|        202|      Withdrawal|  2000|             2|        202|      Withdrawal|  2000|
|      2023-09-02|             3|        203|         Deposit|  3000|             4|        201|      Withdrawal|  1500|
|      2023-09-02|             4|        201|      Withdrawal|  1500|             4|        201|      Withdrawal|  1500|
|      2023-09-05|             9|        203|         Deposit|  4000|            10|        204|      Withdrawal|  3000|
|      2023-09-05|            10

In [26]:
# 10. Create a New Column to Classify Transactions as "High" or "Low" Value
banking_df = banking_df.withColumn("transaction_value", expr("CASE WHEN amount > 5000 THEN 'High' ELSE 'Low' END"))
banking_df.show()

+--------------+-----------+----------------+------+----------------+-----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_value|
+--------------+-----------+----------------+------+----------------+-----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|              Low|
|             2|        202|      Withdrawal|  2000|      2023-09-01|              Low|
|             3|        203|         Deposit|  3000|      2023-09-02|              Low|
|             4|        201|      Withdrawal|  1500|      2023-09-02|              Low|
|             5|        204|         Deposit| 10000|      2023-09-03|             High|
|             6|        205|      Withdrawal|   500|      2023-09-03|              Low|
|             7|        202|         Deposit|  2500|      2023-09-04|              Low|
|             8|        206|      Withdrawal|   700|      2023-09-04|              Low|
|             9|        203|    

# **HEALTH & FITNESS TRACKER DATA**

In [28]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Health & Fitness Tracker").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, "2023-09-01", 12000, 500, 7.0, "Cardio"),
    (2, "2023-09-01", 8000, 400, 6.5, "Strength"),
    (3, "2023-09-01", 15000, 650, 8.0, "Yoga"),
    (1, "2023-09-02", 10000, 450, 6.0, "Cardio"),
    (2, "2023-09-02", 9500, 500, 7.0, "Cardio"),
    (3, "2023-09-02", 14000, 600, 7.5, "Strength"),
    (1, "2023-09-03", 13000, 550, 8.0, "Yoga"),
    (2, "2023-09-03", 12000, 520, 6.5, "Yoga"),
    (3, "2023-09-03", 16000, 700, 7.0, "Cardio")
]
columns = ["user_id", "date", "steps", "calories_burned", "hours_of_sleep", "workout_type"]

# Create DataFrame
fitness_df = spark.createDataFrame(data, columns)
fitness_df.show()


+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [29]:
# 1. Find the Total Steps Taken by Each User
total_steps_per_user = fitness_df.groupBy("user_id").agg(sum("steps").alias("total_steps"))
total_steps_per_user.show()

+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      35000|
|      3|      45000|
|      2|      29500|
+-------+-----------+



In [30]:
# 2. Filter Days with More Than 10,000 Steps
days_gt_10000_steps = fitness_df.filter(col("steps") > 10000)
days_gt_10000_steps.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [31]:
# 3. Calculate the Average Calories Burned by Workout Type
avg_calories_per_workout = fitness_df.groupBy("workout_type").agg(avg("calories_burned").alias("avg_calories"))
avg_calories_per_workout.show()

+------------+-----------------+
|workout_type|     avg_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|            537.5|
+------------+-----------------+



In [35]:
# 4. Identify the Day with the Most Steps for Each User
max_steps_per_user = fitness_df.groupBy("user_id").agg(max("steps").alias("steps"))
day_with_max_steps = fitness_df.join(max_steps_per_user ,on=["user_id", "steps"])
day_with_max_steps.show()

+-------+-----+----------+---------------+--------------+------------+
|user_id|steps|      date|calories_burned|hours_of_sleep|workout_type|
+-------+-----+----------+---------------+--------------+------------+
|      3|16000|2023-09-03|            700|           7.0|      Cardio|
|      1|13000|2023-09-03|            550|           8.0|        Yoga|
|      2|12000|2023-09-03|            520|           6.5|        Yoga|
+-------+-----+----------+---------------+--------------+------------+



In [36]:
# 5. Find Users Who Burned More Than 600 Calories on Any Day
users_burned_gt_600_calories = fitness_df.filter(col("calories_burned") > 600).select("user_id").distinct()
users_burned_gt_600_calories.show()

+-------+
|user_id|
+-------+
|      3|
+-------+



In [37]:
# 6. Calculate the Average Hours of Sleep per User
avg_sleep_per_user = fitness_df.groupBy("user_id").agg(avg("hours_of_sleep").alias("avg_sleep"))
avg_sleep_per_user.show()

+-------+-----------------+
|user_id|        avg_sleep|
+-------+-----------------+
|      1|              7.0|
|      3|              7.5|
|      2|6.666666666666667|
+-------+-----------------+



In [38]:
# 7. Find the Total Calories Burned per Day
total_calories_per_day = fitness_df.groupBy("date").agg(sum("calories_burned").alias("total_calories"))
total_calories_per_day.show()

+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-09-01|          1550|
|2023-09-02|          1550|
|2023-09-03|          1770|
+----------+--------------+



In [39]:
# 8. Identify Users Who Did Different Types of Workouts
users_diff_workouts = fitness_df.groupBy("user_id").agg(count("workout_type").alias("workout_types_count")).filter(col("workout_types_count") > 1)
users_diff_workouts.show()

+-------+-------------------+
|user_id|workout_types_count|
+-------+-------------------+
|      1|                  3|
|      3|                  3|
|      2|                  3|
+-------+-------------------+



In [40]:
# 9. Calculate the Total Number of Workouts per User
total_workouts_per_user = fitness_df.groupBy("user_id").count().alias("total_workouts")
total_workouts_per_user.show()

+-------+-----+
|user_id|count|
+-------+-----+
|      1|    3|
|      3|    3|
|      2|    3|
+-------+-----+



In [41]:
# 10. Create a New Column for "Active" Days
fitness_df = fitness_df.withColumn("Active_day", expr("CASE WHEN steps > 10000 THEN 'Active' ELSE 'Inactive' END"))
fitness_df.show()

+-------+----------+-----+---------------+--------------+------------+----------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|Active_day|
+-------+----------+-----+---------------+--------------+------------+----------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|    Active|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|  Inactive|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|    Active|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|  Inactive|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|  Inactive|
|      3|2023-09-02|14000|            600|           7.5|    Strength|    Active|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|    Active|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|    Active|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|    Active|
+-------+-------

# **MUSIC STREAMING DATA**

In [42]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Music Streaming Data").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, "Blinding Lights", "The Weeknd", 200, "2023-09-01 08:15:00", "New York"),
    (2, "Shape of You", "Ed Sheeran", 240, "2023-09-01 09:20:00", "Los Angeles"),
    (3, "Levitating", "Dua Lipa", 180, "2023-09-01 10:30:00", "London"),
    (1, "Starboy", "The Weeknd", 220, "2023-09-01 11:00:00", "New York"),
    (2, "Perfect", "Ed Sheeran", 250, "2023-09-01 12:15:00", "Los Angeles"),
    (3, "Don't Start Now", "Dua Lipa", 200, "2023-09-02 08:10:00", "London"),
    (1, "Save Your Tears", "The Weeknd", 210, "2023-09-02 09:00:00", "New York"),
    (2, "Galway Girl", "Ed Sheeran", 190, "2023-09-02 10:00:00", "Los Angeles"),
    (3, "New Rules", "Dua Lipa", 230, "2023-09-02 11:00:00", "London")
]
columns = ["user_id", "song_title", "artist", "duration_seconds", "streaming_time", "location"]

# Create DataFrame
music_df = spark.createDataFrame(data, columns)
music_df.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2

In [43]:
# 1. Calculate the Total Listening Time for Each User
total_listening_time_per_user = music_df.groupBy("user_id").agg(sum("duration_seconds").alias("total_listening_time"))
total_listening_time_per_user.show()

+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+



In [44]:
# 2. Filter Songs Streamed for More Than 200 Seconds
songs_gt_200_seconds = music_df.filter(col("duration_seconds") > 200)
songs_gt_200_seconds.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+



In [45]:
# 3. Find the Most Popular Artist
most_popular_artist = music_df.groupBy("artist").agg(count("song_title").alias("total_streams")).orderBy(col("total_streams").desc()).limit(1)
most_popular_artist.show()

+--------+-------------+
|  artist|total_streams|
+--------+-------------+
|Dua Lipa|            3|
+--------+-------------+



In [46]:
# 4. Identify the Song with the Longest Duration
longest_duration_song = music_df.orderBy(col("duration_seconds").desc()).limit(1)
longest_duration_song.show()

+-------+----------+----------+----------------+-------------------+-----------+
|user_id|song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+----------+----------+----------------+-------------------+-----------+
|      2|   Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
+-------+----------+----------+----------------+-------------------+-----------+



In [47]:
# 5. Calculate the Average Song Duration by Artist
avg_duration_per_artist = music_df.groupBy("artist").agg(avg("duration_seconds").alias("avg_duration"))
avg_duration_per_artist.show()

+----------+------------------+
|    artist|      avg_duration|
+----------+------------------+
|  Dua Lipa|203.33333333333334|
|Ed Sheeran|226.66666666666666|
|The Weeknd|             210.0|
+----------+------------------+



In [48]:
# 6. Find the Top 3 Most Streamed Songs per User
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

window_fun = Window.partitionBy("user_id").orderBy(col("count").desc())
top_songs_per_user = music_df.groupBy("user_id", "song_title").agg(count("song_title").alias("count")).withColumn("rank", rank().over(window_fun)).filter(col("rank") <= 3)
top_songs_per_user.show()

+-------+---------------+-----+----+
|user_id|     song_title|count|rank|
+-------+---------------+-----+----+
|      1|Blinding Lights|    1|   1|
|      1|        Starboy|    1|   1|
|      1|Save Your Tears|    1|   1|
|      2|   Shape of You|    1|   1|
|      2|    Galway Girl|    1|   1|
|      2|        Perfect|    1|   1|
|      3|     Levitating|    1|   1|
|      3|      New Rules|    1|   1|
|      3|Don't Start Now|    1|   1|
+-------+---------------+-----+----+



In [49]:
# 7. Calculate the Total Number of Streams per Day
from pyspark.sql.functions import dayofmonth

total_streams_per_day = music_df.groupBy(dayofmonth("streaming_time").alias("date")).agg(count("song_title").alias("total_streams"))
total_streams_per_day.show()

+----+-------------+
|date|total_streams|
+----+-------------+
|   1|            5|
|   2|            4|
+----+-------------+



In [50]:
# 8. Identify Users Who Streamed Songs from More Than One Artist
users_diff_artists = music_df.groupBy("user_id").agg(count("artist").alias("unique_artists")).filter(col("unique_artists") > 1)
users_diff_artists.show()

+-------+--------------+
|user_id|unique_artists|
+-------+--------------+
|      1|             3|
|      3|             3|
|      2|             3|
+-------+--------------+



In [51]:
# 9. Calculate the Total Streams for Each Location
total_streams_per_location = music_df.groupBy("location").agg(count("song_title").alias("total_streams"))
total_streams_per_location.show()

+-----------+-------------+
|   location|total_streams|
+-----------+-------------+
|Los Angeles|            3|
|     London|            3|
|   New York|            3|
+-----------+-------------+



In [52]:
# 10. Create a New Column to Classify Long and Short Songs
music_df = music_df.withColumn("song_length", expr("CASE WHEN duration_seconds > 200 THEN 'Long' ELSE 'Short' END"))
music_df.show()

+-------+---------------+----------+----------------+-------------------+-----------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|song_length|
+-------+---------------+----------+----------------+-------------------+-----------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|      Short|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|       Long|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|      Short|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|       Long|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|       Long|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|      Short|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|       Long|
|      2|    Galway 

# **RETAIL STORE SALES DATA**

In [53]:
# Initialize SparkSession
spark = SparkSession.builder.appName("Retail Store Sales Data").getOrCreate()

# Sample Data with Alternate Column Names
data = [
    (1, "Apple", "Groceries", 0.50, 10, "2023-09-01"),
    (2, "T-shirt", "Clothing", 15.00, 2, "2023-09-01"),
    (3, "Notebook", "Stationery", 2.00, 5, "2023-09-02"),
    (4, "Banana", "Groceries", 0.30, 12, "2023-09-02"),
    (5, "Laptop", "Electronics", 800.00, 1, "2023-09-03"),
    (6, "Pants", "Clothing", 25.00, 3, "2023-09-03"),
    (7, "Headphones", "Electronics", 100.00, 2, "2023-09-04"),
    (8, "Pen", "Stationery", 1.00, 10, "2023-09-04"),
    (9, "Orange", "Groceries", 0.60, 8, "2023-09-05"),
    (10, "Sneakers", "Clothing", 50.00, 1, "2023-09-05")
]
columns = ["transaction_id", "product_name", "category", "price", "quantity", "sales_date"]

# Create DataFrame
retail_df = spark.createDataFrame(data, columns)
retail_df.show()

+--------------+------------+-----------+-----+--------+----------+
|transaction_id|product_name|   category|price|quantity|sales_date|
+--------------+------------+-----------+-----+--------+----------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|
|             8|         Pen| Stationery|  1.0|      10|2023-09-04|
|             9|      Orange|  Groceries|  0.6|       8|2023-09-05|
|            10|    Sneakers|   Clothing| 50.0|       1|2023-09-05|
+--------------+------------+-----------+-----+--------+----------+



In [54]:
# 1. Calculate the Total Revenue per Category
retail_df = retail_df.withColumn("total_revenue", col("price") * col("quantity"))
total_revenue_per_category = retail_df.groupBy("category").agg(sum("total_revenue").alias("total_revenue"))
total_revenue_per_category.show()

+-----------+------------------+
|   category|     total_revenue|
+-----------+------------------+
| Stationery|              20.0|
|  Groceries|13.399999999999999|
|Electronics|            1000.0|
|   Clothing|             155.0|
+-----------+------------------+



In [55]:
# 2. Filter Transactions Where the Total Sales Amount is Greater Than $100
retail_df = retail_df.withColumn("total_sales_amount", col("price") * col("quantity"))
transactions_gt_100 = retail_df.filter(col("total_sales_amount") > 100)
transactions_gt_100.show()

+--------------+------------+-----------+-----+--------+----------+-------------+------------------+
|transaction_id|product_name|   category|price|quantity|sales_date|total_revenue|total_sales_amount|
+--------------+------------+-----------+-----+--------+----------+-------------+------------------+
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|        800.0|             800.0|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|        200.0|             200.0|
+--------------+------------+-----------+-----+--------+----------+-------------+------------------+



In [56]:
# 3. Find the Most Sold Product
most_sold_product = retail_df.groupBy("product_name").agg(sum("quantity").alias("total_quantity")).orderBy(col("total_quantity").desc()).limit(1)
most_sold_product.show()

+------------+--------------+
|product_name|total_quantity|
+------------+--------------+
|      Banana|            12|
+------------+--------------+



In [58]:
# 4. Calculate the Average Price per Product Category
avg_price_per_category = retail_df.groupBy("category").agg(avg("price").alias("avg_price"))
avg_price_per_category.show()

+-----------+------------------+
|   category|         avg_price|
+-----------+------------------+
| Stationery|               1.5|
|  Groceries|0.4666666666666666|
|Electronics|             450.0|
|   Clothing|              30.0|
+-----------+------------------+



In [59]:
# 5. Find the Top 3 Highest Grossing Products
top_3_grossing_products = retail_df.groupBy("product_name").agg(sum("total_revenue").alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(3)
top_3_grossing_products.show()

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|      Laptop|        800.0|
|  Headphones|        200.0|
|       Pants|         75.0|
+------------+-------------+



In [60]:
# 6. Calculate the Total Number of Items Sold per Day
total_items_sold_per_day = retail_df.groupBy("sales_date").agg(sum("quantity").alias("total_items_sold"))
total_items_sold_per_day.show()

+----------+----------------+
|sales_date|total_items_sold|
+----------+----------------+
|2023-09-01|              12|
|2023-09-02|              17|
|2023-09-03|               4|
|2023-09-05|               9|
|2023-09-04|              12|
+----------+----------------+



In [64]:
# 7. Identify the Product with the Lowest Price in Each Category
lowest_price_per_category = retail_df.groupBy("category").agg(min("price").alias("price"))
product_lowest_price_per_category = retail_df.join(lowest_price_per_category, ["category", "price"])
product_lowest_price_per_category.show()

+-----------+-----+--------------+------------+--------+----------+------------------+------------------+
|   category|price|transaction_id|product_name|quantity|sales_date|     total_revenue|total_sales_amount|
+-----------+-----+--------------+------------+--------+----------+------------------+------------------+
|   Clothing| 15.0|             2|     T-shirt|       2|2023-09-01|              30.0|              30.0|
|  Groceries|  0.3|             4|      Banana|      12|2023-09-02|3.5999999999999996|3.5999999999999996|
|Electronics|100.0|             7|  Headphones|       2|2023-09-04|             200.0|             200.0|
| Stationery|  1.0|             8|         Pen|      10|2023-09-04|              10.0|              10.0|
+-----------+-----+--------------+------------+--------+----------+------------------+------------------+



In [65]:
# 8. Calculate the Total Revenue for Each Product
total_revenue_per_product = retail_df.groupBy("product_name").agg(sum("total_revenue").alias("total_revenue"))
total_revenue_per_product.show()

+------------+------------------+
|product_name|     total_revenue|
+------------+------------------+
|     T-shirt|              30.0|
|      Banana|3.5999999999999996|
|      Laptop|             800.0|
|    Notebook|              10.0|
|       Apple|               5.0|
|    Sneakers|              50.0|
|      Orange|               4.8|
|         Pen|              10.0|
|       Pants|              75.0|
|  Headphones|             200.0|
+------------+------------------+



In [66]:
# 9. Find the Total Sales per Day for Each Category
total_sales_per_day_per_category = retail_df.groupBy("sales_date", "category").agg(sum("total_sales_amount").alias("total_sales"))
total_sales_per_day_per_category.show()

+----------+-----------+------------------+
|sales_date|   category|       total_sales|
+----------+-----------+------------------+
|2023-09-01|  Groceries|               5.0|
|2023-09-02|  Groceries|3.5999999999999996|
|2023-09-01|   Clothing|              30.0|
|2023-09-02| Stationery|              10.0|
|2023-09-03|Electronics|             800.0|
|2023-09-05|  Groceries|               4.8|
|2023-09-04| Stationery|              10.0|
|2023-09-04|Electronics|             200.0|
|2023-09-03|   Clothing|              75.0|
|2023-09-05|   Clothing|              50.0|
+----------+-----------+------------------+



In [67]:
# 10. Create a New Column for Discounted Price
retail_df = retail_df.withColumn("discounted_price", col("price") * 0.9)
retail_df.show()


+--------------+------------+-----------+-----+--------+----------+------------------+------------------+----------------+
|transaction_id|product_name|   category|price|quantity|sales_date|     total_revenue|total_sales_amount|discounted_price|
+--------------+------------+-----------+-----+--------+----------+------------------+------------------+----------------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|               5.0|               5.0|            0.45|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|              30.0|              30.0|            13.5|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|              10.0|              10.0|             1.8|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|3.5999999999999996|3.5999999999999996|            0.27|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|             800.0|             800.0|           720.0|
|             6|