In [None]:
# 🛠️ Step 1: Set up Spark Session
# Build the session used by every later cell. getOrCreate() hands back an
# already-active session when one exists instead of starting a second one,
# so re-running this cell is harmless.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("MallCustomersAnalysis")
    .getOrCreate()
)


In [None]:
# 📥 Step 2: Load Dataset
# Read the mall-customers CSV into a DataFrame.
df = spark.read.csv(
    "data/Mall_Customers.csv",
    header=True,       # take the column names from the first row of the file
    inferSchema=True,  # let Spark infer column types from the data
)

# Quick sanity check: print the resulting schema, then preview the first 5 rows.
df.printSchema()
df.show(5)


In [None]:
# 📊 Step 3: Spending Score Distribution
# Count the customers at each spending score and list the scores in ascending
# order. show(20) prints only the first 20 rows, so any score groups beyond
# the 20 lowest are not displayed.
(
    df.groupBy("Spending Score (1-100)")
    .count()
    .orderBy("Spending Score (1-100)")
    .show(20)
)


In [None]:
# 📈 Step 4: Average Income by Spending Score
# For each spending score, compute the mean annual income and the number of
# customers in that group, then print the first 20 groups in ascending score
# order.
from pyspark.sql.functions import avg, count

(
    df.groupBy("Spending Score (1-100)")
    .agg(
        avg("Annual Income (k$)").alias("Avg Income"),
        count("*").alias("Count"),
    )
    .orderBy("Spending Score (1-100)")
    .show(20)
)


## 🧠 Insights Summary

- Most customers fall into the middle range of spending scores.
- Income doesn't correlate directly with spending score — some high spenders have only a moderate income.
- Segment opportunities:
  - **High Score + Low Income**: Loyal, budget-conscious segment
  - **High Score + High Income**: Premium target segment
