In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start (or reuse) the Spark session used by every cell of this notebook.
spark = (
    SparkSession.builder
    .appName("CustomerBehaviorAnalysis")
    .getOrCreate()
)

# Load the CSV: the first row supplies the column names, and column types
# are inferred from the data.
file_path = "/data/Mall_Customers.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview the first five rows.
df.show(5)

+----------+------+---+------------------+----------------------+
|CustomerID|Gender|Age|Annual Income (k$)|Spending Score (1-100)|
+----------+------+---+------------------+----------------------+
|         1|  Male| 19|                15|                    39|
|         2|  Male| 21|                15|                    81|
|         3|Female| 20|                16|                     6|
|         4|Female| 23|                16|                    77|
|         5|Female| 31|                17|                    40|
+----------+------+---+------------------+----------------------+
only showing top 5 rows



### 1 - Revenu moyen par âge, trié par ordre décroissant

In [2]:
# Mean annual income for each age, rounded to 2 decimals, highest first.
income_per_age_average = (
    df.groupBy("Age")
    .agg(
        F.round(F.avg("Annual Income (k$)"), 2)
        .alias("Average_Income")
    )
    .orderBy(F.col("Average_Income").desc())
)

income_per_age_average.show()

+---+--------------+
|Age|Average_Income|
+---+--------------+
| 41|         101.0|
| 32|         87.18|
| 28|         85.25|
| 36|          81.0|
| 33|         80.33|
| 34|          79.0|
| 56|          79.0|
| 30|         76.14|
| 44|          75.5|
| 38|          74.5|
| 39|         72.67|
| 47|         70.17|
| 45|         69.33|
| 43|         65.67|
| 59|         65.25|
| 37|          65.0|
| 57|          64.5|
| 29|          63.6|
| 27|         63.17|
| 66|          63.0|
+---+--------------+
only showing top 20 rows



### 2 - Score de dépenses moyen pour les clients de moins de 25 ans

In [3]:
# Average spending score over customers younger than 25, rounded to 2 decimals.
youth_spending_score = (
    df.where(F.col("Age") < 25)
    .agg(
        F.round(F.avg("Spending Score (1-100)"), 2)
        .alias("avg_spending_score_under_25")
    )
)

youth_spending_score.show()

+---------------------------+
|avg_spending_score_under_25|
+---------------------------+
|                      56.26|
+---------------------------+



### 3 - Analyse par groupe d'âge

In [6]:
# Add an "Age_Group" column that buckets each customer into an age band.
# The when-chain is evaluated in order, so each branch only needs its upper
# bound: a row reaching the second branch has already failed `Age < 25`.
# The last band is matched explicitly and there is no catch-all, so a row
# whose Age is null gets a null Age_Group instead of being counted as "60+".
age_group_df = df.withColumn(
    "Age_Group",
    F.when(df["Age"] < 25, "-25")
    .when(df["Age"] < 40, "25-39")
    .when(df["Age"] < 60, "40-59")
    .when(df["Age"] >= 60, "60+"),
)

age_group_df.show()

+----------+------+---+------------------+----------------------+---------+
|CustomerID|Gender|Age|Annual Income (k$)|Spending Score (1-100)|Age_Group|
+----------+------+---+------------------+----------------------+---------+
|         1|  Male| 19|                15|                    39|      -25|
|         2|  Male| 21|                15|                    81|      -25|
|         3|Female| 20|                16|                     6|      -25|
|         4|Female| 23|                16|                    77|      -25|
|         5|Female| 31|                17|                    40|    25-39|
|         6|Female| 22|                17|                    76|      -25|
|         7|Female| 35|                18|                     6|    25-39|
|         8|Female| 23|                18|                    94|      -25|
|         9|  Male| 64|                19|                     3|      60+|
|        10|Female| 30|                19|                    72|    25-39|
|        11|

In [7]:
# Average spending score within each age band, rounded to 2 decimals and
# ordered by the band label.
avg_spending_score_per_age_group = (
    age_group_df.groupBy("Age_Group")
    .agg(
        F.round(F.avg("Spending Score (1-100)"), 2)
        .alias("avg_spending_score_per_age_group")
    )
    .orderBy("Age_Group")
)

avg_spending_score_per_age_group.show()

+---------+--------------------------------+
|Age_Group|avg_spending_score_per_age_group|
+---------+--------------------------------+
|      -25|                           56.26|
|    25-39|                           61.48|
|    40-59|                           34.86|
|      60+|                            43.0|
+---------+--------------------------------+



In [8]:
# Stop the SparkSession
spark.stop()