In [2]:
# Configuracion de spark

import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


app_name = "Avance 1"

spark = SparkSession.builder \
    .appName(app_name) \
    .config("spark.jars", "/spark/jars/postgresql.jar") \
    .getOrCreate()

app_name = spark.conf.get("spark.app.name")
spark_context = spark.sparkContext.master


In [3]:
# Cargar los datos
archivo_csv = '/app/Balaji Fast Food Sales.csv'
data_frame_balaji = spark.read.csv(archivo_csv, header=True, inferSchema=True)

print(data_frame_balaji.show())

+--------+----------+---------------+---------+----------+--------+------------------+----------------+-----------+------------+
|order_id|      date|      item_name|item_type|item_price|quantity|transaction_amount|transaction_type|received_by|time_of_sale|
+--------+----------+---------------+---------+----------+--------+------------------+----------------+-----------+------------+
|       1|07-03-2022|       Aalopuri| Fastfood|        20|      13|               260|            null|        Mr.|       Night|
|       2| 8/23/2022|        Vadapav| Fastfood|        20|      15|               300|            Cash|        Mr.|   Afternoon|
|       3|11/20/2022|        Vadapav| Fastfood|        20|       1|                20|            Cash|        Mr.|   Afternoon|
|       4|02-03-2023|Sugarcane juice|Beverages|        25|       6|               150|          Online|        Mr.|       Night|
|       5|10-02-2022|Sugarcane juice|Beverages|        25|       8|               200|          O

In [11]:
# Agrupar por order_id para obtener el total gastado y la cantidad total de artículos

# Falto esta importacion
from pyspark.sql.functions import sum

df_grouped = data_frame_balaji.groupBy('order_id').agg(
    sum('transaction_amount').alias('total_spent'),
    sum('quantity').alias('total_items')
)

print(df_grouped.show())

+--------+-----------+-----------+
|order_id|total_spent|total_items|
+--------+-----------+-----------+
|     148|        780|         13|
|     463|        360|          6|
|     471|        140|          7|
|     496|        400|         10|
|     833|         40|          2|
|     243|        100|          5|
|     392|        300|         15|
|     540|        260|         13|
|     623|        600|         15|
|     737|        650|         13|
|     858|        360|          9|
|     897|        120|          6|
|      31|        520|         13|
|     516|        300|         12|
|      85|         60|          3|
|     137|        150|          6|
|     251|        520|         13|
|     451|        250|         10|
|     580|        200|         10|
|     808|         20|          1|
+--------+-----------+-----------+
only showing top 20 rows

None


In [8]:
# Selección de características para el modelo K-means

feature_columns = ['total_spent', 'total_items']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")


In [10]:
# Vector de características

df_kmeans = assembler.transform(df_grouped).select('order_id', 'features')
print(df_kmeans.show())


+--------+------------+
|order_id|    features|
+--------+------------+
|     148|[780.0,13.0]|
|     463| [360.0,6.0]|
|     471| [140.0,7.0]|
|     496|[400.0,10.0]|
|     833|  [40.0,2.0]|
|     243| [100.0,5.0]|
|     392|[300.0,15.0]|
|     540|[260.0,13.0]|
|     623|[600.0,15.0]|
|     737|[650.0,13.0]|
|     858| [360.0,9.0]|
|     897| [120.0,6.0]|
|      31|[520.0,13.0]|
|     516|[300.0,12.0]|
|      85|  [60.0,3.0]|
|     137| [150.0,6.0]|
|     251|[520.0,13.0]|
|     451|[250.0,10.0]|
|     580|[200.0,10.0]|
|     808|  [20.0,1.0]|
+--------+------------+
only showing top 20 rows

None


In [12]:
# Preparacion para uso del algoritmo

# Escalado de características
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
# calcula la media y la desviación estándar de cada característica en el DataFrame 
scalerModel = scaler.fit(df_kmeans)

# con la instacina de la clase StandardScaler, generamos una columna que contiene las caracterisitcas
df_scaled = scalerModel.transform(df_kmeans)

print(df_scaled.show())


+--------+------------+--------------------+
|order_id|    features|      scaledFeatures|
+--------+------------+--------------------+
|     148|[780.0,13.0]|[3.81599135777046...|
|     463| [360.0,6.0]|[1.76122678050944...|
|     471| [140.0,7.0]|[0.68492152575367...|
|     496|[400.0,10.0]|[1.95691864501049...|
|     833|  [40.0,2.0]|[0.19569186450104...|
|     243| [100.0,5.0]|[0.48922966125262...|
|     392|[300.0,15.0]|[1.46768898375787...|
|     540|[260.0,13.0]|[1.27199711925682...|
|     623|[600.0,15.0]|[2.93537796751574...|
|     737|[650.0,13.0]|[3.17999279814205...|
|     858| [360.0,9.0]|[1.76122678050944...|
|     897| [120.0,6.0]|[0.58707559350314...|
|      31|[520.0,13.0]|[2.54399423851364...|
|     516|[300.0,12.0]|[1.46768898375787...|
|      85|  [60.0,3.0]|[0.29353779675157...|
|     137| [150.0,6.0]|[0.73384449187893...|
|     251|[520.0,13.0]|[2.54399423851364...|
|     451|[250.0,10.0]|[1.22307415313155...|
|     580|[200.0,10.0]|[0.97845932250524...|
|     808|

In [16]:
# Construcción del modelo K-means

# creamos una instacia del algoritmo, diciendo que columna usar, y cuantos cluster queremos que encuentre
kmeans = KMeans(featuresCol="scaledFeatures", k=3)
model = kmeans.fit(df_scaled)

print(model)


KMeansModel: uid=KMeans_b098b4bdf54b, k=3, distanceMeasure=euclidean, numFeatures=2


In [19]:
# Predecir los clusters
predictions = model.transform(df_scaled)

print(predictions.show())


+--------+------------+--------------------+----------+
|order_id|    features|      scaledFeatures|prediction|
+--------+------------+--------------------+----------+
|     148|[780.0,13.0]|[3.81599135777046...|         2|
|     463| [360.0,6.0]|[1.76122678050944...|         0|
|     471| [140.0,7.0]|[0.68492152575367...|         1|
|     496|[400.0,10.0]|[1.95691864501049...|         0|
|     833|  [40.0,2.0]|[0.19569186450104...|         1|
|     243| [100.0,5.0]|[0.48922966125262...|         1|
|     392|[300.0,15.0]|[1.46768898375787...|         0|
|     540|[260.0,13.0]|[1.27199711925682...|         0|
|     623|[600.0,15.0]|[2.93537796751574...|         2|
|     737|[650.0,13.0]|[3.17999279814205...|         2|
|     858| [360.0,9.0]|[1.76122678050944...|         0|
|     897| [120.0,6.0]|[0.58707559350314...|         1|
|      31|[520.0,13.0]|[2.54399423851364...|         2|
|     516|[300.0,12.0]|[1.46768898375787...|         0|
|      85|  [60.0,3.0]|[0.29353779675157...|    

In [23]:
# Evaluación del modelo
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Coeficiente de silueta = " + str(silhouette))


Coeficiente de silueta = 0.5963279828555033


In [24]:
# Ver los resultados

centers = model.clusterCenters()
print("Cluster Centers: ")

for center in centers:
    print(center)


Cluster Centers: 
[1.48265555 2.41507218]
[0.51356975 0.82542895]
[3.28685999 2.94739883]
