In [2]:
# Configuracion de spark

import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

app_name = "Avance 2"

spark = SparkSession.builder \
    .appName(app_name) \
    .config("spark.jars", "/spark/jars/postgresql.jar") \
    .getOrCreate()

app_name = spark.conf.get("spark.app.name")
spark_context = spark.sparkContext.master

In [3]:
# Creacion del schema

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("CustomerID", DoubleType(), True),
    StructField("Gender", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Tenure_Months", DoubleType(), True),
    StructField("Transaction_ID", DoubleType(), True),
    StructField("Transaction_Date", DateType(), True),
    StructField("Product_SKU", StringType(), True),
    StructField("Product_Description", StringType(), True),
    StructField("Product_Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("Avg_Price", DoubleType(), True),
    StructField("Delivery_Charges", DoubleType(), True),
    StructField("Coupon_Status", StringType(), True),
    StructField("GST", DoubleType(), True),
    StructField("Date", DateType(), True),
    StructField("Offline_Spend", DoubleType(), True),
    StructField("Online_Spend", DoubleType(), True),
    StructField("Month", IntegerType(), True),
    StructField("Coupon_Code", StringType(), True),
    StructField("Discount_pct", DoubleType(), True)
])

In [5]:
# Cargar los datos
archivo_csv = '/app/onlineShoppingDataset/file.csv'
online_shopping = spark.read.csv(archivo_csv, header=True, schema=schema)

print(online_shopping.show())

+---+----------+------+--------+-------------+--------------+----------------+--------------+--------------------+----------------+--------+---------+----------------+-------------+---+----+-------------+------------+-----+-----------+------------+
| id|CustomerID|Gender|Location|Tenure_Months|Transaction_ID|Transaction_Date|   Product_SKU| Product_Description|Product_Category|Quantity|Avg_Price|Delivery_Charges|Coupon_Status|GST|Date|Offline_Spend|Online_Spend|Month|Coupon_Code|Discount_pct|
+---+----------+------+--------+-------------+--------------+----------------+--------------+--------------------+----------------+--------+---------+----------------+-------------+---+----+-------------+------------+-----+-----------+------------+
|  0|   17850.0|     M| Chicago|         12.0|       16679.0|      2019-01-01|GGOENEBJ079499|Nest Learning The...|        Nest-USA|    null|   153.71|             6.5|         Used|0.1|null|       4500.0|      2424.5|    1|     ELEC10|        10.0|
|  1

In [31]:
# Convertir columnas categóricas en numéricas
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

cat_cols = [t[0] for t in online_shopping.dtypes if t[1] == 'string']
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid='keep').fit(online_shopping) 
    for column in cat_cols
]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(online_shopping).transform(online_shopping)

print(df_r.show())

+---+----------+------+--------+-------------+--------------+----------------+--------------+--------------------+----------------+--------+---------+----------------+-------------+---+----+-------------+------------+-----+-----------+------------+------------+--------------+-----------------+-------------------------+----------------------+-------------------+-----------------+
| id|CustomerID|Gender|Location|Tenure_Months|Transaction_ID|Transaction_Date|   Product_SKU| Product_Description|Product_Category|Quantity|Avg_Price|Delivery_Charges|Coupon_Status|GST|Date|Offline_Spend|Online_Spend|Month|Coupon_Code|Discount_pct|Gender_index|Location_index|Product_SKU_index|Product_Description_index|Product_Category_index|Coupon_Status_index|Coupon_Code_index|
+---+----------+------+--------+-------------+--------------+----------------+--------------+--------------------+----------------+--------+---------+----------------+-------------+---+----+-------------+------------+-----+-----------+-

In [32]:
# Seleccionar columnas relevantes para la segmentación

feature_cols = num_cols + [c + "_index" for c in cat_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [33]:
# Como tenemos nullos rellenamos los datos

df_r = df_r.na.fill(0)

In [34]:
# Implementación de K-means

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [35]:
# Configurar el algoritmo K-means

kmeans = KMeans(featuresCol='scaledFeatures', k=5)

In [36]:
pipeline = Pipeline(stages=[assembler, scaler, kmeans])

model = pipeline.fit(df_r)

23/11/22 23:17:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/22 23:17:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [39]:
# Evaluación de Resultados

centers = model.stages[-1].clusterCenters()
print("Cluster Centers: ")

for center in centers:
    print(center)

Cluster Centers: 
[1.68256575 8.70543915 1.95189859 3.5938249  0.         0.37136288
 0.52772433 3.85017407 3.00634136 2.30955414 1.84232409 2.55803353
 0.76998777 1.01328131 2.4953124  2.22837979 0.1186276  0.8807127
 0.24662745]
[1.34301307 8.64021451 1.96243941 4.84882441 0.         1.76345879
 0.40555289 2.13157738 3.52513156 2.72410166 3.08083792 2.95871068
 0.78639398 1.01042389 0.11220683 0.19841379 0.68581403 0.88241591
 0.89473608]
[2.17783739 8.58408635 1.93596861 3.9450754  0.         0.24689608
 0.53706838 3.66667716 2.80345    2.22294641 2.20936042 2.21621966
 0.76369379 1.04264499 0.44726852 0.53743868 0.6028304  0.87899969
 0.64583185]
[1.40749665 8.86334338 1.92547572 2.67255763 0.         1.11863763
 0.63256684 2.61848888 2.90623719 2.21331211 0.90787218 2.30212642
 0.78111882 1.05009288 0.16859103 0.32320246 0.52359669 0.87939384
 0.70713469]
[2.17609539 8.67130377 1.89932614 3.64232437 0.         0.46639323
 0.67106134 2.00619434 2.94829211 2.3007182  1.89747695 2.41

In [41]:
# Transformar los datos para obtener las etiquetas de cluster

result = model.transform(df_r)
result.select('CustomerID', 'scaledFeatures', 'prediction').show()

+----------+--------------------+----------+
|CustomerID|      scaledFeatures|prediction|
+----------+--------------------+----------+
|   17850.0|[0.0,10.107365352...|         3|
|   17850.0|[6.54153320295751...|         3|
|   17850.0|[1.30830664059150...|         3|
|   17850.0|[1.96245996088725...|         3|
|   17850.0|[2.61661328118300...|         3|
|   17850.0|[3.27076660147875...|         3|
|   17850.0|[3.92491992177450...|         3|
|   17850.0|[4.57907324207025...|         3|
|   17850.0|[5.23322656236601...|         3|
|   17850.0|[5.88737988266176...|         3|
|   17850.0|[6.54153320295751...|         3|
|   17850.0|[7.19568652325326...|         3|
|   17850.0|[7.84983984354901...|         3|
|   17850.0|[8.50399316384476...|         3|
|   17850.0|[9.15814648414051...|         3|
|   17850.0|[9.81229980443627...|         3|
|   17850.0|[0.00104664531247...|         3|
|   17850.0|[0.00111206064450...|         3|
|   17850.0|[0.00117747597653...|         3|
|   17850.

In [42]:
# Escogemos un segmento

segment = result.filter(result['prediction'] == 0)

In [43]:
# Analizar características comunes

avg_values = segment.groupBy('prediction').mean('Tenure_Months', 'Avg_Price')
avg_values.show()

+----------+------------------+-----------------+
|prediction|avg(Tenure_Months)|   avg(Avg_Price)|
+----------+------------------+-----------------+
|         0| 26.29596223769822|23.76233465770619|
+----------+------------------+-----------------+



In [44]:
# Contar las categorías más comunes para una variable categórica

common_categories = segment.groupBy('Product_Category').count().orderBy('count', ascending=False)
common_categories.show()

+--------------------+-----+
|    Product_Category|count|
+--------------------+-----+
|             Apparel| 7705|
|           Drinkware|  374|
|              Office|  338|
|           Lifestyle|  161|
|                Bags|   58|
|            Nest-USA|   36|
|                Nest|   14|
|Notebooks & Journals|    1|
+--------------------+-----+



In [52]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import collect_set

# Transformar los datos en un formato adecuado para FPGrowth

transactions = segment.groupBy("Transaction_ID").agg(collect_set("Product_Description").alias("items"))


In [53]:
# Aplicar FPGrowth

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.1)
model = fpGrowth.fit(transactions)

In [54]:
# Mostrar conjuntos de elementos frecuentes

frequent_itemsets = model.freqItemsets
frequent_itemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [55]:
# Generar reglas de asociación

association_rules = model.associationRules
association_rules.show()

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+



In [56]:
association_rules.orderBy("lift", ascending=False).show()

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

