In [1]:
from pyspark.sql import SparkSession
from IPython.display import display, HTML

# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('PySparkTest')
    .getOrCreate()
)

# Inject a CSS rule that centers the content of notebook output areas.
display(HTML("<style>.jp-Cell-outputArea { text-align: center; }</style>"))

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/11 01:06:08 WARN Utils: Your hostname, N0L144853, resolves to a loopback address: 127.0.1.1; using 192.168.68.107 instead (on interface wlp0s20f3)
25/08/11 01:06:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/11 01:06:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/11 01:06:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/11 01:06:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/08/11 01:06:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/08/11 01:06:09 WARN Util

In [2]:
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Maximum number of score-ranked offers kept per client.
TOP_N_RECOMMENDATIONS = 3
# Maximum number of cluster-popularity-ranked offers kept per client.
TOP_N_CLUSTER_RECOMMENDATIONS = 3
# Number of most-completed offers kept per offer_type.
TOP_N_OFFERS = 2
# Minimum y_proba_class1 an offer must reach for its offer_type to be used
# when suggesting popular offers to that client.
THRESH = 0.9

# Read Files

In [3]:
# Locations of the three input datasets read below, and of the allocation
# output saved at the end of the notebook.
path_rf_predictions = '../data/trusted/rf_output_model'
path_km_predictions = '../data/trusted/km_output_model'
path_freq_bayes = '../data/trusted/freq_bayes'
path_output_allocation = "../data/refined/allocation"

# Parquet files store their own schema, so the CSV-specific "header" and
# "inferSchema" options have no effect on this reader and were removed.
df_rf_predictions = spark.read.parquet(path_rf_predictions)
df_km_predictions = spark.read.parquet(path_km_predictions)
df_freq_bayes = spark.read.parquet(path_freq_bayes)

In [4]:
# Preview the first 5 rows of df_freq_bayes to inspect its columns.
df_freq_bayes.show(5)

+--------------------+-------------+--------------------+-------------------+----------------+----------+
|           client_id|   offer_type|            offer_id|qtd_offer_completed|offer_type_count|freq_bayes|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
|0009655768c64bdeb...|     discount|2906b810c7d441179...|                  1|               2|       0.5|
|0009655768c64bdeb...|informational|3f207df678b143eea...|                  0|               2|       0.0|
|0009655768c64bdeb...|informational|5a8bc65990b245e5a...|                  0|               2|       0.0|
|0009655768c64bdeb...|         bogo|f19421c1d4aa40978...|                  1|               1|       1.0|
|0009655768c64bdeb...|     discount|fafdcd668e3743c1b...|                  1|               2|       0.5|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
only showing top 5 rows


In [5]:
# Compare how many distinct (client_id, offer_id) pairs each source holds;
# these two columns are the join keys used to build the offer score.
(
    df_rf_predictions.select('client_id', 'offer_id').distinct().count(),
    df_freq_bayes.select('client_id', 'offer_id').distinct().count(),
)

(63288, 63288)

### Gerando Score de Propensão e Recomendações

In [None]:
# Rank every offer of a client by its combined score.
# The ordering ends with offer_id so that rows tied on all three score columns
# always receive the same rank; without a unique tie-breaker row_number()
# assigns tied rows arbitrarily and the ranking can change between runs.
offer_rank_window = (
    Window.partitionBy("client_id")
    .orderBy(
        F.col("offer_score").desc(),
        F.col("y_proba_class1").desc(),
        F.col("freq_bayes").desc(),
        F.col("offer_id").asc(),
    )
)

# offer_score = freq_bayes * y_proba_class1, computed for every
# (client_id, offer_id) pair present in both inputs (inner join).
df_offer_score = (
    df_rf_predictions
    .join(df_freq_bayes, on=['client_id', 'offer_id'], how='inner')
    .withColumn("offer_score", F.col("freq_bayes") * F.col("y_proba_class1"))
    .withColumn("ranking", F.row_number().over(offer_rank_window))
    .select('client_id', 'offer_id', 'offer_type', 'freq_bayes', 'y_proba_class1', 'offer_score', 'ranking')
    .orderBy('client_id', 'ranking')
)
df_offer_score.show(25, truncate=False)

In [None]:
# Keep each client's best-ranked offers and collapse them into one array.
# The collected struct starts with "ranking", so array_sort orders the
# entries by rank before the offer ids are extracted from them.
df_recommended_offers = (
    df_offer_score
    .where(F.col("ranking") <= TOP_N_RECOMMENDATIONS)
    .groupBy("client_id")
    .agg(
        F.array_sort(
            F.collect_list(F.struct("ranking", "offer_id", "offer_type", "offer_score"))
        ).alias("items_sorted")
    )
    .withColumn("recommended_offers", F.expr("transform(items_sorted, x -> x.offer_id)"))
    .drop("items_sorted")
    .sort('client_id')
)
df_recommended_offers.show(truncate=False)

In [None]:
# NOTE(review): dead code. This commented-out draft reads from `df_top`, which
# is not defined anywhere in the visible notebook, and repeats the aggregation
# already used to build df_recommended_offers. Candidate for removal.
# df_ranking = df_top.groupBy("client_id").agg(F.array_sort(F.collect_list(F.struct("ranking","offer_id","offer_type","offer_score"))).alias("items_sorted"))\
#                    .withColumn("recommended_offers", F.expr("transform(items_sorted, x -> x.offer_id)"))\
#                    .drop("items_sorted")\
#                    .orderBy('client_id')
# df_ranking.show(truncate=False)

### Gerando Recomendações Intra-Cluster

In [None]:
# Preview the first 5 rows of df_km_predictions to inspect its columns.
df_km_predictions.show(5)

In [None]:
# One row per distinct (client_id, cluster) pair found in df_km_predictions.
df_clusters = (
    df_km_predictions
    .select("client_id", "cluster")
    .dropDuplicates()
)
df_clusters.show(5)

In [None]:
# Attach each client's cluster to the distinct (client_id, offer_id) pairs
# present in df_freq_bayes; clients without a cluster are dropped (inner join).
df_cluster_offers = (
    df_freq_bayes
    .select("client_id", "offer_id")
    .dropDuplicates()
    .join(df_clusters, on="client_id", how="inner")
)
df_cluster_offers.show(5, truncate=False)

In [None]:
# Count the rows of df_cluster_offers for every (cluster, offer_id) pair and
# store the result as qtd_clientes.
df_cluster_popular = (
    df_cluster_offers
    .groupBy("cluster", "offer_id")
    .agg(F.count("*").alias("qtd_clientes"))
)
df_cluster_popular.show(5, truncate=False)

In [None]:
# Rank a client's candidate offers by how common they are in the client's
# cluster. offer_id is the final sort key so that offers tied on qtd_clientes
# always get the same rank; without it row_number() breaks ties arbitrarily
# and the top-N selection can change between runs.
cluster_rank_window = (
    Window.partitionBy("client_id")
    .orderBy(F.col("qtd_clientes").desc(), F.col("offer_id").asc())
)

# Candidates are the offers seen in the client's cluster, minus every
# (client_id, offer_id) pair that already appears in df_freq_bayes (left_anti).
# The previous .distinct() was dropped: "ranking" is unique within a client,
# so the rows can never be duplicated. The top-N filter now runs before the
# sort, which yields the same rows in the same order while sorting fewer of them.
recs_cluster = (
    df_clusters
    .join(df_cluster_popular, on="cluster", how="inner")
    .join(
        df_freq_bayes.select("client_id", "offer_id").distinct(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .withColumn("ranking", F.row_number().over(cluster_rank_window))
    .filter(F.col("ranking") <= TOP_N_CLUSTER_RECOMMENDATIONS)
    .orderBy('client_id', "cluster", F.desc("qtd_clientes"), "ranking")
)
recs_cluster.show(truncate=False)

In [None]:
# Drop cluster suggestions whose (client_id, offer_id) pair already exists in
# df_offer_score, then collapse each client's remaining offers into an array
# ordered by ranking (the struct's first field drives array_sort).
df_recommended_cluster_offers = (
    recs_cluster
    .join(
        df_offer_score.select("client_id", "offer_id").dropDuplicates(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .groupBy("client_id")
    .agg(
        F.array_sort(
            F.collect_list(F.struct("ranking", "offer_id"))
        ).alias("cluster_offer_sorted")
    )
    .withColumn("recommended_cluster_offers", F.expr("transform(cluster_offer_sorted, x -> x.offer_id)"))
    .drop("cluster_offer_sorted")
    .sort('client_id')
)
df_recommended_cluster_offers.show(truncate=False)

### Pegando as Top Ofertas mais adquiridas pelos clientes por tipo de oferta e recomendando para complementar o pacote de recomendações

In [None]:
# Preview df_freq_bayes again; the popularity ranking below aggregates its
# qtd_offer_completed column per offer.
df_freq_bayes.show(5)

In [None]:
# Rank offers inside each offer_type by total completions. offer_id is the
# final sort key so that offers with equal totals always get the same rank;
# without it row_number() breaks ties arbitrarily and the selected top offers
# can change between runs.
popularity_rank_window = (
    Window.partitionBy("offer_type")
    .orderBy(F.col("qtd_offer_completed").desc(), F.col("offer_id").asc())
)

# Sum completions per offer, discard offers never completed, and keep the
# TOP_N_OFFERS best-ranked offers of every offer_type.
popular_offers = (
    df_freq_bayes
    .groupBy("offer_id", "offer_type")
    .agg(F.sum("qtd_offer_completed").alias("qtd_offer_completed"))
    .filter("qtd_offer_completed > 0")
    .withColumn("ranking", F.row_number().over(popularity_rank_window))
    .filter(F.col("ranking") <= TOP_N_OFFERS)
)

popular_offers.show(truncate=False)

In [None]:
# For every client, find the offer types where at least one offer reaches
# THRESH on y_proba_class1, then pair the client with the popular offers of
# those types.
recs_popular_offers = (
    df_offer_score
    .where(F.col("y_proba_class1") >= THRESH)
    .select("client_id", "offer_type")
    .dropDuplicates()
    .join(
        popular_offers.select("offer_id", "offer_type", "ranking"),
        on='offer_type',
        how="inner",
    )
    .sort('client_id')
)

recs_popular_offers.show(5, truncate=False)

In [None]:
# Remove popular-offer suggestions whose (client_id, offer_id) pair already
# appears in df_offer_score or in recs_cluster, then collapse what is left
# into one array per client ordered by the popularity ranking.
df_recommended_popular_offers = (
    recs_popular_offers
    .join(
        df_offer_score.select("client_id", "offer_id").dropDuplicates(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .join(
        recs_cluster.select("client_id", "offer_id").dropDuplicates(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .groupBy("client_id")
    .agg(
        F.array_sort(
            F.collect_list(F.struct("ranking", "offer_id"))
        ).alias("popular_offer_sorted")
    )
    .withColumn("recommended_popular_offers", F.expr("transform(popular_offer_sorted, x -> x.offer_id)"))
    .drop("popular_offer_sorted")
    .sort('client_id')
)

df_recommended_popular_offers.show(truncate=False)

### Juntando recomendações vindas dos scores com recomendações de popularidade e recomendações intra-cluster

In [None]:
# Start from the score-based recommendations and left-join the cluster and
# popularity lists, so every client with score-based offers is kept.
# Clients missing from either optional list get an empty string array
# instead of null.
df_allocation = (
    df_recommended_offers
    .join(df_recommended_cluster_offers, on='client_id', how='left')
    .join(df_recommended_popular_offers, on='client_id', how='left')
    .withColumn(
        "recommended_popular_offers",
        F.coalesce(
            F.col("recommended_popular_offers"),
            F.array().cast(T.ArrayType(T.StringType())),
        ),
    )
    .withColumn(
        "recommended_cluster_offers",
        F.coalesce(
            F.col("recommended_cluster_offers"),
            F.array().cast(T.ArrayType(T.StringType())),
        ),
    )
    .sort('client_id')
)
df_allocation.show()

In [None]:
# Save the final allocation as Parquet, replacing any previous output at this path.
df_allocation.write.mode("overwrite").parquet(path_output_allocation)