In [1]:
from IPython.display import display, HTML
from pyspark.sql import SparkSession

# Create the Spark session used by every later cell, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('PySparkTest')
    .getOrCreate()
)

# Inject a CSS rule that sets `text-align: center` on the notebook's cell output areas.
display(HTML("<style>.jp-Cell-outputArea { text-align: center; }</style>"))

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/10 16:38:21 WARN Utils: Your hostname, N0L144853, resolves to a loopback address: 127.0.1.1; using 192.168.3.112 instead (on interface wlp0s20f3)
25/08/10 16:38:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/10 16:38:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/10 16:38:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/10 16:38:21 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/08/10 16:38:21 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/08/10 16:38:21 WARN Utils

In [2]:
import plotly.express as px
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Tunable parameters for the recommendation steps in this notebook.
# Highest `ranking` kept per client when building `recommended_offers`.
TOP_N_RECOMMENDATIONS = 3
# Highest `ranking` kept per client when building the cluster-based recommendations.
TOP_N_CLUSTER_RECOMMENDATIONS = 3
# Number of most-completed offers kept per `offer_type` in `popular_offers`.
TOP_N_OFFERS = 2
# Minimum `y_proba_class1` an offer must reach for its offer type to receive popular-offer recommendations.
THRESH = 0.9

# Read Files

In [3]:
# Input locations (relative to the notebook's working directory).
path_rf_predictions = '../data/trusted/rf_output_model'
path_km_predictions = '../data/trusted/km_output_model'
path_freq_bayes = '../data/trusted/freq_bayes'

# Parquet files embed their own schema. The "header" and "inferSchema" options
# previously passed here are CSV reader options and have no effect on a Parquet
# source, so the inputs are read with the dedicated Parquet reader instead.
df_rf_predictions = spark.read.parquet(path_rf_predictions)
df_km_predictions = spark.read.parquet(path_km_predictions)
df_freq_bayes = spark.read.parquet(path_freq_bayes)

In [4]:
# Preview the first five rows of the frequency table.
df_freq_bayes.show(n=5)

+--------------------+-------------+--------------------+-------------------+----------------+----------+
|           client_id|   offer_type|            offer_id|qtd_offer_completed|offer_type_count|freq_bayes|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
|0009655768c64bdeb...|     discount|2906b810c7d441179...|                  1|               2|       0.5|
|0009655768c64bdeb...|informational|3f207df678b143eea...|                  0|               2|       0.0|
|0009655768c64bdeb...|informational|5a8bc65990b245e5a...|                  0|               2|       0.0|
|0009655768c64bdeb...|         bogo|f19421c1d4aa40978...|                  1|               1|       1.0|
|0009655768c64bdeb...|     discount|fafdcd668e3743c1b...|                  1|               2|       0.5|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
only showing top 5 rows


In [5]:
# Number of distinct (client_id, offer_id) pairs in each input, shown as a tuple
# in the order (df_rf_predictions, df_freq_bayes).
tuple(
    frame.select('client_id', 'offer_id').distinct().count()
    for frame in (df_rf_predictions, df_freq_bayes)
)

(63288, 63288)

### Gerando Score de Propensão e Recomendações

In [6]:
# Rank each client's offers from best to worst score.
# `offer_id` is the last sort key so that rows tying on all three score columns
# always get the same ranking; without a unique sort key row_number() may order
# tied rows differently from one run to the next.
# NOTE(review): this is fully deterministic only if each (client_id, offer_id)
# pair appears once after the join — confirm.
offer_score_window = Window.partitionBy("client_id").orderBy(
    F.col("offer_score").desc(),
    F.col("y_proba_class1").desc(),
    F.col("freq_bayes").desc(),
    F.col("offer_id").asc(),
)

# offer_score = freq_bayes * y_proba_class1, computed on the rows present in both inputs.
df_offer_score = (
    df_rf_predictions.join(df_freq_bayes, on=['client_id', 'offer_id'], how='inner')
    .withColumn("offer_score", F.col("freq_bayes") * F.col("y_proba_class1"))
    .withColumn("ranking", F.row_number().over(offer_score_window))
    .select('client_id', 'offer_id', 'offer_type', 'freq_bayes', 'y_proba_class1', 'offer_score', 'ranking')
    .orderBy('client_id', 'ranking')
)
df_offer_score.show(25, truncate=False)

+--------------------------------+--------------------------------+-------------+------------------+--------------------+-------------------+-------+
|client_id                       |offer_id                        |offer_type   |freq_bayes        |y_proba_class1      |offer_score        |ranking|
+--------------------------------+--------------------------------+-------------+------------------+--------------------+-------------------+-------+
|0009655768c64bdeb2e877511632db8f|f19421c1d4aa40978ebb69ca19b0e20d|bogo         |1.0               |0.9036308894898399  |0.9036308894898399 |1      |
|0009655768c64bdeb2e877511632db8f|fafdcd668e3743c1bb461111dcafc2a4|discount     |0.5               |0.9306488095685482  |0.4653244047842741 |2      |
|0009655768c64bdeb2e877511632db8f|2906b810c7d4411798c6938adc9daaa5|discount     |0.5               |0.9215585153808372  |0.4607792576904186 |3      |
|0009655768c64bdeb2e877511632db8f|3f207df678b143eea3cee63160fa8bed|informational|0.0               |

In [7]:
# Keep each client's offers with ranking <= TOP_N_RECOMMENDATIONS and pack their
# ids into one array per client.
df_recommended_offers = (
    df_offer_score
    .filter(F.col("ranking") <= TOP_N_RECOMMENDATIONS)
    .groupBy("client_id")
    # "ranking" is the first struct field, so sorting the structs orders the list by ranking.
    .agg(
        F.array_sort(
            F.collect_list(F.struct("ranking", "offer_id", "offer_type", "offer_score"))
        ).alias("items_sorted")
    )
    # Reduce each sorted struct to its offer_id.
    .withColumn("recommended_offers", F.expr("transform(items_sorted, x -> x.offer_id)"))
    .drop("items_sorted")
    .orderBy('client_id')
)
df_recommended_offers.show(truncate=False)

+--------------------------------+------------------------------------------------------------------------------------------------------+
|client_id                       |recommended_offers                                                                                    |
+--------------------------------+------------------------------------------------------------------------------------------------------+
|0009655768c64bdeb2e877511632db8f|[f19421c1d4aa40978ebb69ca19b0e20d, fafdcd668e3743c1bb461111dcafc2a4, 2906b810c7d4411798c6938adc9daaa5]|
|00116118485d4dfda04fdbaba9a87b5c|[f19421c1d4aa40978ebb69ca19b0e20d]                                                                    |
|0011e0d4e6b944f998e987f904e8c1e5|[9b98b8c7a33c4b65b9aebfe6a799e6d9, 2298d6c36e964ae4a3e7e9706d1fb8c2, 0b1e1539f2cc45b7b9fa7c272da2e1d7]|
|0020c2b971eb4e9188eac86d93036a77|[fafdcd668e3743c1bb461111dcafc2a4, 4d5c57ea9a6940dd891ad53e9dbe8da0, ae264e3637204a6fb9bb56bc8210ddfd]|
|0020ccbbb6d84e358d3414a3ff76cffd|

In [128]:
# NOTE(review): disabled draft of the per-client aggregation. It reads `df_top`,
# which no visible cell defines, so it would fail if uncommented. The cell that
# builds `df_recommended_offers` applies the same aggregation to `df_offer_score`.
# df_ranking = df_top.groupBy("client_id").agg(F.array_sort(F.collect_list(F.struct("ranking","offer_id","offer_type","offer_score"))).alias("items_sorted"))\
#                    .withColumn("recommended_offers", F.expr("transform(items_sorted, x -> x.offer_id)"))\
#                    .drop("items_sorted")\
#                    .orderBy('client_id')
# df_ranking.show(truncate=False)

### Pegando as Top Ofertas mais adquiridas pelos clientes por tipo de oferta e recomendando para complementar o pacote de recomendações

In [8]:
# Show the first five rows of the frequency table again before aggregating it.
df_freq_bayes.show(n=5)

+--------------------+-------------+--------------------+-------------------+----------------+----------+
|           client_id|   offer_type|            offer_id|qtd_offer_completed|offer_type_count|freq_bayes|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
|0009655768c64bdeb...|     discount|2906b810c7d441179...|                  1|               2|       0.5|
|0009655768c64bdeb...|informational|3f207df678b143eea...|                  0|               2|       0.0|
|0009655768c64bdeb...|informational|5a8bc65990b245e5a...|                  0|               2|       0.0|
|0009655768c64bdeb...|         bogo|f19421c1d4aa40978...|                  1|               1|       1.0|
|0009655768c64bdeb...|     discount|fafdcd668e3743c1b...|                  1|               2|       0.5|
+--------------------+-------------+--------------------+-------------------+----------------+----------+
only showing top 5 rows


In [9]:
# Rank offers inside each offer_type by total completions, most completed first.
# `offer_id` breaks ties so that two offers with the same total always get the
# same ranking; row_number() alone may order tied rows differently between runs.
popular_offers_window = Window.partitionBy("offer_type").orderBy(
    F.col("qtd_offer_completed").desc(),
    F.col("offer_id").asc(),
)

popular_offers = (
    df_freq_bayes.groupBy("offer_id", "offer_type")
    .agg(F.sum("qtd_offer_completed").alias("qtd_offer_completed"))
    # Offers that were never completed are not candidates; an offer_type whose
    # offers all total zero therefore disappears from the result.
    .filter("qtd_offer_completed > 0")
    .withColumn("ranking", F.row_number().over(popular_offers_window))
    .filter(F.col("ranking") <= TOP_N_OFFERS)
)

popular_offers.show(truncate=False)

+--------------------------------+----------+-------------------+-------+
|offer_id                        |offer_type|qtd_offer_completed|ranking|
+--------------------------------+----------+-------------------+-------+
|9b98b8c7a33c4b65b9aebfe6a799e6d9|bogo      |4354               |1      |
|f19421c1d4aa40978ebb69ca19b0e20d|bogo      |4296               |2      |
|fafdcd668e3743c1bb461111dcafc2a4|discount  |5317               |1      |
|2298d6c36e964ae4a3e7e9706d1fb8c2|discount  |5156               |2      |
+--------------------------------+----------+-------------------+-------+



In [10]:
# For every (client, offer_type) that has at least one offer with
# y_proba_class1 >= THRESH, attach all popular offers of that same type.
recs_popular_offers = (
    df_offer_score
    .filter(F.col("y_proba_class1") >= THRESH)
    .select("client_id", "offer_type")
    .distinct()
    .join(
        popular_offers.select("offer_id", "offer_type", "ranking"),
        on='offer_type',
        how="inner",
    )
    .orderBy('client_id')
)

recs_popular_offers.show(5, truncate=False)

+----------+--------------------------------+--------------------------------+-------+
|offer_type|client_id                       |offer_id                        |ranking|
+----------+--------------------------------+--------------------------------+-------+
|bogo      |0009655768c64bdeb2e877511632db8f|f19421c1d4aa40978ebb69ca19b0e20d|2      |
|bogo      |0009655768c64bdeb2e877511632db8f|9b98b8c7a33c4b65b9aebfe6a799e6d9|1      |
|discount  |0009655768c64bdeb2e877511632db8f|fafdcd668e3743c1bb461111dcafc2a4|1      |
|discount  |0009655768c64bdeb2e877511632db8f|2298d6c36e964ae4a3e7e9706d1fb8c2|2      |
|discount  |0020c2b971eb4e9188eac86d93036a77|fafdcd668e3743c1bb461111dcafc2a4|1      |
+----------+--------------------------------+--------------------------------+-------+
only showing top 5 rows


In [11]:
# Drop popular offers whose (client_id, offer_id) pair already exists in
# df_offer_score, then collect what is left into one array per client.
df_recommended_popular_offers = (
    recs_popular_offers
    .join(
        df_offer_score.select("client_id", "offer_id").distinct(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .groupBy("client_id")
    # Structs sort by "ranking" first, then by "offer_id".
    .agg(
        F.array_sort(F.collect_list(F.struct("ranking", "offer_id"))).alias("popular_offer_sorted")
    )
    .withColumn("recommended_popular_offers", F.expr("transform(popular_offer_sorted, x -> x.offer_id)"))
    .drop("popular_offer_sorted")
    .orderBy('client_id')
)

df_recommended_popular_offers.show(truncate=False)

+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|client_id                       |recommended_popular_offers                                                                                                              |
+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|0009655768c64bdeb2e877511632db8f|[9b98b8c7a33c4b65b9aebfe6a799e6d9, 2298d6c36e964ae4a3e7e9706d1fb8c2]                                                                    |
|0020c2b971eb4e9188eac86d93036a77|[9b98b8c7a33c4b65b9aebfe6a799e6d9, 2298d6c36e964ae4a3e7e9706d1fb8c2, f19421c1d4aa40978ebb69ca19b0e20d]                                  |
|0020ccbbb6d84e358d3414a3ff76cffd|[fafdcd668e3743c1bb461111dcafc2a4]                                                                        

### Gerando Recomendações Intra-Cluster

In [12]:
# Preview the first five rows of the clustering output, including the `cluster` column.
df_km_predictions.show(n=5)

+--------------------+--------------------+----------------+------------------+----------------+----------+-------------------------+-------------------------+--------+--------+--------+--------+---------------+-----------------+---------------+---------------+--------------------+--------------+---------------+------------+----------+------+--------------------+--------------------+-------+
|           client_id|            offer_id|offer_type_index|qtd_offer_received|qtd_offer_viewed|sum_reward|registered_duration_month|registered_duration_years|gender_M|gender_F|gender_O|gender_U|age_group_index|credit_card_limit|category_limite|offer_min_value|offer_discount_value|offer_duration|qtd_transaction|total_amount|avg_amount|target|            features|     features_scaled|cluster|
+--------------------+--------------------+----------------+------------------+----------------+----------+-------------------------+-------------------------+--------+--------+--------+--------+---------------

In [13]:
# Distinct (client_id, cluster) pairs found in df_km_predictions.
# NOTE(review): df_km_predictions also carries an offer_id column, so it appears to
# hold several rows per client. If `cluster` can differ between those rows, a client
# shows up here once per cluster — confirm that each client maps to exactly one cluster.
df_clusters = (
    df_km_predictions
    .select("client_id", "cluster")
    .distinct()
)
df_clusters.show(n=5)

+--------------------+-------+
|           client_id|cluster|
+--------------------+-------+
|04e39b3e8fc449cfb...|      4|
|0a3e6bcfc66445bab...|      1|
|0d24a49bc2ca4b388...|      2|
|0d74b166a5e54b269...|      1|
|0f457f3ccf254eb0a...|      1|
+--------------------+-------+
only showing top 5 rows


In [14]:
# Tag every distinct (client_id, offer_id) pair from df_freq_bayes with the client's cluster.
# Clients without a row in df_clusters are dropped by the inner join.
df_cluster_offers = (
    df_freq_bayes
    .select("client_id", "offer_id")
    .distinct()
    .join(df_clusters, on="client_id", how="inner")
)
df_cluster_offers.show(5, truncate=False)

+--------------------------------+--------------------------------+-------+
|client_id                       |offer_id                        |cluster|
+--------------------------------+--------------------------------+-------+
|0222d267445f4f078bc325224e471766|2906b810c7d4411798c6938adc9daaa5|2      |
|0251876076024444864473545adce065|2298d6c36e964ae4a3e7e9706d1fb8c2|1      |
|03575a43a3da4691998de01fff617f99|f19421c1d4aa40978ebb69ca19b0e20d|0      |
|036e4bedca2045afad50fda2d3b505ab|0b1e1539f2cc45b7b9fa7c272da2e1d7|2      |
|0485b8cb0bfc4d938895bbd6d8cbd7df|2906b810c7d4411798c6938adc9daaa5|3      |
+--------------------------------+--------------------------------+-------+
only showing top 5 rows


In [15]:
# Count the rows of df_cluster_offers per (cluster, offer_id) and expose the
# count under the name "qtd_clientes".
df_cluster_popular = (
    df_cluster_offers
    .groupBy("cluster", "offer_id")
    .count()
    .withColumnRenamed("count", "qtd_clientes")
)
df_cluster_popular.show(5, truncate=False)

+-------+--------------------------------+------------+
|cluster|offer_id                        |qtd_clientes|
+-------+--------------------------------+------------+
|1      |fafdcd668e3743c1bb461111dcafc2a4|1057        |
|3      |fafdcd668e3743c1bb461111dcafc2a4|782         |
|0      |2906b810c7d4411798c6938adc9daaa5|1360        |
|0      |f19421c1d4aa40978ebb69ca19b0e20d|1315        |
|1      |3f207df678b143eea3cee63160fa8bed|971         |
+-------+--------------------------------+------------+
only showing top 5 rows


In [16]:
# Rank a client's candidate offers by how many rows of the client's cluster carry them.
# `offer_id` and `cluster` are added as tie-breakers: counts are often very close,
# and row_number() may order equal counts differently between runs without a
# unique sort key.
cluster_rank_window = Window.partitionBy("client_id").orderBy(
    F.col("qtd_clientes").desc(),
    F.col("offer_id").asc(),
    F.col("cluster").asc(),
)

recs_cluster = (
    # Every offer seen in the client's cluster ...
    df_clusters.join(df_cluster_popular, on="cluster", how="inner")
    # ... minus the offers the client already has in df_freq_bayes.
    .join(
        df_freq_bayes.select("client_id", "offer_id").distinct(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .withColumn("ranking", F.row_number().over(cluster_rank_window))
    # The former `.distinct()` at this point was removed: `ranking` is unique within
    # a client, so no two rows can be identical and the call only added work.
    .filter(F.col("ranking") <= TOP_N_CLUSTER_RECOMMENDATIONS)
    .orderBy('client_id', "cluster", F.desc("qtd_clientes"))
)
recs_cluster.show(truncate=False)

+--------------------------------+--------------------------------+-------+------------+-------+
|client_id                       |offer_id                        |cluster|qtd_clientes|ranking|
+--------------------------------+--------------------------------+-------+------------+-------+
|0009655768c64bdeb2e877511632db8f|9b98b8c7a33c4b65b9aebfe6a799e6d9|2      |1873        |1      |
|0009655768c64bdeb2e877511632db8f|0b1e1539f2cc45b7b9fa7c272da2e1d7|2      |1871        |2      |
|0009655768c64bdeb2e877511632db8f|4d5c57ea9a6940dd891ad53e9dbe8da0|2      |1868        |3      |
|00116118485d4dfda04fdbaba9a87b5c|4d5c57ea9a6940dd891ad53e9dbe8da0|3      |813         |1      |
|00116118485d4dfda04fdbaba9a87b5c|2298d6c36e964ae4a3e7e9706d1fb8c2|3      |810         |2      |
|00116118485d4dfda04fdbaba9a87b5c|ae264e3637204a6fb9bb56bc8210ddfd|3      |808         |3      |
|0011e0d4e6b944f998e987f904e8c1e5|f19421c1d4aa40978ebb69ca19b0e20d|4      |1284        |1      |
|0011e0d4e6b944f998e987f904e8c

In [17]:
# Remove cluster candidates whose (client_id, offer_id) pair already appears in
# df_offer_score or in recs_popular_offers, then collect the remaining offer ids
# into one array per client, ordered by cluster ranking.
df_recommended_cluster_offers = (
    recs_cluster
    .join(
        df_offer_score.select("client_id", "offer_id").distinct(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .join(
        recs_popular_offers.select("client_id", "offer_id").distinct(),
        on=["client_id", "offer_id"],
        how="left_anti",
    )
    .groupBy("client_id")
    .agg(
        F.array_sort(F.collect_list(F.struct("ranking", "offer_id"))).alias("cluster_offer_sorted")
    )
    .withColumn("recommended_cluster_offers", F.expr("transform(cluster_offer_sorted, x -> x.offer_id)"))
    .drop("cluster_offer_sorted")
    .orderBy('client_id')
)
df_recommended_cluster_offers.show(truncate=False)

+--------------------------------+------------------------------------------------------------------------------------------------------+
|client_id                       |recommended_cluster_offers                                                                            |
+--------------------------------+------------------------------------------------------------------------------------------------------+
|0009655768c64bdeb2e877511632db8f|[0b1e1539f2cc45b7b9fa7c272da2e1d7, 4d5c57ea9a6940dd891ad53e9dbe8da0]                                  |
|00116118485d4dfda04fdbaba9a87b5c|[4d5c57ea9a6940dd891ad53e9dbe8da0, 2298d6c36e964ae4a3e7e9706d1fb8c2, ae264e3637204a6fb9bb56bc8210ddfd]|
|0011e0d4e6b944f998e987f904e8c1e5|[f19421c1d4aa40978ebb69ca19b0e20d, 2906b810c7d4411798c6938adc9daaa5, ae264e3637204a6fb9bb56bc8210ddfd]|
|0020c2b971eb4e9188eac86d93036a77|[0b1e1539f2cc45b7b9fa7c272da2e1d7, 2906b810c7d4411798c6938adc9daaa5]                                  |
|0020ccbbb6d84e358d3414a3ff76cffd|

### Juntando recomendações vindas dos Scores com recomendações de popularidade e recomendações Intra-Cluster

In [18]:
# One row per client from df_recommended_offers, with the popular and cluster
# recommendation arrays attached as extra columns. Left joins keep clients that
# have no popular or cluster recommendations; their missing arrays become empty
# string arrays instead of null.
df_allocation = (
    df_recommended_offers
    .join(df_recommended_popular_offers, on='client_id', how='left')
    .join(df_recommended_cluster_offers, on='client_id', how='left')
    .withColumn(
        "recommended_popular_offers",
        F.coalesce(
            F.col("recommended_popular_offers"),
            F.array().cast(T.ArrayType(T.StringType())),
        ),
    )
    .withColumn(
        "recommended_cluster_offers",
        F.coalesce(
            F.col("recommended_cluster_offers"),
            F.array().cast(T.ArrayType(T.StringType())),
        ),
    )
    .orderBy('client_id')
)
df_allocation.show()

+--------------------+--------------------+--------------------------+--------------------------+
|           client_id|  recommended_offers|recommended_popular_offers|recommended_cluster_offers|
+--------------------+--------------------+--------------------------+--------------------------+
|0009655768c64bdeb...|[f19421c1d4aa4097...|      [9b98b8c7a33c4b65...|      [0b1e1539f2cc45b7...|
|00116118485d4dfda...|[f19421c1d4aa4097...|                        []|      [4d5c57ea9a6940dd...|
|0011e0d4e6b944f99...|[9b98b8c7a33c4b65...|                        []|      [f19421c1d4aa4097...|
|0020c2b971eb4e918...|[fafdcd668e3743c1...|      [9b98b8c7a33c4b65...|      [0b1e1539f2cc45b7...|
|0020ccbbb6d84e358...|[2298d6c36e964ae4...|      [fafdcd668e3743c1...|      [0b1e1539f2cc45b7...|
|003d66b6608740288...|[fafdcd668e3743c1...|                        []|      [9b98b8c7a33c4b65...|
|00426fe3ffde4c6b9...|[fafdcd668e3743c1...|                        []|      [3f207df678b143ee...|
|004b041fbfe448599..

In [188]:
# Persist the allocation table as Parquet, replacing any previous output at this path.
(
    df_allocation.write
    .format("parquet")
    .mode("overwrite")
    .save("../data/refined/allocation")
)