# Ranking System

In this notebook we create a system to rank the merchants within each segment. 

First, create a Spark session and read in the engineered data.

In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import col
from pyspark.sql.functions import expr

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (
    SparkSession.builder.appName("ranking_system")
    # Render DataFrames eagerly when they are the last expression in a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone is set once; the original configured this key twice.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource sizing for the driver and the executors.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/02 12:15:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the engineered ranking data from parquet (path is relative to this notebook's directory).
ranking_df = spark.read.parquet("../../../data/ranking_data.parquet")

                                                                                

In [4]:
# Preview the loaded data; show() with no arguments prints the first 20 rows.
ranking_df.show()

23/10/02 12:15:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+------------+--------------------+-------------+---------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+----------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+---------------------------------+
|merchant_abn|                name|revenue_level|take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|consumer_retainability|number_of_orders|average

                                                                                

Standardise the numerical features with min-max scaling so that they are on a comparable scale before being combined in the weighted ranking system.

In [5]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline

In [6]:
# Columns to be min-max scaled before they are combined into a weighted score.
# NOTE(review): 'postcode_reach' is given a non-zero weight later in the notebook
# but is not scaled here -- presumably it is already a proportion in [0, 1]; confirm.
numerical_columns = ['take_rate', 'number_of_unique_consumers', 'number_of_repeat_consumers', 
                     'average_repeat_transactions_per_consumer', 'consumer_retainability', 
                     'number_of_orders', 'average_cost_of_order', 'average_spend_per_consumer', 
                     'average_monthly_diff_consumers','consumer_diff_over_period', 'average_growth_consumers', 
                     'merchant_revenue_rounded', 'transcation_period_months', 
                     'number_of_postcodes', 'avg_total_weekly_personal_income', 
                     'avg_total_weekly_fam_income', 'avg_median_age', 'avg_household_size', 
                     'avg_num_of_consumers_per_postcode', 'predicted_num_of_unique_customers']

# Assemble the numerical columns into a single vector column, then scale that
# vector (MinMaxScaler's default output range is [0, 1]).
assembler = VectorAssembler(inputCols=numerical_columns, outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')
pipeline = Pipeline(stages=[assembler, scaler])

# UDF that pulls element `i` out of a vector column as a plain double.
extract_feature_udf = udf(lambda vector, i: float(vector[i]), DoubleType())

# Fit the scaler on the full data set and apply it to the same data.
model = pipeline.fit(ranking_df)
scaled_df = model.transform(ranking_df)

# Overwrite each original column with its scaled value. The position of a name
# in `numerical_columns` is its index inside the assembled vector.
# The loop variable is deliberately NOT called `col`: that name is imported from
# pyspark.sql.functions at the top of the notebook and would be overwritten.
for feature_index, feature_name in enumerate(numerical_columns):
    scaled_df = scaled_df.withColumn(
        feature_name, extract_feature_udf('scaled_features', lit(feature_index))
    )

# The helper vector columns are no longer needed once the values are unpacked.
scaled_df = scaled_df.drop('scaled_features', 'features')

# Preview the scaled data without truncating cell contents.
scaled_df.show(truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+------------+-----------------------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+---------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+--------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+----------------------+---------------------------------+
|merchant_abn|name                               |revenue_level|take_rate          |average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_pe

                                                                                

In [7]:
# Print every column of the scaled frame, one name per line, as a quick
# reference for choosing attribute weights.
column_names = scaled_df.columns
if column_names:
    print("\n".join(column_names))

merchant_abn
name
revenue_level
take_rate
average_merchant_fraud_probability
number_of_unique_consumers
average_consumer_fraud_probability
number_of_repeat_consumers
average_repeat_transactions_per_consumer
consumer_retainability
number_of_orders
average_cost_of_order
average_spend_per_consumer
average_monthly_diff_consumers
consumer_diff_over_period
average_growth_consumers
merchant_revenue_rounded
first_recorded_transaction
last_recorded_transaction
transcation_period_months
number_of_postcodes
avg_total_weekly_personal_income
avg_total_weekly_fam_income
avg_median_age
avg_household_size
postcode_reach
avg_num_of_consumers_per_postcode
segment
predicted_num_of_unique_customers


In [8]:
# Attributes that take part in the score expression, in the order they are summed.
score_attributes = [
    'take_rate',
    'average_merchant_fraud_probability',
    'number_of_unique_consumers',
    'average_consumer_fraud_probability',
    'number_of_repeat_consumers',
    'average_repeat_transactions_per_consumer',
    'consumer_retainability',
    'number_of_orders',
    'average_cost_of_order',
    'average_spend_per_consumer',
    'average_monthly_diff_consumers',
    'consumer_diff_over_period',
    'average_growth_consumers',
    'merchant_revenue_rounded',
    'transcation_period_months',
    'number_of_postcodes',
    'avg_total_weekly_personal_income',
    'avg_total_weekly_fam_income',
    'avg_median_age',
    'avg_household_size',
    'postcode_reach',
    'avg_num_of_consumers_per_postcode',
    'predicted_num_of_unique_customers',
]

# Attributes listed for completeness but given a weight of 0 (no contribution).
zero_weight_attributes = {
    'average_merchant_fraud_probability',
    'average_consumer_fraud_probability',
}

# Derive the number of contributing features instead of hard-coding it, so the
# uniform weight stays correct when attributes are added or removed.
NUM_FEATURES = len(score_attributes) - len(zero_weight_attributes)
UNI_WEIGHT = 1 / NUM_FEATURES

# Weight per attribute: uniform for contributing attributes, 0 for the rest.
attribute_weights = {
    attribute: (0 if attribute in zero_weight_attributes else UNI_WEIGHT)
    for attribute in score_attributes
}

# Sanity checks: weights form a convex combination and every column exists.
assert abs(sum(attribute_weights.values()) - 1) < 1e-9, "attribute weights must sum to 1"
missing_columns = [name for name in attribute_weights if name not in scaled_df.columns]
if missing_columns:
    raise ValueError(f"Columns missing from scaled_df: {missing_columns}")

# Build the SQL expression for the weighted sum. Column names are backtick-quoted
# so that names containing special characters cannot break the expression.
weighted_sum_expr = " + ".join(
    f"`{column_name}` * {weight}" for column_name, weight in attribute_weights.items()
)

# Append the score as a new `weighted_sum` column, keeping all existing columns.
scaled_df_with_weighted_sum = scaled_df.selectExpr("*", f"({weighted_sum_expr}) as weighted_sum")

# Preview the frame with the new weighted sum column.
scaled_df_with_weighted_sum.show()

[Stage 6:>                                                          (0 + 1) / 1]

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+--------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+--------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+---------------------------------+-------------------+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|co

                                                                                

In [9]:
# Order all merchants (across every segment) by "weighted_sum", highest first.
ranked_df = scaled_df_with_weighted_sum.orderBy("weighted_sum", ascending=False)

# Keep only the 100 highest-scoring merchants.
top_100_entries = ranked_df.limit(100)

# Display all 100 rows; show() without an argument would print only the first 20.
top_100_entries.show(100)


                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+--------------------+---------------------------------+------------------+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|consume

In [10]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Rank merchants inside each segment, highest weighted_sum first.
window_spec = Window.partitionBy("segment").orderBy(F.desc("weighted_sum"))

# rank() gives tied scores the same rank, so a segment can contribute more than
# ten rows if there is a tie at or above rank 10.
ranked_df = scaled_df_with_weighted_sum.withColumn("rank", F.rank().over(window_spec))
top_10_entries = ranked_df.where(F.col("rank") <= 10)

In [12]:
# Collect the distinct segment labels to the driver, then display the
# top-ranked merchants of each segment in turn.
distinct_segments = [
    row[0]
    for row in scaled_df_with_weighted_sum.select("segment").distinct().collect()
]

for segment in distinct_segments:
    print(f"Top 10 entries for segment '{segment}':")
    top_10_entries.where(F.col("segment") == segment).show()

                                                                                

Top 10 entries for segment 'garden_and_furnishings':


                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+--------------------+---------------------------------+------------------+----+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|co

                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+--------------------+---------------------------------+-------------------+----+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|c

                                                                                

+------------+--------------------+-------------+------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+--------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+--------------------+---------------------------------+-------------------+----+
|merchant_abn|                name|revenue_level|         take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|co

                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+--------------------+---------------------------------+-------------------+----+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|c

[Stage 28:>                                                         (0 + 1) / 1]

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+------------------+---------------------------------+------------------+---------------------------------+------------------+----+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|cons

                                                                                