# Ranking System

In this notebook we create a system to rank the merchants within each segment. 

First, create a Spark session and read in the engineered data.

In [2]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import col
from pyspark.sql.functions import expr

In [2]:
# Build (or reuse) the Spark session used by every cell below.
# Each option is set exactly once; the session time zone is pinned to UTC.
spark = (
    SparkSession.builder.appName("ranking_system")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '3g')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.instances', '2')
    .config('spark.executor.cores', '2')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/03 17:02:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the engineered ranking data from parquet.
ranking_data_path = "../../../data/ranking_data.parquet"
ranking_df = spark.read.parquet(ranking_data_path)

                                                                                

In [4]:
# Preview the loaded data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out here).
ranking_df.show(n=20, truncate=True)

23/10/03 17:02:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+------------------+---------------------------------+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|number_of_orders|average_cost_o

Standardise the numerical data with min-max scaling so that it can be used in the weighted ranking system.

In [5]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline

In [6]:
# Columns to be min-max scaled before they enter the weighted ranking
numerical_columns = ['take_rate', 'average_merchant_fraud_probability','number_of_unique_consumers', 
                     'average_consumer_fraud_probability','number_of_repeat_consumers',
                     'average_repeat_transactions_per_consumer','number_of_orders','average_cost_of_order',
                     'average_spend_per_consumer','average_monthly_diff_consumers','consumer_diff_over_period',
                     'average_growth_consumers', 'merchant_revenue_rounded','transcation_period_months',
                     'number_of_postcodes','avg_total_weekly_personal_income','avg_total_weekly_fam_income',
                     'avg_median_age','avg_household_size','postcode_reach','avg_num_of_consumers_per_postcode', 
                     'bnpl_maximum_gain', 'predicted_num_of_unique_customers']


# Assemble the numerical features into a single vector column
assembler = VectorAssembler(inputCols=numerical_columns, outputCol='features')

# Min-max scale the assembled vector (MinMaxScaler defaults are used)
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')

# UDF that pulls element `i` out of the scaled vector as a plain double
extract_feature_udf = udf(lambda vector, i: float(vector[i]), DoubleType())

# Chain the assembling and scaling stages
pipeline = Pipeline(stages=[assembler, scaler])

# Fit on the ranking data and add the 'features' / 'scaled_features' columns
model = pipeline.fit(ranking_df)
scaled_vectors_df = model.transform(ranking_df)

# Position of every numerical column inside the scaled vector
feature_positions = {name: position for position, name in enumerate(numerical_columns)}

# Replace each numerical column with its scaled value in ONE projection.
# - The original column order is kept, and the helper vector columns are
#   dropped simply by not selecting them.
# - A comprehension is used instead of `for i, col in ...` so the `col`
#   function imported from pyspark.sql.functions is not overwritten.
# - A single select also avoids stacking one projection per column onto the
#   query plan, as repeated withColumn calls would.
scaled_df = scaled_vectors_df.select(
    *[
        extract_feature_udf('scaled_features', lit(feature_positions[name])).alias(name)
        if name in feature_positions
        else scaled_vectors_df[name]
        for name in ranking_df.columns
    ]
)

# Show the first few rows of the DataFrame with MinMax scaled values
scaled_df.show(truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+------------+-------------------------------------+-------------+--------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+---------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+--------------------+--------------------------------+---------------------------+-------------------+-------------------+--------------------+---------------------------------+----------------------+---------------------+---------------------------------+
|merchant_abn|name                                 |revenue_level|take_rate           |average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transact

                                                                                

In [7]:
# List every column of the scaled frame, one name per line.
column_names = scaled_df.columns
print(*column_names, sep="\n")

merchant_abn
name
revenue_level
take_rate
average_merchant_fraud_probability
number_of_unique_consumers
average_consumer_fraud_probability
number_of_repeat_consumers
average_repeat_transactions_per_consumer
number_of_orders
average_cost_of_order
average_spend_per_consumer
average_monthly_diff_consumers
consumer_diff_over_period
average_growth_consumers
merchant_revenue_rounded
first_recorded_transaction
last_recorded_transaction
transcation_period_months
number_of_postcodes
avg_total_weekly_personal_income
avg_total_weekly_fam_income
avg_median_age
avg_household_size
postcode_reach
avg_num_of_consumers_per_postcode
segment
bnpl_maximum_gain
predicted_num_of_unique_customers


In [8]:
# Available weight levels, ordered from penalising to most rewarding.
WEIGHT = [
    -0.1,  # attributes that have a negative impact on merchant
    0,     # no impact
    0.01,  # min positive impact
    0.02,
    0.05,
    0.1,
    0.2,   # max positive impact
]

# Readable aliases for the weight levels above.
NEGATIVE, NO_IMPACT, MINIMAL, LOW, MEDIUM, HIGH, MAXIMUM = WEIGHT

# Weight applied to each scaled attribute.
# The key order is significant: it fixes the order of the terms in the
# generated SQL expression below.
attribute_weights = {
    'take_rate': NO_IMPACT,
    'average_merchant_fraud_probability': NEGATIVE,
    'number_of_unique_consumers': MEDIUM,
    'average_consumer_fraud_probability': NEGATIVE,
    'number_of_repeat_consumers': LOW,
    'average_repeat_transactions_per_consumer': LOW,
    'number_of_orders': MEDIUM,
    'average_cost_of_order': HIGH,
    'average_spend_per_consumer': HIGH,
    'average_monthly_diff_consumers': NO_IMPACT,
    'consumer_diff_over_period': NO_IMPACT,
    'average_growth_consumers': HIGH,
    'merchant_revenue_rounded': NO_IMPACT,
    'transcation_period_months': NO_IMPACT,
    'number_of_postcodes': NO_IMPACT,
    'avg_total_weekly_personal_income': MINIMAL,
    'avg_total_weekly_fam_income': MINIMAL,
    'avg_median_age': MINIMAL,
    'avg_household_size': MINIMAL,
    'postcode_reach': MINIMAL,
    'avg_num_of_consumers_per_postcode': MINIMAL,
    'bnpl_maximum_gain': MAXIMUM,
    'predicted_num_of_unique_customers': HIGH,
}

# One "<column> * <weight>" term per attribute, joined into a single SQL sum.
weighted_terms = [f"{name} * {weight}" for name, weight in attribute_weights.items()]
weighted_sum_expr = " + ".join(weighted_terms)

# Keep every existing column and append the weighted sum as a new one.
scaled_df_with_weighted_sum = scaled_df.selectExpr("*", f"({weighted_sum_expr}) as weighted_sum")

# Preview the frame with the new weighted_sum column.
scaled_df_with_weighted_sum.show()

[Stage 6:>                                                          (0 + 1) / 1]

+------------+--------------------+-------------+--------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+--------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+--------------------+--------------------------------+---------------------------+-------------------+-------------------+--------------------+---------------------------------+--------------------+--------------------+---------------------------------+--------------------+
|merchant_abn|                name|revenue_level|           take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer

                                                                                

In [9]:
# Order all merchants by weighted_sum, highest first, ignoring segments.
ranked_df = scaled_df_with_weighted_sum.sort("weighted_sum", ascending=False)

# Keep only the 100 best-scoring merchants.
top_100_entries = ranked_df.limit(100)

# Preview the result; show() with no arguments prints only the first 20 rows.
top_100_entries.show()


                                                                                

+------------+--------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+-------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+-------------------+-------------------+------------------+---------------------------------+--------------------+-------------------+---------------------------------+-------------------+
|merchant_abn|                name|revenue_level|          take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|   numb

In [10]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Rank merchants inside each segment by weighted_sum, highest first.
weighted_sum_descending = F.col("weighted_sum").desc()
window_spec = Window.partitionBy("segment").orderBy(weighted_sum_descending)
segment_rank = F.rank().over(window_spec)

ranked_df = scaled_df_with_weighted_sum.withColumn("rank", segment_rank)

# Keep ranks 1-10 per segment and only the columns needed for reporting.
# rank() gives tied scores the same rank, so a segment can return more than
# ten rows when there is a tie within the top ten.
top_10_entries = (
    ranked_df
    .where(F.col("rank") <= 10)
    .select("rank", "merchant_abn", "name", "weighted_sum", "segment")
)

In [11]:
# Collect the distinct segment labels to the driver as a plain Python list.
segment_rows = scaled_df_with_weighted_sum.select("segment").distinct().collect()
distinct_segments = [row["segment"] for row in segment_rows]

# Print the top-10 table of every segment, one segment at a time.
for segment in distinct_segments:
    print(f"Top 10 entries for segment '{segment}':")
    top_10_entries.where(F.col("segment") == segment).show()

                                                                                

Top 10 entries for segment 'garden_and_furnishings':


                                                                                

+----+------------+--------------------+-------------------+--------------------+
|rank|merchant_abn|                name|       weighted_sum|             segment|
+----+------------+--------------------+-------------------+--------------------+
|   1| 89726005175| Est Nunc Consulting|0.41734996963338084|garden_and_furnis...|
|   2| 49891706470|Non Vestibulum In...|0.38574265868558677|garden_and_furnis...|
|   3| 43186523025|Lorem Ipsum Sodal...|0.37805848421291766|garden_and_furnis...|
|   4| 24852446429|      Erat Vitae LLP| 0.3422026991901862|garden_and_furnis...|
|   5| 64203420245|  Pede Nonummy Corp.| 0.3273069468813284|garden_and_furnis...|
|   6| 76767266140|Phasellus At Limited| 0.3030146875121147|garden_and_furnis...|
|   7| 79827781481|     Amet Risus Inc.|0.29443017954641937|garden_and_furnis...|
|   8| 38090089066|Interdum Feugiat ...| 0.2804139895579167|garden_and_furnis...|
|   9| 49212265466|      Auctor Company| 0.2705260731171425|garden_and_furnis...|
|  10| 387000389

                                                                                

+----+------------+--------------------+-------------------+--------------------+
|rank|merchant_abn|                name|       weighted_sum|             segment|
+----+------------+--------------------+-------------------+--------------------+
|   1| 86578477987|   Leo In Consulting| 0.4494629816199737|antiques_and_jewe...|
|   2| 49322182190|Gravida Mauris In...| 0.3455508864051446|antiques_and_jewe...|
|   3| 68559320474|Aliquam Auctor As...|  0.260028974096704|antiques_and_jewe...|
|   4| 23338656015|         Iaculis LLC|0.23129968810980928|antiques_and_jewe...|
|   5| 98166254020|Magna Sed Industries|0.22356503893007704|antiques_and_jewe...|
|   6| 29616684420|       Tellus Id LLC|0.22168235065717795|antiques_and_jewe...|
|   7| 81761494572|Nulla Facilisis I...|0.20568150700631418|antiques_and_jewe...|
|   8| 71528203369|Ipsum Primis Asso...| 0.1939285220562131|antiques_and_jewe...|
|   9| 62224020443|Hendrerit A Corpo...|0.18613222291968762|antiques_and_jewe...|
|  10| 766261198

                                                                                

+----+------------+--------------------+-------------------+--------------------+
|rank|merchant_abn|                name|       weighted_sum|             segment|
+----+------------+--------------------+-------------------+--------------------+
|   1| 48534649627|Dignissim Maecena...| 0.3876449874499129|specialized_services|
|   2| 96680767841|      Ornare Limited| 0.3226991349049874|specialized_services|
|   3| 46804135891|Suspendisse Dui C...| 0.3210693579419936|specialized_services|
|   4| 13514558491|   Magna Praesent PC|0.23307562410946572|specialized_services|
|   5| 31385641294|    Semper Auctor PC|0.22347323300569882|specialized_services|
|   6| 22033359776|Suspendisse Non L...|0.21208842557297042|specialized_services|
|   7| 21359184622|         Sit Amet PC|0.19677221076169504|specialized_services|
|   8| 11566786699|Euismod Et Institute|0.18868492527953906|specialized_services|
|   9| 41251795489|Ultricies Sem Lim...|0.18371224612992432|specialized_services|
|  10| 955747568

                                                                                

+----+------------+--------------------+-------------------+--------------------+
|rank|merchant_abn|                name|       weighted_sum|             segment|
+----+------------+--------------------+-------------------+--------------------+
|   1| 21439773999|Mauris Non Institute| 0.4146295864445008|tech_and_electronics|
|   2| 72472909171|   Nullam Consulting| 0.3769932535327337|tech_and_electronics|
|   3| 49505931725|Suspendisse Ac As...|0.32118604838700543|tech_and_electronics|
|   4| 68216911708|Placerat Eget Ven...| 0.3172879684522141|tech_and_electronics|
|   5| 35909341340|Arcu Sed Eu Incor...|0.30153220366344635|tech_and_electronics|
|   6| 67400260923|         Eleifend PC|  0.269355313430997|tech_and_electronics|
|   7| 94690988633|     Eu Placerat LLC| 0.2684880384466785|tech_and_electronics|
|   8| 58454491168|  Diam At Foundation| 0.2639522866247447|tech_and_electronics|
|   9| 80518954462|Neque Sed Dictum ...|0.24310657921255988|tech_and_electronics|
|  10| 454334764

[Stage 25:>                                                         (0 + 1) / 1]

+----+------------+--------------------+-------------------+------------------+
|rank|merchant_abn|                name|       weighted_sum|           segment|
+----+------------+--------------------+-------------------+------------------+
|   1| 45629217853|    Lacus Consulting|0.43543362270533703|retail_and_novelty|
|   2| 32361057556|Orci In Consequat...|0.42110736102300905|retail_and_novelty|
|   3| 64403598239|Lobortis Ultrices...| 0.4104068037906187|retail_and_novelty|
|   4| 94493496784|Dictum Phasellus ...|0.38791160269829417|retail_and_novelty|
|   5| 79417999332|Phasellus At Company| 0.3733055706048901|retail_and_novelty|
|   6| 60956456424|Ultricies Digniss...|0.34956107348698906|retail_and_novelty|
|   7| 63290521567|Vehicula Pellente...|0.33782786101852486|retail_and_novelty|
|   8| 98973094975|   Ornare Fusce Inc.|   0.32263932964974|retail_and_novelty|
|   9| 80324045558|Ipsum Dolor Sit C...|0.28772527407167814|retail_and_novelty|
|  10| 81219314324|    Faucibus Leo Ltd|

                                                                                