# Ranking System

In this notebook we create a system to rank the merchants within each segment. 

First, create a Spark session and read in the engineered data.

In [1]:
from pyspark.sql import SparkSession, functions as f

In [2]:
# Build (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder.appName("ranking_system")
    # Render DataFrames eagerly when they are the last expression in a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone; set once (it was previously configured twice).
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource settings for the driver and executors.
    .config('spark.driver.memory', '3g')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.instances', '2')
    .config('spark.executor.cores', '2')
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/09/30 10:47:07 WARN Utils: Your hostname, LAPTOP-RELH58H1 resolves to a loopback address: 127.0.1.1; using 172.19.22.4 instead (on interface eth0)
23/09/30 10:47:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/30 10:47:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Location of the engineered ranking dataset, relative to this notebook.
RANKING_DATA_PATH = "../../../data/ranking_data.parquet"

# Load the engineered merchant data as a Spark DataFrame.
ranking_df = spark.read.parquet(RANKING_DATA_PATH)

                                                                                

In [4]:
# Preview the loaded data: the first 20 rows, with long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
ranking_df.show(n=20, truncate=True)

23/09/30 10:47:23 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------+--------------------+-------------+---------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+----------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+---------------------------------+
|merchant_abn|                name|revenue_level|take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|consumer_retainability|number_of_orders|average

Min-max scale the numerical data so that it can be used in the weighted ranking system.

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType

In [10]:
# Columns to be min-max scaled; every other column of ranking_df is carried
# through unchanged.
# NOTE(review): 'transcation_period_months' keeps its existing spelling because
# the string has to match the column name in ranking_df.
numerical_columns = ['take_rate', 'number_of_unique_consumers', 'number_of_repeat_consumers',
                     'average_repeat_transactions_per_consumer', 'consumer_retainability',
                     'number_of_orders', 'average_cost_of_order', 'average_spend_per_consumer',
                     'average_monthly_diff_consumers', 'consumer_diff_over_period', 'average_growth_consumers',
                     'merchant_revenue_rounded', 'transcation_period_months',
                     'number_of_postcodes', 'avg_total_weekly_personal_income',
                     'avg_total_weekly_fam_income', 'avg_median_age', 'avg_household_size',
                     'avg_num_of_consumers_per_postcode', 'predicted_num_of_unique_customers']

# Stage 1: pack the numerical columns into a single vector column.
assembler = VectorAssembler(inputCols=numerical_columns, outputCol='features')

# Stage 2: rescale every component of that vector with MinMaxScaler
# (constructed with its default settings).
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features')

# UDF that extracts element i from a vector column. It is no longer used in
# this cell (vector_to_array below replaces it) but stays defined in case a
# later cell still refers to it.
extract_feature_udf = udf(lambda vector, i: float(vector[i]), DoubleType())

# Chain assembling and scaling so both stages are fitted and applied together.
pipeline = Pipeline(stages=[assembler, scaler])

# Fit the scaler on the full dataset and apply it to the same data.
model = pipeline.fit(ranking_df)
scaled_vectors_df = model.transform(ranking_df)

# Turn the scaled vector into a native array column once, instead of calling a
# Python UDF per column per row.
scaled_values = vector_to_array(scaled_vectors_df['scaled_features'])

# Position of each numerical column inside the assembled vector.
scaled_position = {name: index for index, name in enumerate(numerical_columns)}

# Rebuild the frame in a single projection: numerical columns are replaced by
# their scaled value, all other columns pass through, and the original column
# order is kept. Selecting only ranking_df's columns also leaves out the
# helper 'features' and 'scaled_features' columns.
scaled_df = scaled_vectors_df.select(
    *[
        scaled_values[scaled_position[name]].alias(name)
        if name in scaled_position
        else scaled_vectors_df[name]
        for name in ranking_df.columns
    ]
)

# Show the first few rows of the DataFrame with MinMax scaled values
scaled_df.show(truncate=False)

[Stage 14:>                                                         (0 + 1) / 1]

+------------+-----------------------------------+-------------+-------------------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+---------------------+---------------------+--------------------------+------------------------------+-------------------------+------------------------+------------------------+--------------------------+-------------------------+-------------------------+--------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+----------------------+---------------------------------+
|merchant_abn|name                               |revenue_level|take_rate          |average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_pe

                                                                                