In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, countDistinct, count
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
import pandas as pd

In [2]:
spark = SparkSession.builder.appName('loan_recommendation_').config("spark.driver.memory", "15g").getOrCreate()

In [4]:
loans_df = spark.read.csv("Loan recommendation/Loan_recommedation_demo_version.csv",inferSchema = True, header=True)
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: double (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: double (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [5]:
loans_df.show(vertical=True)

-RECORD 0------------------------------------------
 Contact__c                 | 0032x000003RvgkAAC   
 Min_IT_Loan_ID__c          | 5145623              
 Opp_Number__c              | 4369500              
 Id                         | 0062x00000D178JAAR   
 AccountID                  | 0012x000004VG7MAAW   
 Number_Of_Loans_Granted__c | 7                    
 Num_Of_Loans_Paid__c       | 4.0                  
 Purpose_of_Loan__c         | Relocation Expenses  
 Total_Repayments__c        | 2                    
 Amount                     | 200                  
 Term_in_Weeks__c           | 6.857                
 Payment_Frequency__c       | Monthly              
 StageName                  | Loan Paid            
 userId                     | 1641                 
 loanId                     | 35                   
 count                      | 1.0                  
 LoanIdFormat               | 1                    
-RECORD 1------------------------------------------
 Contact__c 

In [9]:
loans_df.orderBy("userId", "loanId").select(["userId","loanId","count"]).show(20)

+------+------+-----+
|userId|loanId|count|
+------+------+-----+
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
|     1|   757|  5.0|
+------+------+-----+
only showing top 20 rows



In [11]:
# Count the number of unique items
num_unique_items = loans_df.select('loanId').distinct().count()
print(f"Number of unique loans: {num_unique_items}")

# Count the number of unique users
num_unique_users = loans_df.select('userId').distinct().count()
print(f"Number of unique users: {num_unique_users}")

Number of unique items: 4594
Number of unique users: 2880


In [None]:
# # For example, 30% of items will be masked
# percent_items_to_mask = 0.3 
# # Determine the number of items to mask for each user
# df_rec_final = df_rec_filtered.withColumn("num_items_to_mask", (col("num_items") * percent_items_to_mask).cast("int"))
# # Masks items for each user
# df_rec_final = df_rec_final.withColumn("item_rank", rank().over(user_window))

# # Create a StringIndexer model to index the user ID column
# indexer_user = StringIndexer(inputCol='userId', outputCol='userIndex').setHandleInvalid("keep")
# indexer_item = StringIndexer(inputCol='itemId', outputCol='itemIndex').setHandleInvalid("keep")

# # Fit the indexer model to the data and transform the DataFrame
# df_rec_final = indexer_user.fit(df_rec_final).transform(df_rec_final)
# df_rec_final = indexer_item.fit(df_rec_final).transform(df_rec_final)

# # Convert the userIndex column to integer type
# df_rec_final = df_rec_final.withColumn('userIndex', df_rec_final['userIndex'].cast('integer'))\
#                .withColumn('itemIndex', df_rec_final['itemIndex'].cast('integer'))

# train_df_rec = df_rec_final.filter(col("item_rank") > col("num_items_to_mask"))
# test_df_rec = df_rec_final.filter(col("item_rank") <= col("num_items_to_mask"))

In [16]:
training_df, validation_df = loans_df.randomSplit([.7,.3])

In [18]:
# Configure the ALS model
als = ALS(userCol='userId', itemCol='loanId', ratingCol='count',
          coldStartStrategy='drop', nonnegative=True)

param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [1, 20, 30])\
             .addGrid(als.maxIter, [20])\
             .addGrid(als.regParam, [.05, .15])\
             .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='count', predictionCol='prediction')

cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3)

In [19]:
model = cv.fit(training_df)

best_model = model.bestModel
print('rank: ', best_model.rank)
print('MaxIter: ', best_model._java_obj.parent().getMaxIter())
print('RegParam: ', best_model._java_obj.parent().getRegParam())

rank:  30
MaxIter:  20
RegParam:  0.05


In [21]:
# Train the model using the training data
model = als.fit(training_df)

# Generate predictions on the test data
predictions = best_model.transform(validation_df)
predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 5 THEN 5 ELSE prediction END"))

evaluator = RegressionEvaluator(metricName='rmse', labelCol='count', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 16.734374641288362


In [28]:
predictions.columns

['Contact__c',
 'Min_IT_Loan_ID__c',
 'Opp_Number__c',
 'Id',
 'AccountID',
 'Number_Of_Loans_Granted__c',
 'Num_Of_Loans_Paid__c',
 'Purpose_of_Loan__c',
 'Total_Repayments__c',
 'Amount',
 'Term_in_Weeks__c',
 'Payment_Frequency__c',
 'StageName',
 'userId',
 'loanId',
 'count',
 'LoanIdFormat',
 'prediction']

In [37]:
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql.functions import col, collect_list

# Convert the predictions DataFrame to include all predictions per user
# Generate top-k recommendations for each user
userRecs = best_model.recommendForAllUsers(5)  # Top-100 recommendations for each user

# Prepare the input for RankingMetrics
user_ground_truth = validation_df.groupby('userId').agg(collect_list('loanId').alias('ground_truth_items'))
user_train_items = training_df.groupby('userId').agg(collect_list('loanId').alias('train_items'))

# Join the recommendations and ground truth data on the user ID
user_eval = userRecs.join(user_ground_truth, on='userId').join(user_train_items, on='userId') \
    .select('userId', 'recommendations.loanId', 'ground_truth_items', 'train_items', 'recommendations.rating')

In [38]:
user_eval.show()

+------+--------------------+--------------------+--------------------+--------------------+
|userId|              loanId|  ground_truth_items|         train_items|              rating|
+------+--------------------+--------------------+--------------------+--------------------+
|     1|[2340, 752, 2539,...|[757, 757, 757, 7...|[757, 757, 757, 7...|[34.93072, 33.803...|
|     3|[196, 999, 550, 8...|[2521, 2521, 2521...|[2521, 2521, 2521...|[30.76846, 29.876...|
|    12|[55, 50, 49, 48, 47]|[3460, 3460, 3460...|[3460, 3460, 3460...|[0.0, 0.0, 0.0, 0...|
|    13|[2200, 3057, 3752...|[110, 110, 110, 1...|[110, 110, 110, 1...|[24.66851, 23.622...|
|    16|[1888, 96, 4152, ...|[628, 628, 628, 6...|[628, 628, 628, 6...|[29.585476, 29.54...|
|    20|[96, 4152, 2378, ...|[3164, 3164, 3164...|[3164, 3164, 3164...|[27.812405, 26.74...|
|    22|[196, 999, 550, 8...|[3651, 3651, 3651...|[3651, 3651, 3651...|[32.11654, 31.185...|
|    26|[3649, 2340, 2550...|[2256, 2256, 2256...|[2256, 2256, 2256...

In [39]:
user_eval_pandas = user_eval.toPandas()

In [40]:
user_eval_pandas.head()

Unnamed: 0,userId,loanId,ground_truth_items,train_items,rating
0,1,"[2340, 752, 2539, 2338, 2342]","[757, 757, 757, 757, 757, 757, 757]","[757, 757, 757, 757, 757, 757, 757, 757, 757, ...","[34.930721282958984, 33.80392074584961, 25.342..."
1,3,"[196, 999, 550, 862, 1005]","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[30.76845932006836, 29.876619338989258, 28.984..."
2,12,"[55, 50, 49, 48, 47]","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[0.0, 0.0, 0.0, 0.0, 0.0]"
3,13,"[2200, 3057, 3752, 2340, 4441]","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[24.66851043701172, 23.622922897338867, 23.572..."
4,16,"[1888, 96, 4152, 2206, 108]","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[29.58547592163086, 29.548614501953125, 28.412..."


In [41]:
user_eval_pandas['itemIndex_filtered'] = user_eval_pandas.apply(lambda x:[b for (b,z) in zip(x.loanId, x.rating) if b not in x.train_items], axis=1)
user_eval_pandas['rating_filtered'] = user_eval_pandas.apply(lambda x:[z for (b,z) in zip(x.loanId, x.rating) if b not in x.train_items], axis=1)

In [42]:
user_eval_pandas.head()

Unnamed: 0,userId,loanId,ground_truth_items,train_items,rating,itemIndex_filtered,rating_filtered
0,1,"[2340, 752, 2539, 2338, 2342]","[757, 757, 757, 757, 757, 757, 757]","[757, 757, 757, 757, 757, 757, 757, 757, 757, ...","[34.930721282958984, 33.80392074584961, 25.342...","[2340, 752, 2539, 2338, 2342]","[34.930721282958984, 33.80392074584961, 25.342..."
1,3,"[196, 999, 550, 862, 1005]","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[30.76845932006836, 29.876619338989258, 28.984...","[196, 999, 550, 862, 1005]","[30.76845932006836, 29.876619338989258, 28.984..."
2,12,"[55, 50, 49, 48, 47]","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[0.0, 0.0, 0.0, 0.0, 0.0]","[55, 50, 49, 48, 47]","[0.0, 0.0, 0.0, 0.0, 0.0]"
3,13,"[2200, 3057, 3752, 2340, 4441]","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[24.66851043701172, 23.622922897338867, 23.572...","[2200, 3057, 3752, 2340, 4441]","[24.66851043701172, 23.622922897338867, 23.572..."
4,16,"[1888, 96, 4152, 2206, 108]","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[29.58547592163086, 29.548614501953125, 28.412...","[1888, 96, 4152, 2206, 108]","[29.58547592163086, 29.548614501953125, 28.412..."


In [44]:
import numpy as np
import math
def score(predicted, actual, metric):
        """
        Parameters
        ----------
        predicted : List
            List of predicted apps.
        actual : List
            List of masked apps.
        metric : 'precision' or 'ndcg'
            A valid metric for recommendation.
        Raises
        -----
        Returns
        -------
        m : float
            score.
        """
        valid_metrics = ['precision', 'ndcg']
        if metric not in valid_metrics:
            raise Exception(f"Choose one valid baseline in the list: {valid_metrics}")
        if metric == 'precision':
            m = np.mean([float(len(set(predicted[:k]) 
                                               & set(actual))) / float(k) 
                                     for k in range(1,len(actual)+1)])
        if metric == 'ndcg':
            v = [1 if i in actual else 0 for i in predicted]
            v_2 = [1 for i in actual]
            dcg = sum([(2**i-1)/math.log(k+2,2) for (k,i) in enumerate(v)])
            idcg = sum([(2**i-1)/math.log(k+2,2) for (k,i) in enumerate(v_2)])
            m = dcg/idcg
        return m

user_eval_pandas['precision'] = user_eval_pandas.apply(lambda x: score(x.itemIndex_filtered, x.ground_truth_items, 'precision'), axis=1)
user_eval_pandas['NDCG'] = user_eval_pandas.apply(lambda x: score(x.itemIndex_filtered, x.ground_truth_items, 'ndcg'), axis=1)

MAP = user_eval_pandas.precision.mean()
avg_NDCG = user_eval_pandas.NDCG.mean()

In [48]:
user_eval_pandas.head()

Unnamed: 0,userId,loanId,ground_truth_items,train_items,rating,itemIndex_filtered,rating_filtered,precision,NDCG
0,1,"[2340, 752, 2539, 2338, 2342]","[757, 757, 757, 757, 757, 757, 757]","[757, 757, 757, 757, 757, 757, 757, 757, 757, ...","[34.930721282958984, 33.80392074584961, 25.342...","[2340, 752, 2539, 2338, 2342]","[34.930721282958984, 33.80392074584961, 25.342...",0.0,0.0
1,3,"[196, 999, 550, 862, 1005]","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[2521, 2521, 2521, 2521, 2521, 2521, 2521, 252...","[30.76845932006836, 29.876619338989258, 28.984...","[196, 999, 550, 862, 1005]","[30.76845932006836, 29.876619338989258, 28.984...",0.0,0.0
2,12,"[55, 50, 49, 48, 47]","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[3460, 3460, 3460, 3460, 3460, 3460, 3460, 346...","[0.0, 0.0, 0.0, 0.0, 0.0]","[55, 50, 49, 48, 47]","[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
3,13,"[2200, 3057, 3752, 2340, 4441]","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[110, 110, 110, 110, 110, 110, 110, 110, 110, ...","[24.66851043701172, 23.622922897338867, 23.572...","[2200, 3057, 3752, 2340, 4441]","[24.66851043701172, 23.622922897338867, 23.572...",0.0,0.0
4,16,"[1888, 96, 4152, 2206, 108]","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[628, 628, 628, 628, 628, 628, 628, 628, 628, ...","[29.58547592163086, 29.548614501953125, 28.412...","[1888, 96, 4152, 2206, 108]","[29.58547592163086, 29.548614501953125, 28.412...",0.0,0.0


In [49]:
user_eval_pandas.describe()

Unnamed: 0,userId,precision,NDCG
count,2863.0,2863.0,2863.0
mean,1440.90744,0.0,0.0
std,831.205243,0.0,0.0
min,1.0,0.0,0.0
25%,722.5,0.0,0.0
50%,1440.0,0.0,0.0
75%,2159.5,0.0,0.0
max,2880.0,0.0,0.0
