In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, countDistinct, count
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
import pandas as pd

In [2]:
spark = SparkSession.builder.appName('loan_recommendation_using_git_with_alterations').config("spark.driver.memory", "15g").getOrCreate()

In [3]:
loans_df = spark.read.csv("Loan recommendation/Loan_recommedation_demo_version.csv",inferSchema = True, header=True)
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [4]:
loans_df.show(vertical=True)

-RECORD 0------------------------------------------
 Contact__c                 | 0030K00001QYaZ1QAL   
 Min_IT_Loan_ID__c          | 835107               
 Opp_Number__c              | 4430711              
 Id                         | 0062x00000D2AEoAAN   
 AccountID                  | 0010K00001g7iB1QAI   
 Number_Of_Loans_Granted__c | 52                   
 Num_Of_Loans_Paid__c       | 49                   
 Purpose_of_Loan__c         | Vehicle Expenses     
 Total_Repayments__c        | 14                   
 Amount                     | 200                  
 Term_in_Weeks__c           | 13.714               
 Payment_Frequency__c       | Weekly               
 StageName                  | Loan Paid            
 userId                     | 1211                 
 loanId                     | 323                  
 count                      | 46                   
 LoanIdFormat               | 2                    
-RECORD 1------------------------------------------
 Contact__c 

In [5]:
loans_df.orderBy("userId", "loanId").select(["userId","loanId","count"]).show(20)

+------+------+-----+
|userId|loanId|count|
+------+------+-----+
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
|     1|   757|    5|
+------+------+-----+
only showing top 20 rows



In [6]:
# Count the number of unique items
num_unique_items = loans_df.select('loanId').distinct().count()
print(f"Number of unique loans: {num_unique_items}")

# Count the number of unique users
num_unique_users = loans_df.select('userId').distinct().count()
print(f"Number of unique users: {num_unique_users}")

Number of unique loans: 4572
Number of unique users: 2864


In [None]:
# # For example, 30% of items will be masked
# percent_items_to_mask = 0.3 
# # Determine the number of items to mask for each user
# df_rec_final = df_rec_filtered.withColumn("num_items_to_mask", (col("num_items") * percent_items_to_mask).cast("int"))
# # Masks items for each user
# df_rec_final = df_rec_final.withColumn("item_rank", rank().over(user_window))

# # Create a StringIndexer model to index the user ID column
# indexer_user = StringIndexer(inputCol='userId', outputCol='userIndex').setHandleInvalid("keep")
# indexer_item = StringIndexer(inputCol='itemId', outputCol='itemIndex').setHandleInvalid("keep")

# # Fit the indexer model to the data and transform the DataFrame
# df_rec_final = indexer_user.fit(df_rec_final).transform(df_rec_final)
# df_rec_final = indexer_item.fit(df_rec_final).transform(df_rec_final)

# # Convert the userIndex column to integer type
# df_rec_final = df_rec_final.withColumn('userIndex', df_rec_final['userIndex'].cast('integer'))\
#                .withColumn('itemIndex', df_rec_final['itemIndex'].cast('integer'))

# train_df_rec = df_rec_final.filter(col("item_rank") > col("num_items_to_mask"))
# test_df_rec = df_rec_final.filter(col("item_rank") <= col("num_items_to_mask"))

In [21]:
training_df, validation_df = loans_df.randomSplit([.8,.2])

In [18]:
# Configure the ALS model
als = ALS(userCol='userId', itemCol='loanId', ratingCol='count',
          coldStartStrategy='drop', nonnegative=True)

param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [1, 4, 10, 20, 30])\
             .addGrid(als.maxIter, [10 ,12,18,20])\
             .addGrid(als.regParam, [0.001, 0.01, .05, .15])\
             .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='count', predictionCol='prediction')

cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3)

In [19]:
model = cv.fit(training_df)

best_model = model.bestModel
print('rank: ', best_model.rank)
print('MaxIter: ', best_model._java_obj.parent().getMaxIter())
print('RegParam: ', best_model._java_obj.parent().getRegParam())

rank:  30
MaxIter:  10
RegParam:  0.001


In [24]:
rank   =  30
MaxIter   =  10
RegParam   =  0.0001
als_using_best_params = ALS(maxIter = MaxIter,regParam = RegParam,rank = rank,
        userCol='userId', itemCol='loanId', ratingCol='count',
          coldStartStrategy='drop', nonnegative=True)
# Train the model using the training data
# als_using_best_params = ALS(maxIter = best_model._java_obj.parent().getMaxIter(),regParam = best_model._java_obj.parent().getRegParam(),rank = best_model.rank,
#         userCol='userId', itemCol='loanId', ratingCol='count',
#           coldStartStrategy='drop', nonnegative=True)
model = als_using_best_params.fit(training_df)

# Generate predictions on the test data
predictions = best_model.transform(validation_df)
predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 5 THEN 5 ELSE prediction END"))

evaluator = RegressionEvaluator(metricName='rmse', labelCol='count', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 16.807730251996215


In [28]:
predictions.columns

['Contact__c',
 'Min_IT_Loan_ID__c',
 'Opp_Number__c',
 'Id',
 'AccountID',
 'Number_Of_Loans_Granted__c',
 'Num_Of_Loans_Paid__c',
 'Purpose_of_Loan__c',
 'Total_Repayments__c',
 'Amount',
 'Term_in_Weeks__c',
 'Payment_Frequency__c',
 'StageName',
 'userId',
 'loanId',
 'count',
 'LoanIdFormat',
 'prediction']

In [31]:
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql.functions import col, collect_list

# Convert the predictions DataFrame to include all predictions per user
# Generate top-k recommendations for each user
userRecs = best_model.recommendForAllUsers(5)  # Top-100 recommendations for each user

# Prepare the input for RankingMetrics
user_ground_truth = validation_df.groupby('userId').agg(collect_list('loanId').alias('ground_truth_items'))
user_train_items = training_df.groupby('userId').agg(collect_list('loanId').alias('train_items'))

# Join the recommendations and ground truth data on the user ID
user_eval = userRecs.join(user_ground_truth, on='userId').join(user_train_items, on='userId') \
    .select('userId', 'recommendations.rating', 'ground_truth_items', 'train_items', 'recommendations.rating')

In [32]:
user_eval.show()

+------+--------------------+--------------------+--------------------+--------------------+
|userId|              rating|  ground_truth_items|         train_items|              rating|
+------+--------------------+--------------------+--------------------+--------------------+
|     1|[34.93072, 33.803...|[757, 757, 757, 7...|[757, 757, 757, 7...|[34.93072, 33.803...|
|     3|[30.76846, 29.876...|[2521, 2521, 2521...|[2521, 2521, 2521...|[30.76846, 29.876...|
|    12|[0.0, 0.0, 0.0, 0...|[3460, 3460, 3460...|[3460, 3460, 3460...|[0.0, 0.0, 0.0, 0...|
|    13|[24.66851, 23.622...|[110, 110, 110, 1...|[110, 110, 110, 1...|[24.66851, 23.622...|
|    16|[29.585476, 29.54...|[628, 628, 628, 6...|[628, 628, 628, 6...|[29.585476, 29.54...|
|    20|[27.812405, 26.74...|[3164, 3164, 3164...|[3164, 3164, 3164...|[27.812405, 26.74...|
|    22|[32.11654, 31.185...|[3651, 3651, 3651...|[3651, 3651, 3651...|[32.11654, 31.185...|
|    26|[18.452265, 16.93...|[2256, 2256, 2256...|[2256, 2256, 2256...

In [None]:
user_eval = user_eval.toPandas()
user_eval['itemIndex_filtered'] = user_eval.apply(lambda x:[b for (b,z) in zip(x.itemIndex, x.rating) if b not in x.train_items], axis=1)
user_eval['rating_filtered'] = user_eval.apply(lambda x:[z for (b,z) in zip(x.itemIndex, x.rating) if b not in x.train_items], axis=1)

In [None]:
import numpy as np
import math
def score(predicted, actual, metric):
        """
        Parameters
        ----------
        predicted : List
            List of predicted apps.
        actual : List
            List of masked apps.
        metric : 'precision' or 'ndcg'
            A valid metric for recommendation.
        Raises
        -----
        Returns
        -------
        m : float
            score.
        """
        valid_metrics = ['precision', 'ndcg']
        if metric not in valid_metrics:
            raise Exception(f"Choose one valid baseline in the list: {valid_metrics}")
        if metric == 'precision':
            m = np.mean([float(len(set(predicted[:k]) 
                                               & set(actual))) / float(k) 
                                     for k in range(1,len(actual)+1)])
        if metric == 'ndcg':
            v = [1 if i in actual else 0 for i in predicted]
            v_2 = [1 for i in actual]
            dcg = sum([(2**i-1)/math.log(k+2,2) for (k,i) in enumerate(v)])
            idcg = sum([(2**i-1)/math.log(k+2,2) for (k,i) in enumerate(v_2)])
            m = dcg/idcg
        return m

user_eval['precision'] = user_eval.apply(lambda x: score(x.itemIndex_filtered, x.ground_truth_items, 'precision'), axis=1)
user_eval['NDCG'] = user_eval.apply(lambda x: score(x.itemIndex_filtered, x.ground_truth_items, 'ndcg'), axis=1)

MAP = user_eval.precision.mean()
avg_NDCG = user_eval.NDCG.mean()

In [3]:
sc = SparkContext(appName="PythonCollaborativeFilteringExample")

In [4]:
sc

In [5]:
loans_df = sc.read.csv("Loan_recommedation_demo_version.csv",inferSchema = True, header=True)
loans_df.printSchema()

AttributeError: 'SparkContext' object has no attribute 'read'