In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col, when
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
import numpy as np
from IPython.display import Image
from IPython.display import display

In [43]:
from pyspark.sql.functions import expr
from pyspark.sql.window import Window

In [2]:
spark  = SparkSession.builder.appName("Loan Recommendation").getOrCreate()

In [3]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)



In [4]:
loans_df = spark.read.csv("Loan_recommedation_demo_version.csv",inferSchema = True, header=True)
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: double (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: double (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [5]:
loans_df.show(10)

+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+-------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+
|        Contact__c|Min_IT_Loan_ID__c|Opp_Number__c|                Id|         AccountID|Number_Of_Loans_Granted__c|Num_Of_Loans_Paid__c| Purpose_of_Loan__c|Total_Repayments__c|Amount|Term_in_Weeks__c|Payment_Frequency__c|StageName|userId|loanId|count|LoanIdFormat|
+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+-------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+
|0032x000003RvgkAAC|          5145623|      4369500|0062x00000D178JAAR|0012x000004VG7MAAW|                         7|                 4.0|Relocation Expenses|                  2|   200|           6.8

In [6]:
# users_df = spark.read.csv("MovieLens/movies.csv",inferSchema = True, header=True)
# users_df.printSchema()

In [10]:
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: double (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: double (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [7]:
# links_df = spark.read.csv("MovieLens/links.csv",inferSchema = True, header=True)
# links_df.printSchema()

In [8]:
training_df, validation_df = loans_df.randomSplit([.8,.2])

In [9]:
iterations = 10
regularization_parameter = 0.1
rank = 4
error = []
err = 0

In [12]:
als = ALS(maxIter = iterations,regParam = regularization_parameter,rank = rank, userCol="userId",itemCol="LoanIdFormat",ratingCol="count")
model = als.fit(training_df)
predictions = model.transform(validation_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root-mean-Square-error = "+ str(rmse))

Root-mean-Square-error = 1.518451769232758


In [18]:
new_predictions.select(["Id","AccountID","Amount","Num_Of_Loans_Paid__c","Purpose_of_Loan__c","userId","count" ,"LoanIdFormat","prediction"]).show()

+------------------+------------------+------+--------------------+--------------------+------+-----+------------+----------+
|                Id|         AccountID|Amount|Num_Of_Loans_Paid__c|  Purpose_of_Loan__c|userId|count|LoanIdFormat|prediction|
+------------------+------------------+------+--------------------+--------------------+------+-----+------------+----------+
|0062x00000Ew6DbAAJ|0010K000023zUJ8QAM|  2500|                18.0|Furniture or Appl...|  2484| 16.0|         496| 15.873513|
|0062x00000D7qkTAAR|0010K000022k8DdQAI|  1000|                22.0|    Vehicle Expenses|   436| 17.0|         148| 16.929253|
|0062x00000EysRaAAJ|0010K00001zFWRjQAO|  2300|                18.0| Veterinary Expenses|  1019| 17.0|         471| 16.984049|
|0062x00000DFcmYAAT|0010K0000260MIYQA2|  1000|                24.0|     Travel Expenses|  2150| 19.0|         148|  19.03993|
|0062x00000D1GuIAAV|0012800001SmREKAA3|   900|                14.0|    Medical Expenses|  2030| 10.0|         148|   9

In [13]:
new_predictions.show(10)

+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+--------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|        Contact__c|Min_IT_Loan_ID__c|Opp_Number__c|                Id|         AccountID|Number_Of_Loans_Granted__c|Num_Of_Loans_Paid__c|  Purpose_of_Loan__c|Total_Repayments__c|Amount|Term_in_Weeks__c|Payment_Frequency__c|StageName|userId|loanId|count|LoanIdFormat|prediction|
+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+--------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|0030K00001sAYZzQAO|          1027125|      6560979|0062x00000Ew6DbAAJ|0010K000023zUJ8QAM|                        20|                18.0|Furniture or Appl...|    

In [19]:
movieRecommends = model.recommendForAllItems(3)

In [28]:
movieRecommends.select("LoanIdFormat","recommendations.userId").show(10,False)

+------------+-----------------+
|LoanIdFormat|userId           |
+------------+-----------------+
|1           |[1635, 651, 1211]|
|2           |[997, 1401, 651] |
|3           |[709, 2085, 695] |
|4           |[1635, 651, 1211]|
|5           |[997, 1401, 651] |
|6           |[709, 2085, 695] |
|7           |[1635, 651, 1211]|
|8           |[997, 1401, 695] |
|9           |[709, 2085, 1543]|
|10          |[1635, 651, 1211]|
+------------+-----------------+
only showing top 10 rows



In [29]:
userRecommends = model.recommendForAllUsers(3)

In [35]:
userRecommends.schema

StructType([StructField('userId', IntegerType(), False), StructField('recommendations', ArrayType(StructType([StructField('LoanIdFormat', IntegerType(), True), StructField('rating', FloatType(), True)]), True), True)])

In [37]:
userRecommends.select("userId","recommendations.LoanIdFormat").show(10,False)

+------+---------------+
|userId|LoanIdFormat   |
+------+---------------+
|1     |[535, 553, 550]|
|3     |[628, 625, 622]|
|5     |[830, 827, 818]|
|6     |[358, 343, 364]|
|12    |[30, 29, 28]   |
|13    |[819, 801, 822]|
|16    |[819, 801, 822]|
|19    |[819, 801, 822]|
|20    |[628, 625, 622]|
|22    |[184, 169, 193]|
+------+---------------+
only showing top 10 rows



DataFrame[userId: int, recommendations: array<struct<LoanIdFormat:int,rating:float>>]

In [14]:
predictions.show(10)

+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|        Contact__c|Min_IT_Loan_ID__c|Opp_Number__c|                Id|         AccountID|Number_Of_Loans_Granted__c|Num_Of_Loans_Paid__c|Purpose_of_Loan__c|Total_Repayments__c|Amount|Term_in_Weeks__c|Payment_Frequency__c|StageName|userId|loanId|count|LoanIdFormat|prediction|
+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|0030K00001JVbSpQAL|           907654|      5059066|0062x00000DYsyxAAD|0010K00001ayVHPQA2|                        29|                24.0|    Other Expenses|            

In [113]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    # if len(predicted)>k:
    #     predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [112]:
count_list = new_predictions.select("count").toPandas()["count"].tolist()
prediction_list =  new_predictions.select("prediction").toPandas()["prediction"].tolist()

mapk(count_list,prediction_list,3 )

TypeError: 'float' object is not iterable

In [115]:
new_predictions.show(5)

+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+--------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|        Contact__c|Min_IT_Loan_ID__c|Opp_Number__c|                Id|         AccountID|Number_Of_Loans_Granted__c|Num_Of_Loans_Paid__c|  Purpose_of_Loan__c|Total_Repayments__c|Amount|Term_in_Weeks__c|Payment_Frequency__c|StageName|userId|loanId|count|LoanIdFormat|prediction|
+------------------+-----------------+-------------+------------------+------------------+--------------------------+--------------------+--------------------+-------------------+------+----------------+--------------------+---------+------+------+-----+------------+----------+
|0030K00001sAYZzQAO|          1027125|      6560979|0062x00000Ew6DbAAJ|0010K000023zUJ8QAM|                        20|                18.0|Furniture or Appl...|    

In [131]:
# Function to calculate MAP@K
def calculate_map_at_k(predictions, ground_truth, k):
    # Inner join to get the predicted ratings for relevant items
    joined_df = predictions.join(ground_truth, on="userId")
    # Sort recommendations by predicted rating in descending order
    window_spec = Window().partitionBy("userId").orderBy(col("count").desc())
    sorted_predictions = joined_df.select(
        "userId",
        "loanId",
        "prediction",
        expr("ROW_NUMBER() OVER (PARTITION BY userId ORDER BY count DESC) AS rank")
    ).filter(col("rank") <= k)
    print(sorted_predictions.show(10))
    # Calculate precision at each position
    precision_at_k = sorted_predictions.withColumn(
        "relevant_at_k", expr("CASE WHEN loanId IN (loanId) THEN (prediction) ELSE 0 END")
    ).groupBy("userId").agg(expr(f"SUM(relevant_at_k) / {k} AS precision_at_k"))
    print(precision_at_k.show(3))
    # Calculate MAP@K
    map_at_k = precision_at_k.agg(expr("SUM(precision_at_k) / COUNT(userId) AS map_at_k")).collect()[0]["map_at_k"]
    print("map at k ",map_at_k)
    return map_at_k

In [132]:
map_at_3 = calculate_map_at_k(new_predictions.select("Id","AccountID","prediction","LoanIdFormat","userId"), loans_df, k=3)
print(f"MAP@3: {map_at_3}")

+------+------+----------+----+
|userId|loanId|prediction|rank|
+------+------+----------+----+
|     1|   757| 4.9824653|   1|
|     1|   757| 4.9746227|   2|
|     1|   757| 4.9824653|   3|
|     2|  2248| 6.9537015|   1|
|     2|  2248| 6.9573073|   2|
|     2|  2248| 6.9537015|   3|
|     3|  2521| 3.9583652|   1|
|     3|  2521| 3.9486866|   2|
|     3|  2521| 3.9583652|   3|
|     4|  4166| 24.773582|   1|
+------+------+----------+----+
only showing top 10 rows

None
+------+------------------+
|userId|    precision_at_k|
+------+------------------+
|     1| 4.979851086934407|
|     2| 6.961169083913167|
|     3|3.9515560468037925|
+------+------------------+
only showing top 3 rows

None
map at k  11.771399787381077
MAP@3: 11.771399787381077


In [65]:
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
 
def calculate_mapk(predictions, k):
    # Group by user
    user_recs = predictions.groupBy('userId').agg(F.collect_list('prediction').alias('predictions'))
    # Calculate average precision for each user
    user_avgs = user_recs.rdd.map(lambda row: (row[0], avg_precision(row[1], k))).collect()
    # Calculate MAP@K
    mapk = sum([user_avg for (_, user_avg) in user_avgs]) / len(user_avgs)
    return mapk
def avg_precision(predictions, k):
    # Sort predictions
    sorted_preds = sorted(predictions, reverse=True)
    # Initialize counters
    num_hits, sum_precs = 0, 0.0
    # Calculate average precision
    for i, pred in enumerate(sorted_preds):
        if pred == 1:
            num_hits += 1
            sum_precs += num_hits / (i+1)
    return sum_precs / min(len(sorted_preds), k)

In [32]:
userRecommends.schema

StructType([StructField('userId', IntegerType(), False), StructField('recommendations', ArrayType(StructType([StructField('movieId', IntegerType(), True), StructField('rating', FloatType(), True)]), True), True)])

In [35]:
userRecommends.select("userId","recommendations.movieID").show(10,False)

+------+--------------------------------------+
|userId|movieID                               |
+------+--------------------------------------+
|1     |[170355, 3379, 33649, 60943, 59018]   |
|2     |[3567, 5075, 141718, 72171, 60943]    |
|3     |[74754, 99764, 26865, 148881, 3837]   |
|4     |[25825, 3567, 127108, 7834, 7841]     |
|5     |[89904, 170355, 3379, 174053, 7815]   |
|6     |[82, 87234, 3925, 42730, 6732]        |
|7     |[25771, 8477, 148881, 74754, 87234]   |
|8     |[60943, 59018, 170355, 3379, 174053]  |
|9     |[112804, 60943, 59018, 106100, 174053]|
|10    |[112804, 90888, 82, 51931, 2316]      |
+------+--------------------------------------+
only showing top 10 rows



In [None]:
lls = ALS(maxIter = iterations,regParam = regularization_parameter,rank = rank, userCol="userId",itemCol="movieId",ratingCol="rating")
paramGrid = ParamGridBuilder().addGrid(lls.regParam[0.1,0.01,0.18])\
                              .addGrid(lls.rank, range(4,10))\
                              .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
crossval = CrossValidator(estimator = lls,
                          estimatorParamMaps = paramGrid,
                          evaluator = evaluator,
                          numfolds = 5)

cvModel = crossVal.fit(training_df)
