In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, count #countDistinct
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors  ##DenseMatrix
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import operator
import time

In [4]:
from pyspark.ml.recommendation import ALSModel

In [2]:
spark = SparkSession.builder.appName('loan_recommendation_with_clusters').config("spark.driver.memory", "12g").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/12 15:54:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/12 15:54:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# loans_df = spark.read.csv("RecommendData/df_temp_cluster_rating_condition_data.csv", inferSchema=True, header=True)
loans_df = spark.read.csv("Loan_Dataset/df_temp_cluster_Own_Loan_Amount_Condition.csv", inferSchema=True, header=True)

loans_df.printSchema()

[Stage 1:>                                                          (0 + 4) / 4]

root
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- Applicant Age: integer (nullable = true)
 |-- summary_income: double (nullable = true)
 |-- summary_income_cv: double (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- Cluster_result: integer (nullable = true)





In [5]:
# Load the serialized ALS model
model_path = "Recommendation_ALS_Model_OWN_LOAN_CLUSTER_STRATEGY"
loaded_model = ALSModel.load(model_path)
# Now you can use the loaded model for making recommendations or other tasks

In [8]:
predictions = loaded_model.transform(loans_df)

In [9]:
predictions.filter(predictions.userId == 10).select("userId", "loanId", "count", "Cluster_result", "prediction").show(20)

[Stage 7:>                                                          (0 + 1) / 1]                                                                                

+------+------+-----+--------------+----------+
|userId|loanId|count|Cluster_result|prediction|
+------+------+-----+--------------+----------+
|    10|   127|    4|             2| 3.9539769|
|    10|214846|    4|             2| 3.9539769|
|    10|196969|    4|             2| 3.9539769|
|    10|177796|    4|             2| 3.9539769|
|    10|160635|    4|             2| 3.9539769|
|    10|120247|    4|             2| 3.9539769|
|    10| 10615|    4|             2| 3.9539769|
|    10|  9109|    4|             2| 3.9539769|
|    10|  4050|    4|             2| 3.9539769|
|    10|  2535|    4|             2| 3.9539769|
|    10|   932|    4|             2| 3.9539769|
|    10|  6663|    4|             3|  3.955786|
|    10|150192|    4|             3|  3.955786|
|    10| 24267|    4|             3|  3.955786|
|    10|  7145|    4|             3|  3.955786|
|    10|  3204|    4|             4| 3.9575906|
|    10|341925|    4|             4| 3.9575906|
|    10|137071|    4|             4| 3.9

In [10]:
# Evaluate the model using Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) = " + str(rmse))

[Stage 11:>                                                         (0 + 4) / 4]

Root Mean Squared Error (RMSE) = 0.021331284894671582




In [12]:
userRecommends = loaded_model.recommendForAllUsers(5)
loanRecommends = loaded_model.recommendForAllItems(5)

In [13]:
userRecommends.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Cluster_result: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [21]:
user_loans = loans_df.groupby("userId").agg(F.collect_set("Cluster_result").alias("loans"))

In [22]:
Actual_loans_taken = user_loans.toPandas().to_dict('records')

                                                                                

## COSINE SIMILARITY

In [15]:
userFactors = loaded_model.userFactors
itemFactors = loaded_model.itemFactors

In [16]:
# Initially the code is like this !!
def cosine_similarity(u1, u2):
    """Calculates cosine similarity between two user factors."""
    # Convert lists to DenseVectors before performing operations
    u1_vector = Vectors.dense(u1)
    u2_vector = Vectors.dense(u2)
    return u1_vector.dot(u2_vector) / (u1_vector.norm(p=2) * u2_vector.norm(p=2))

### Cosine Similarity of all users with all loans

In [17]:
# Collect all user factors into a dictionary
user_factors_dict = userFactors.select("id", "features").rdd.collectAsMap()

# Broadcast the user factors dictionary to all workers
broadcast_user_factors = spark.sparkContext.broadcast(user_factors_dict)

# Collect all item factors into a dictionary
item_factors_dict = itemFactors.select("id", "features").rdd.collectAsMap()

# Broadcast the item factors dictionary to all workers
broadcast_item_factors = spark.sparkContext.broadcast(item_factors_dict)

In [18]:
# Initialize list to store recommendations
CosineSilty_first10 = []
starttime = time.time()
Nusers = 25000

# Iterate over users
for userId in range(1, Nusers):
    user_factors = broadcast_user_factors.value.get(userId)
    if user_factors:
        TopCosineLoanforuser10 = []
        # Iterate over items
        for item_id, item_factors in broadcast_item_factors.value.items():
            user_item_similarity = cosine_similarity(user_factors, item_factors)
            TopCosineLoanforuser10.append({item_id: user_item_similarity})

        # Sort recommendations based on similarity
        Recomnd_loans = sorted(TopCosineLoanforuser10, key=lambda x: list(x.values())[0], reverse=True)
        CosineSilty_first10.append({"UserId": userId, "Recommendation": Recomnd_loans})

endtime = time.time()
runtime_hours = (endtime - starttime) / 3600
print(f"The time taken to run code for {Nusers} users is : {runtime_hours} hours")

The time taken to run code for 25000 users is : 0.0030799488888846502 hours


In [19]:
# Define a function to extract top-N recommendations
def extract_top_n_recommendations(recommendations_data, n=2500):
    top_n_recommendations = [{'userId': user_recommendation['UserId'],
                              'Recommendation': [list(rec.keys())[0] for rec in user_recommendation['Recommendation'][:n]]}
                             for user_recommendation in recommendations_data]
    return top_n_recommendations

# Extract top-N recommendations
starttime = time.time()
top_n_recommendations = extract_top_n_recommendations(CosineSilty_first10, n=5)
endtime = time.time()
runtime_hours = (endtime - starttime) / 3600


In [23]:
# Function to recommend top N loans not taken by the user
def recommend_loans_not_taken(user_id, recommendations, actual_loans_taken, top_n):
    taken_loans = set(actual_loans_taken.get(user_id, []))
    recommended_loans = []
    for rec in recommendations:
        loan_id = list(rec.keys())[0]
        if loan_id not in taken_loans:
            recommended_loans.append(loan_id)
            if len(recommended_loans) == top_n:
                break
    return recommended_loans

# Define a dictionary to store actual loans taken by each user
actual_loans_taken_dict = {record['userId']: record['loans'] for record in Actual_loans_taken}

# Create a list to store recommendations for each user
recommendations_for_users = []

# Iterate over each user
for rec_user in CosineSilty_first10:
    user_id = rec_user['UserId']
    recommendations = rec_user['Recommendation']
    # Recommend top N loans not taken by the user
    recommended_loans = recommend_loans_not_taken(user_id, recommendations, actual_loans_taken_dict, top_n=5)
    recommendations_for_users.append({'UserId': user_id, 'Recommendations': recommended_loans})



In [25]:
userRecommends = loaded_model.recommendForAllUsers(5)
loanRecommends = loaded_model.recommendForAllItems(5)

In [26]:
userRecommends.select(["userId","recommendations.Cluster_result","recommendations.rating"]).show(10,truncate=False)

24/03/12 16:04:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

+------+--------------------+---------------------------------------------------------+
|userId|Cluster_result      |rating                                                   |
+------+--------------------+---------------------------------------------------------+
|1     |[33, 1, 37, 30, 38] |[3.4242477, 3.1129024, 2.9915998, 2.9827292, 2.9775407]  |
|3     |[33, 1, 37, 16, 18] |[1.1747626, 1.0520585, 0.9929635, 0.9924082, 0.9914246]  |
|5     |[33, 1, 37, 16, 27] |[2.3101795, 2.087025, 1.991659, 1.9850262, 1.9831308]    |
|6     |[33, 1, 37, 16, 27] |[1.132987, 1.0258465, 0.9933879, 0.99183655, 0.99106026] |
|12    |[33, 1, 16, 27, 22] |[2.2909837, 2.0677474, 1.9845971, 1.9833524, 1.9828582]  |
|13    |[33, 1, 16, 18, 19] |[1.166576, 1.0476619, 0.99247843, 0.99130213, 0.9910941] |
|16    |[33, 38, 37, 16, 27]|[1.0991187, 1.0034589, 0.99002415, 0.9896106, 0.98951143]|
|19    |[33, 1, 16, 27, 22] |[2.2992384, 2.066215, 1.9844755, 1.9823893, 1.98212]     |
|20    |[33, 1, 16, 27, 19] |[1.

                                                                                

In [29]:
actual = [listo for listo in actual_loans_taken_dict.values()]

In [30]:
predicted = [listo['Recommendation'] for listo in top_n_recommendations]

In [27]:
def average_precision_at_k(actual, predicted, k):
    # Initialize variables
    num_hits = 0
    precision_at_k = 0.0

    # Iterate over the predicted list up to k
    for i, p in enumerate(predicted[:k]):
        # Check if the predicted item at position i is in the actual list
        if p in actual:
            num_hits += 1
            # Calculate precision at position i
            precision_at_k += num_hits / (i + 1)

    # Calculate average precision at k
    if not actual:
        return 0.0
    else:
        return precision_at_k / min(len(actual), k)

def map_at_k(actual, predicted, k):
    # Calculate average precision at k for each query or user
    average_precisions = [average_precision_at_k(a, p, k) for a, p in zip(actual, predicted)]
    
    # Return the mean of average precision at k values
    return sum(average_precisions) / len(average_precisions)

In [31]:
k = 5
map_at_5 = map_at_k(actual, predicted, k)
print("MAP@{}: {:.4f}".format(k, map_at_5))

MAP@5: 0.2166
