In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, count #countDistinct
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors  ##DenseMatrix
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import operator
import time

In [2]:
from pyspark.ml.recommendation import ALSModel

In [3]:
spark = SparkSession.builder.appName('loan_recommendation_with_clusters').config("spark.driver.memory", "12g").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 10:50:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/14 10:50:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:
# loans_df = spark.read.csv("RecommendData/df_temp_cluster_rating_condition_data.csv", inferSchema=True, header=True)
loans_df = spark.read.csv("../Loan_Dataset/df_temp_cluster_Own_Loan_Amount_Condition.csv", inferSchema=True, header=True)

loans_df.printSchema()

[Stage 1:>                                                          (0 + 4) / 4]

root
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- Applicant Age: integer (nullable = true)
 |-- summary_income: double (nullable = true)
 |-- summary_income_cv: double (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- Cluster_result: integer (nullable = true)





In [11]:
# Load the serialized ALS model
model_path = "../Recommendation_ALS_Model_OWN_LOAN_CLUSTER_STRATEGY"
loaded_model = ALSModel.load(model_path)
# Now you can use the loaded model for making recommendations or other tasks

In [12]:
predictions = loaded_model.transform(loans_df)

In [13]:
predictions.filter(predictions.userId == 10).select("userId", "loanId", "count", "Cluster_result", "prediction").show(20)


+------+------+-----+--------------+----------+
|userId|loanId|count|Cluster_result|prediction|
+------+------+-----+--------------+----------+
|    10|   127|    4|             2| 3.9539769|
|    10|214846|    4|             2| 3.9539769|
|    10|196969|    4|             2| 3.9539769|
|    10|177796|    4|             2| 3.9539769|
|    10|160635|    4|             2| 3.9539769|
|    10|120247|    4|             2| 3.9539769|
|    10| 10615|    4|             2| 3.9539769|
|    10|  9109|    4|             2| 3.9539769|
|    10|  4050|    4|             2| 3.9539769|
|    10|  2535|    4|             2| 3.9539769|
|    10|   932|    4|             2| 3.9539769|
|    10|  6663|    4|             3|  3.955786|
|    10|150192|    4|             3|  3.955786|
|    10| 24267|    4|             3|  3.955786|
|    10|  7145|    4|             3|  3.955786|
|    10|  3204|    4|             4| 3.9575906|
|    10|341925|    4|             4| 3.9575906|
|    10|137071|    4|             4| 3.9

In [14]:
# Evaluate the model using Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) = " + str(rmse))

[Stage 11:>                                                         (0 + 4) / 4]

Root Mean Squared Error (RMSE) = 0.021331284894671582




In [17]:
userRecommends = loaded_model.recommendForAllUsers(5)
loanRecommends = loaded_model.recommendForAllItems(5)

In [18]:
userRecommends.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Cluster_result: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [19]:
user_loans = loans_df.groupby("userId").agg(F.collect_set("Cluster_result").alias("loans"))

In [20]:
Actual_loans_taken = user_loans.toPandas().to_dict('records')

                                                                                

## COSINE SIMILARITY

In [21]:
userFactors = loaded_model.userFactors
itemFactors = loaded_model.itemFactors

In [22]:
# Initially the code is like this !!
def cosine_similarity(u1, u2):
    """Calculates cosine similarity between two user factors."""
    # Convert lists to DenseVectors before performing operations
    u1_vector = Vectors.dense(u1)
    u2_vector = Vectors.dense(u2)
    return u1_vector.dot(u2_vector) / (u1_vector.norm(p=2) * u2_vector.norm(p=2))

### Cosine Similarity of all users with all loans

In [23]:
# Collect all user factors into a dictionary
user_factors_dict = userFactors.select("id", "features").rdd.collectAsMap()

# Broadcast the user factors dictionary to all workers
broadcast_user_factors = spark.sparkContext.broadcast(user_factors_dict)

# Collect all item factors into a dictionary
item_factors_dict = itemFactors.select("id", "features").rdd.collectAsMap()

# Broadcast the item factors dictionary to all workers
broadcast_item_factors = spark.sparkContext.broadcast(item_factors_dict)

In [24]:
# Initialize list to store recommendations
CosineSilty_first10 = []
starttime = time.time()
Nusers = 25000

# Iterate over users
for userId in range(1, Nusers):
    user_factors = broadcast_user_factors.value.get(userId)
    if user_factors:
        TopCosineLoanforuser10 = []
        # Iterate over items
        for item_id, item_factors in broadcast_item_factors.value.items():
            user_item_similarity = cosine_similarity(user_factors, item_factors)
            TopCosineLoanforuser10.append({item_id: user_item_similarity})

        # Sort recommendations based on similarity
        Recomnd_loans = sorted(TopCosineLoanforuser10, key=lambda x: list(x.values())[0], reverse=True)
        CosineSilty_first10.append({"UserId": userId, "Recommendation": Recomnd_loans})

endtime = time.time()
runtime_hours = (endtime - starttime) / 3600
print(f"The time taken to run code for {Nusers} users is : {runtime_hours} hours")

The time taken to run code for 25000 users is : 0.003121684922112359 hours


In [25]:
# Define a function to extract top-N recommendations
def extract_top_n_recommendations(recommendations_data, n=2500):
    top_n_recommendations = [{'userId': user_recommendation['UserId'],
                              'Recommendation': [list(rec.keys())[0] for rec in user_recommendation['Recommendation'][:n]]}
                             for user_recommendation in recommendations_data]
    return top_n_recommendations

# Extract top-N recommendations
starttime = time.time()
top_n_recommendations = extract_top_n_recommendations(CosineSilty_first10, n=5)
endtime = time.time()
runtime_hours = (endtime - starttime) / 3600


In [33]:
top_n_recommendations

[{'userId': 1, 'Recommendation': [15, 14, 19, 20, 12]},
 {'userId': 2, 'Recommendation': [6, 3, 14, 10, 8]},
 {'userId': 3, 'Recommendation': [6, 8, 3, 4, 2]},
 {'userId': 4, 'Recommendation': [2, 3, 4, 5, 7]},
 {'userId': 5, 'Recommendation': [6, 8, 3, 4, 7]},
 {'userId': 6, 'Recommendation': [14, 6, 12, 8, 15]},
 {'userId': 7, 'Recommendation': [6, 4, 8, 3, 2]},
 {'userId': 8, 'Recommendation': [15, 14, 19, 13, 18]},
 {'userId': 9, 'Recommendation': [6, 3, 4, 7, 2]},
 {'userId': 10, 'Recommendation': [4, 3, 2, 6, 5]},
 {'userId': 11, 'Recommendation': [6, 3, 4, 2, 8]},
 {'userId': 12, 'Recommendation': [6, 14, 8, 4, 3]},
 {'userId': 13, 'Recommendation': [8, 4, 6, 3, 2]},
 {'userId': 14, 'Recommendation': [15, 14, 10, 20, 19]},
 {'userId': 15, 'Recommendation': [2, 3, 4, 5, 6]},
 {'userId': 16, 'Recommendation': [16, 14, 15, 19, 12]},
 {'userId': 17, 'Recommendation': [6, 3, 4, 2, 7]},
 {'userId': 18, 'Recommendation': [8, 4, 6, 3, 7]},
 {'userId': 19, 'Recommendation': [8, 6, 4, 3, 

In [26]:
# Function to recommend top N loans not taken by the user
def recommend_loans_not_taken(user_id, recommendations, actual_loans_taken, top_n):
    taken_loans = set(actual_loans_taken.get(user_id, []))
    recommended_loans = []
    for rec in recommendations:
        loan_id = list(rec.keys())[0]
        if loan_id not in taken_loans:
            recommended_loans.append(loan_id)
            if len(recommended_loans) == top_n:
                break
    return recommended_loans

# Define a dictionary to store actual loans taken by each user
actual_loans_taken_dict = {record['userId']: record['loans'] for record in Actual_loans_taken}

# Create a list to store recommendations for each user
recommendations_for_users = []

# Iterate over each user
for rec_user in CosineSilty_first10:
    user_id = rec_user['UserId']
    recommendations = rec_user['Recommendation']
    # Recommend top N loans not taken by the user
    recommended_loans = recommend_loans_not_taken(user_id, recommendations, actual_loans_taken_dict, top_n=5)
    recommendations_for_users.append({'UserId': user_id, 'Recommendations': recommended_loans})



In [34]:
actual_loans_taken_dict

{1: [15, 12, 30, 16, 24, 21, 18, 14, 8],
 3: [6, 10, 25, 4, 8],
 5: [16, 6, 3, 10, 8],
 6: [12, 16, 8],
 9: [6, 7],
 12: [16, 2, 17, 6, 21, 14, 8],
 13: [2, 6, 25, 4, 8],
 15: [5, 2, 6, 3, 10, 25, 4, 8],
 16: [12, 16, 25, 8],
 17: [12, 16, 5, 2, 20, 6, 3, 10, 4, 11, 8],
 19: [16, 2, 18, 10, 4, 14, 8],
 20: [8],
 22: [5, 6, 3],
 26: [18, 10, 8],
 27: [5, 2, 6, 8],
 28: [5, 10, 14, 8],
 31: [30, 16, 24, 3, 18, 21, 10, 4, 14],
 34: [15, 12, 19, 6, 3, 18, 24, 10, 7, 4, 14, 11, 8],
 37: [38, 10, 25, 4, 11, 26, 13, 5, 20, 35, 6, 7, 14, 8, 23],
 40: [16, 10, 14],
 41: [27, 16, 38, 32, 21, 14],
 43: [30, 16, 38, 24, 21, 36, 23],
 44: [6, 25, 8],
 47: [16, 13, 5, 6, 8],
 48: [9, 2, 6, 4, 8],
 52: [12, 3, 10, 25, 8],
 53: [15, 6, 18, 10, 14, 8],
 54: [12, 5, 2, 6, 3, 10, 7, 4, 14, 8],
 57: [15, 6, 21, 18, 10, 25, 14, 8],
 61: [2, 6, 10],
 64: [5, 2, 3],
 65: [15, 2, 6, 10, 8],
 72: [16, 13, 6, 4, 8],
 76: [16, 6, 21, 10, 22, 26],
 78: [13, 2, 10, 4, 8],
 81: [5, 2, 6, 3, 4],
 85: [15, 12, 16, 17

In [35]:
recommendations_for_users

[{'UserId': 1, 'Recommendations': [19, 20, 10, 13, 6]},
 {'UserId': 2, 'Recommendations': [3, 14, 2, 7, 15]},
 {'UserId': 3, 'Recommendations': [3, 2, 5, 7, 14]},
 {'UserId': 4, 'Recommendations': [5, 7, 6, 8, 9]},
 {'UserId': 5, 'Recommendations': [4, 7, 2, 14, 5]},
 {'UserId': 6, 'Recommendations': [14, 6, 15, 4, 20]},
 {'UserId': 7, 'Recommendations': [4, 8, 3, 2, 7]},
 {'UserId': 8, 'Recommendations': [14, 19, 22, 10, 17]},
 {'UserId': 9, 'Recommendations': [3, 4, 2, 8, 5]},
 {'UserId': 10, 'Recommendations': [7, 9, 10, 14, 20]},
 {'UserId': 11, 'Recommendations': [7, 5, 9, 20, 11]},
 {'UserId': 12, 'Recommendations': [4, 3, 7, 20, 15]},
 {'UserId': 13, 'Recommendations': [3, 5, 7, 14, 9]},
 {'UserId': 14, 'Recommendations': [15, 14, 20, 19, 12]},
 {'UserId': 15, 'Recommendations': [7, 9, 14, 20, 11]},
 {'UserId': 16, 'Recommendations': [14, 15, 19, 20, 22]},
 {'UserId': 17, 'Recommendations': [7, 14, 9, 15, 17]},
 {'UserId': 18, 'Recommendations': [4, 3, 7, 5, 14]},
 {'UserId': 19

In [27]:
userRecommends = loaded_model.recommendForAllUsers(5)
loanRecommends = loaded_model.recommendForAllItems(5)

In [28]:
userRecommends.select(["userId","recommendations.Cluster_result","recommendations.rating"]).show(10,truncate=False)

24/03/14 10:57:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

+------+--------------------+---------------------------------------------------------+
|userId|Cluster_result      |rating                                                   |
+------+--------------------+---------------------------------------------------------+
|1     |[33, 1, 37, 30, 38] |[3.4242477, 3.1129024, 2.9915998, 2.9827292, 2.9775407]  |
|3     |[33, 1, 37, 16, 18] |[1.1747626, 1.0520585, 0.9929635, 0.9924082, 0.9914246]  |
|5     |[33, 1, 37, 16, 27] |[2.3101795, 2.087025, 1.991659, 1.9850262, 1.9831308]    |
|6     |[33, 1, 37, 16, 27] |[1.132987, 1.0258465, 0.9933879, 0.99183655, 0.99106026] |
|12    |[33, 1, 16, 27, 22] |[2.2909837, 2.0677474, 1.9845971, 1.9833524, 1.9828582]  |
|13    |[33, 1, 16, 18, 19] |[1.166576, 1.0476619, 0.99247843, 0.99130213, 0.9910941] |
|16    |[33, 38, 37, 16, 27]|[1.0991187, 1.0034589, 0.99002415, 0.9896106, 0.98951143]|
|19    |[33, 1, 16, 27, 22] |[2.2992384, 2.066215, 1.9844755, 1.9823893, 1.98212]     |
|20    |[33, 1, 16, 27, 19] |[1.

                                                                                

In [29]:
actual = [listo for listo in actual_loans_taken_dict.values()]

In [30]:
predicted = [listo['Recommendation'] for listo in top_n_recommendations]

In [31]:
def average_precision_at_k(actual, predicted, k):
    # Initialize variables
    num_hits = 0
    precision_at_k = 0.0

    # Iterate over the predicted list up to k
    for i, p in enumerate(predicted[:k]):
        # Check if the predicted item at position i is in the actual list
        if p in actual:
            num_hits += 1
            # Calculate precision at position i
            precision_at_k += num_hits / (i + 1)

    # Calculate average precision at k
    if not actual:
        return 0.0
    else:
        return precision_at_k / min(len(actual), k)

def map_at_k(actual, predicted, k):
    # Calculate average precision at k for each query or user
    average_precisions = [average_precision_at_k(a, p, k) for a, p in zip(actual, predicted)]
    
    # Return the mean of average precision at k values
    return sum(average_precisions) / len(average_precisions)

In [32]:
k = 5
map_at_5 = map_at_k(actual, predicted, k)
print("MAP@{}: {:.4f}".format(k, map_at_5))

MAP@5: 0.2166
