In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, countDistinct, count
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
import pandas as pd
import time

In [13]:
import matplotlib.pyplot as plt

In [2]:
spark = SparkSession.builder.appName('loan_recommendation_with_clusters').config("spark.driver.memory", "15g").getOrCreate()

In [3]:
spark

In [23]:
loans_df = spark.read.csv("Loan_Dataset/df_temp_cluster.csv", inferSchema=True, header=True)
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [24]:
loans_df.select(["LoanIdFormat","Amount","Total_Repayments__c","Payment_Frequency__c","count"]).show(10)

+------------+------+-------------------+--------------------+-----+
|LoanIdFormat|Amount|Total_Repayments__c|Payment_Frequency__c|count|
+------------+------+-------------------+--------------------+-----+
|           1|  2000|                  5|         Fortnightly|    0|
|           1|  2050|                  8|         Fortnightly|    0|
|           1|  2050|                  8|         Fortnightly|    2|
|           1|  2500|                 10|         Fortnightly|    2|
|           1|  1600|                  7|         Fortnightly|    4|
|           1|  2000|                  3|         Fortnightly|   56|
|           1|  2050|                  8|         Fortnightly|   53|
|           1|  1500|                  7|         Fortnightly|   44|
|           1|  1300|                  7|         Fortnightly|   46|
|           1|  1500|                 10|         Fortnightly|   45|
+------------+------+-------------------+--------------------+-----+
only showing top 10 rows



In [25]:
loans_df.select(["LoanIdFormat","count","Amount","Total_Repayments__c","Number_Of_Loans_Granted__c","Num_Of_Loans_Paid__c"]).describe().show()

+-------+------------------+------------------+------------------+-------------------+--------------------------+--------------------+
|summary|      LoanIdFormat|             count|            Amount|Total_Repayments__c|Number_Of_Loans_Granted__c|Num_Of_Loans_Paid__c|
+-------+------------------+------------------+------------------+-------------------+--------------------------+--------------------+
|  count|              5010|              5010|              5010|               5010|                      5010|                5010|
|   mean| 8.197604790419161|16.642115768463075|1337.9241516966067| 10.216167664670659|        22.779041916167664|  19.710578842315368|
| stddev|7.5725066595349535|11.943178454402434|1060.9889851020196|  5.706869110532655|        12.770361468751108|  12.253513797420753|
|    min|                 1|                -4|               200|                  3|                         6|                   4|
|    max|                44|                69|        

In [26]:
loans_df.show(vertical=True)

-RECORD 0------------------------------------------
 Contact__c                 | 0032x00000TC3rzAAD   
 Min_IT_Loan_ID__c          | 847887               
 Opp_Number__c              | 4548477              
 Id                         | 0062x00000D8l3XAAR   
 AccountID                  | 0012x00000cx1cOAAQ   
 Number_Of_Loans_Granted__c | 10                   
 Num_Of_Loans_Paid__c       | 5                    
 Purpose_of_Loan__c         | Travel Expenses      
 Total_Repayments__c        | 5                    
 Amount                     | 2000                 
 Term_in_Weeks__c           | 8.857                
 Payment_Frequency__c       | Fortnightly          
 StageName                  | Loan Paid            
 userId                     | 884                  
 loanId                     | 116                  
 count                      | 0                    
 LoanIdFormat               | 1                    
-RECORD 1------------------------------------------
 Contact__c 

In [8]:
# Count the number of unique items
num_unique_loan_id_format = loans_df.select('LoanIdFormat').distinct().count()
print(f"Number of unique loanId_format: {num_unique_loan_id_format}")

# Count the number of unique users
num_unique_id = loans_df.select('Id').distinct().count()
print(f"Number of unique Loan Id: {num_unique_id}")

Number of unique loanId_format: 27
Number of unique Loan Id: 4571


In [9]:
# Count the number of unique items
tmp = loans_df.select('userId').distinct().count()
print(f"Number of unique users: {tmp}")

# Count the number of unique users
# tmp = ratings_df.select('movieId').distinct().count()
# print(f"Number of unique movies: {tmp}")

Number of unique users: 2864


In [10]:
# distinct values of the rating
loans_df.select("count").distinct().show()

+-----+
|count|
+-----+
|   31|
|   65|
|   53|
|   34|
|   -1|
|   28|
|   27|
|   26|
|   44|
|   12|
|   22|
|   47|
|    1|
|   52|
|   13|
|   16|
|    6|
|    3|
|   40|
|   20|
+-----+
only showing top 20 rows



========================================================================================================

Converting the dataset into train test and validation split.

In [11]:
# sc = spark.sparkContext

In [12]:
loans_df.select(["LoanIdFormat","count","Amount"]).show(10)

+------------+-----+------+
|LoanIdFormat|count|Amount|
+------------+-----+------+
|           1|    0|  2000|
|           1|    0|  2050|
|           1|    2|  2050|
|           1|    2|  2500|
|           1|    4|  1600|
|           1|   56|  2000|
|           1|   53|  2050|
|           1|   44|  1500|
|           1|   46|  1300|
|           1|   45|  1500|
+------------+-----+------+
only showing top 10 rows



In [14]:
train, validation = loans_df.randomSplit([0.8, 0.2])
# train, validation, test = loans_df.randomSplit([0.8, 0.1, 0.1])
# # cache data
train.cache()
validation.cache()
# test.cache()

DataFrame[Contact__c: string, Min_IT_Loan_ID__c: int, Opp_Number__c: int, Id: string, AccountID: string, Number_Of_Loans_Granted__c: int, Num_Of_Loans_Paid__c: int, Purpose_of_Loan__c: string, Total_Repayments__c: int, Amount: int, Term_in_Weeks__c: double, Payment_Frequency__c: string, StageName: string, userId: int, loanId: int, count: int, LoanIdFormat: int]

In [16]:
validation.select(["userId","loanId","LoanIdFormat","count","Amount","Total_Repayments__c"]).show()

+------+------+------------+-----+------+-------------------+
|userId|loanId|LoanIdFormat|count|Amount|Total_Repayments__c|
+------+------+------------+-----+------+-------------------+
|  2030|  2807|           5|   30|   300|                 10|
|  1184|  2785|           4|   19|  1000|                  9|
|   170|  1888|           1|   21|  2050|                  8|
|    34|  3703|           1|   25|  2050|                  8|
|  2071|  2408|          17|   23|  1500|                 16|
|  1044|   214|           4|   -4|   500|                  5|
|  2773|  4212|          21|   23|  3000|                  5|
|   796|  1345|          17|   17|  2050|                 16|
|   990|  3863|          17|   32|  2050|                 16|
|   990|  3832|           8|   34|  1000|                 14|
|   990|  2384|          14|   38|  2050|                 26|
|   791|   766|          29|   19|  5000|                 17|
|  1775|  1914|           8|    0|   400|                 14|
|   326|

In [14]:
loans_df.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)



In [15]:
def train_ALS(train_data, validation_data, num_iters, reg_param, ranks):
    """
    Grid Search Function to select the best model based on RMSE of hold-out data
    """
    # initial
    min_error = float('inf')
    best_rank = -1
    best_regularization = 0
    best_model = None
    for rank in ranks:
        for reg in reg_param:
            # Train the model using the training data
            als_model = ALS(maxIter=num_iters,regParam=reg, rank=rank,
                                        userCol='userId', itemCol='loanId', ratingCol='count', seed=99)
            model = als_model.fit(train_data)
            
            # Generate predictions on the test data
            predictions = model.transform(validation_data)
            predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 65 THEN 65 ELSE prediction END"))
            
            evaluator = RegressionEvaluator(metricName='rmse', labelCol='count', predictionCol='prediction')
            error = evaluator.evaluate(predictions)
            
            print('{} latent factors and regularization = {}: validation RMSE is {}'.format(rank, reg, error))
            if error < min_error:
                min_error = error
                best_rank = rank
                best_regularization = reg
                best_model = model
    print('\nThe best model has {} latent factors and regularization = {}'.format(best_rank, best_regularization))
    return best_model

In [16]:
# hyper-param config
num_iterations = 10
ranks = [8, 10, 20, 40, 60]
reg_params = [0.001, 0.01, 0.05, 0.1, 0.2]

# grid search and select best model
start_time = time.time()
final_model = train_ALS(train, validation, num_iterations, reg_params, ranks)

print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))

8 latent factors and regularization = 0.001: validation RMSE is 46.81493100431321
8 latent factors and regularization = 0.01: validation RMSE is 46.81493101264122
8 latent factors and regularization = 0.05: validation RMSE is 46.814931448667615
8 latent factors and regularization = 0.1: validation RMSE is 46.814933848323875
8 latent factors and regularization = 0.2: validation RMSE is 46.81494850973385
10 latent factors and regularization = 0.001: validation RMSE is 46.814931004313195
10 latent factors and regularization = 0.01: validation RMSE is 46.81493101264118
10 latent factors and regularization = 0.05: validation RMSE is 46.814931448669604
10 latent factors and regularization = 0.1: validation RMSE is 46.814933848322106
10 latent factors and regularization = 0.2: validation RMSE is 46.81494850974927
20 latent factors and regularization = 0.001: validation RMSE is 46.81493100431318
20 latent factors and regularization = 0.01: validation RMSE is 46.81493101264079
20 latent factors

In [20]:
# 40 latent factors and regularization = 0.001

In [31]:
predictions = final_model.transform(validation)
# predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 65 THEN 65 ELSE prediction END"))

In [32]:
predictions.printSchema()

root
 |-- Contact__c: string (nullable = true)
 |-- Min_IT_Loan_ID__c: integer (nullable = true)
 |-- Opp_Number__c: integer (nullable = true)
 |-- Id: string (nullable = true)
 |-- AccountID: string (nullable = true)
 |-- Number_Of_Loans_Granted__c: integer (nullable = true)
 |-- Num_Of_Loans_Paid__c: integer (nullable = true)
 |-- Purpose_of_Loan__c: string (nullable = true)
 |-- Total_Repayments__c: integer (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Term_in_Weeks__c: double (nullable = true)
 |-- Payment_Frequency__c: string (nullable = true)
 |-- StageName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- loanId: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- LoanIdFormat: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [33]:
predictions.select(["LoanIdFormat","loanId","userId","Amount","Total_Repayments__c","count","prediction"]).show(11)

+------------+------+------+------+-------------------+-----+----------+
|LoanIdFormat|loanId|userId|Amount|Total_Repayments__c|count|prediction|
+------------+------+------+------+-------------------+-----+----------+
|          17|  2748|   990|  1500|                 14|   30|       NaN|
|           2|  1496|  2071|  1750|                  7|   17|       NaN|
|           5|  1628|    33|  1000|                 10|   26| 25.999958|
|          17|  1991|   326|  2050|                 16|   35|       NaN|
|           1|  2298|  1926|  2500|                  8|   25|       NaN|
|          38|   130|  2837|  3000|                 24|   19|       NaN|
|           5|  2807|  2030|   300|                 10|   30|       NaN|
|           8|  2807|  2030|   300|                 10|   30|       NaN|
|           1|  3455|  1114|  2050|                  8|   28|       NaN|
|           1|  3113|    34|  2050|                  8|   27| 26.999956|
|          17|  2897|   326|  2050|                

In [34]:
predictions.groupBy(predictions.prediction).count().sort(predictions.prediction).show(50)

+----------+-----+
|prediction|count|
+----------+-----+
|  0.998999|    2|
|0.99899906|    1|
| 1.9994923|    1|
| 1.9994924|    1|
| 2.9993901|    1|
| 2.9993904|    1|
| 2.9996605|    1|
| 2.9996612|    1|
| 3.9997454|    1|
| 3.9997458|    1|
| 4.9997005|    1|
|  4.999796|    1|
|  4.999797|    1|
| 5.9998307|    1|
|  5.999888|    1|
| 5.9999084|    1|
| 7.9998727|    1|
|   8.99986|    1|
| 8.9998865|    1|
|  8.999887|    1|
|  9.999851|    1|
| 11.999915|    1|
| 12.999924|    1|
| 12.999947|    1|
| 15.999958|    1|
|  16.99995|    1|
| 20.999939|    2|
| 21.999952|    1|
| 21.999958|    1|
| 21.999964|    1|
| 23.999954|    1|
| 24.999935|    1|
| 24.999964|    1|
| 25.999958|    2|
| 25.999962|    1|
| 25.999964|    1|
| 26.999956|    1|
| 26.999958|    1|
| 28.999952|    1|
| 28.999962|    1|
| 28.999973|    1|
| 29.999954|    2|
| 29.999958|    1|
| 32.999966|    1|
| 34.999947|    1|
| 34.999958|    1|
|  35.99996|    1|
| 43.999977|    1|
| 47.999973|    1|
|       NaN|

In [38]:
validation.select("LoanIdFormat","Id","Amount").describe().show()

+-------+-----------------+------------------+------------------+
|summary|     LoanIdFormat|                Id|            Amount|
+-------+-----------------+------------------+------------------+
|  count|              468|               468|               468|
|   mean|8.352564102564102|              NULL|1425.2136752136753|
| stddev|7.453040269388324|              NULL|   1100.9509989081|
|    min|                1|0062x00000CRBlQAAX|               200|
|    max|               38|0062x00000FJxpLAAT|              5000|
+-------+-----------------+------------------+------------------+



In [21]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
error = evaluator.evaluate(predictions)

In [23]:
print(f"The RMSE for the above mnodel is : {error}")

The RMSE for the above mnodel is : 0.850732304490565


In [24]:
predictions.join(movies_df,"movieId").select("userId","title","prediction").show(10,truncate=False)

+------+---------------------------------+----------+
|userId|title                            |prediction|
+------+---------------------------------+----------+
|148   |NeverEnding Story III, The (1994)|1.0       |
|148   |Safe (1995)                      |2.24905   |
|148   |Body Snatchers (1993)            |1.9846423 |
|148   |Dave (1993)                      |2.358632  |
|148   |In the Line of Fire (1993)       |2.491284  |
|148   |Batman (1989)                    |2.3913064 |
|148   |My Fair Lady (1964)              |2.5859952 |
|148   |Roman Holiday (1953)             |2.7362595 |
|148   |Wizard of Oz, The (1939)         |2.747697  |
|148   |Gone with the Wind (1939)        |2.7169576 |
+------+---------------------------------+----------+
only showing top 10 rows



In [25]:
userRecommends = final_model.recommendForAllUsers(5)
movieRecommends = final_model.recommendForAllItems(5)

In [26]:
movieRecommends.printSchema()

root
 |-- movieId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- userId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [27]:
movieRecommends.select(["movieId","recommendations.userId","recommendations.rating"]).show(10)

+-------+--------------------+--------------------+
|movieId|              userId|              rating|
+-------+--------------------+--------------------+
|      1|[275, 6415, 5082,...|[5.1801147, 5.176...|
|     12|[2860, 540, 2624,...|[4.404322, 4.2132...|
|     13|[6285, 6195, 8787...|[4.942979, 4.8070...|
|     22|[7318, 3178, 6707...|[4.813835, 4.7732...|
|     26|[3208, 6707, 9768...|[4.539195, 4.4994...|
|     27|[1277, 6230, 5120...|[4.79809, 4.78447...|
|     28|[6707, 6066, 2072...|[4.9017997, 4.802...|
|     31|[1713, 4908, 3804...|[4.843358, 4.7032...|
|     34|[6195, 6628, 7094...|[5.0688567, 5.059...|
|     44|[7030, 7318, 4194...|[4.433797, 4.3552...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [28]:
userRecommends.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [30]:
userRecommends.select(["userId","recommendations.movieId","recommendations.rating"]).show(10,truncate=False)

+------+----------------------------------------+-------------------------------------------------------+
|userId|movieId                                 |rating                                                 |
+------+----------------------------------------+-------------------------------------------------------+
|1     |[176887, 92954, 192025, 192015, 134879] |[5.1432505, 5.060037, 5.035781, 5.035781, 5.035781]    |
|12    |[134236, 176887, 136495, 5365, 32139]   |[5.1536174, 5.1192183, 4.9322424, 4.932063, 4.8875637] |
|13    |[107361, 170799, 26424, 176887, 171715] |[6.030097, 5.835145, 5.792248, 5.7379646, 5.6894727]   |
|22    |[179297, 139327, 107361, 26424, 37257]  |[4.811372, 4.811372, 4.8074174, 4.7039056, 4.685463]   |
|26    |[179297, 139327, 26424, 66854, 107361]  |[5.3037553, 5.3037553, 5.2357116, 5.189636, 5.1757894] |
|27    |[107361, 179297, 139327, 192001, 171363]|[5.2407384, 5.149643, 5.149643, 5.013967, 5.013967]    |
|28    |[107361, 102323, 134236, 26424, 60201]

In [36]:
movieRecommends.filter(movieRecommends.movieId == 59315 ).show(vertical=True,truncate=False)

-RECORD 0---------------------------------------------------------------------------------------------------------
 movieId         | 59315                                                                                          
 recommendations | [{5668, 5.1048603}, {7034, 5.0878706}, {7251, 5.0698075}, {5424, 5.052721}, {5120, 5.0273542}] 



In [None]:
{5668, 5.1048603}, ==> White Oleander (2002)
{7034, 5.0878706}, ==> Show Me Love (Fucking Ã…mÃ¥l) (1998)
{7251, 5.0698075}, ==> Where the Day Takes You (1992)
{5424, 5.052721},  ==> Harvard Man (2001)
{5120, 5.0273542}  ==> Sleuth (1972)

In [None]:
# # For example, 30% of items will be masked
# percent_items_to_mask = 0.3 
# # Determine the number of items to mask for each user
# df_rec_final = df_rec_filtered.withColumn("num_items_to_mask", (col("num_items") * percent_items_to_mask).cast("int"))
# # Masks items for each user
# df_rec_final = df_rec_final.withColumn("item_rank", rank().over(user_window))

# # Create a StringIndexer model to index the user ID column
# indexer_user = StringIndexer(inputCol='userId', outputCol='userIndex').setHandleInvalid("keep")
# indexer_item = StringIndexer(inputCol='itemId', outputCol='itemIndex').setHandleInvalid("keep")

# # Fit the indexer model to the data and transform the DataFrame
# df_rec_final = indexer_user.fit(df_rec_final).transform(df_rec_final)
# df_rec_final = indexer_item.fit(df_rec_final).transform(df_rec_final)

# # Convert the userIndex column to integer type
# df_rec_final = df_rec_final.withColumn('userIndex', df_rec_final['userIndex'].cast('integer'))\
#                .withColumn('itemIndex', df_rec_final['itemIndex'].cast('integer'))

# train_df_rec = df_rec_final.filter(col("item_rank") > col("num_items_to_mask"))
# test_df_rec = df_rec_final.filter(col("item_rank") <= col("num_items_to_mask"))

In [9]:
training_df, validation_df = ratings_df.randomSplit([.8,.2])

In [12]:
# Configure the ALS model
als = ALS(userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)

param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [1, 4, 10, 20, 30])\
             .addGrid(als.maxIter, [10 ,12,18,20])\
             .addGrid(als.regParam, [0.001, 0.01, .05, .15])\
             .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3)

In [13]:
model = cv.fit(training_df)

best_model = model.bestModel
print('rank: ', best_model.rank)
print('MaxIter: ', best_model._java_obj.parent().getMaxIter())
print('RegParam: ', best_model._java_obj.parent().getRegParam())

rank:  1
MaxIter:  20
RegParam:  0.01


In [16]:
# Train the model using the training data
als_using_best_params = ALS(maxIter = best_model._java_obj.parent().getMaxIter(),regParam = best_model._java_obj.parent().getRegParam(),rank = best_model.rank,
        userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)
model = als_using_best_params.fit(training_df)

# Generate predictions on the test data
predictions = best_model.transform(validation_df)
predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 5 THEN 5 ELSE prediction END"))

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 0.8744795298421134


In [34]:
predictions.show(10)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   322|   1580|   3.5|1217676294|  3.068567|
|   593|   1580|   1.5|1181007882|  3.054487|
|   597|   1580|   3.0| 941558308| 3.8594012|
|   597|   2366|   5.0| 941729029|  3.914689|
|   368|   2366|   4.0| 975828914| 2.9883285|
|   368|   3918|   2.0| 971273835| 3.6173298|
|   115|   1645|   4.0| 957648208| 3.6923604|
|   385|   1238|   3.0| 865026050|  4.028213|
|   183|   1580|   4.0| 992331667|  3.525297|
|   436|    471|   3.0| 833530187| 3.5021195|
+------+-------+------+----------+----------+
only showing top 10 rows



In [37]:
predictions.join(movies_df,"movieId").select("userId","title","prediction").show(10,truncate=False)

+------+--------------------------------+----------+
|userId|title                           |prediction|
+------+--------------------------------+----------+
|322   |Men in Black (a.k.a. MIB) (1997)|3.068567  |
|593   |Men in Black (a.k.a. MIB) (1997)|3.054487  |
|597   |Men in Black (a.k.a. MIB) (1997)|3.8594012 |
|597   |King Kong (1933)                |3.914689  |
|368   |King Kong (1933)                |2.9883285 |
|368   |Hellbound: Hellraiser II (1988) |3.6173298 |
|115   |The Devil's Advocate (1997)     |3.6923604 |
|385   |Local Hero (1983)               |4.028213  |
|183   |Men in Black (a.k.a. MIB) (1997)|3.525297  |
|436   |Hudsucker Proxy, The (1994)     |3.5021195 |
+------+--------------------------------+----------+
only showing top 10 rows



In [38]:
+

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   322|   1580|   3.5|1217676294|  3.068567|
|   593|   1580|   1.5|1181007882|  3.054487|
|   597|   1580|   3.0| 941558308| 3.8594012|
|   597|   2366|   5.0| 941729029|  3.914689|
|   368|   2366|   4.0| 975828914| 2.9883285|
|   368|   3918|   2.0| 971273835| 3.6173298|
|   115|   1645|   4.0| 957648208| 3.6923604|
|   385|   1238|   3.0| 865026050|  4.028213|
|   183|   1580|   4.0| 992331667|  3.525297|
|   436|    471|   3.0| 833530187| 3.5021195|
+------+-------+------+----------+----------+
only showing top 10 rows



In [27]:
userRecommends = model.recommendForAllUsers(5)
movieRecommends = model.recommendForAllItems(5)

In [30]:
userRecommends.select(["userId","recommendations"]).show(10,truncate=False)

+------+-----------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                |
+------+-----------------------------------------------------------------------------------------------+
|1     |[{6835, 10.826822}, {5746, 10.826822}, {5181, 10.826822}, {7899, 9.74414}, {5764, 9.74414}]    |
|2     |[{6835, 8.613483}, {5746, 8.613483}, {5181, 8.613483}, {7899, 7.752135}, {5764, 7.752135}]     |
|3     |[{6835, 4.9506826}, {5746, 4.9506826}, {5181, 4.9506826}, {7899, 4.4556146}, {5764, 4.4556146}]|
|4     |[{6835, 8.38581}, {5746, 8.38581}, {5181, 8.38581}, {7899, 7.547229}, {5764, 7.547229}]        |
|5     |[{6835, 8.882423}, {5746, 8.882423}, {5181, 8.882423}, {7899, 7.994181}, {5764, 7.994181}]     |
|6     |[{6835, 9.649032}, {5746, 9.649032}, {5181, 9.649032}, {7899, 8.684129}, {5764, 8.684129}]     |
|7     |[{6835, 8.051511}, {5746, 8.051511}, {5181, 8.0

In [33]:
movieRecommends.select(["movieId","recommendations.userId"]).show(10,truncate=False)

+-------+---------------------+
|movieId|userID               |
+-------+---------------------+
|1      |[53, 43, 276, 12, 93]|
|12     |[53, 43, 276, 12, 93]|
|13     |[53, 43, 276, 12, 93]|
|22     |[53, 43, 276, 12, 93]|
|26     |[53, 43, 276, 12, 93]|
|27     |[53, 43, 276, 12, 93]|
|28     |[53, 43, 276, 12, 93]|
|31     |[53, 43, 276, 12, 93]|
|34     |[53, 43, 276, 12, 93]|
|44     |[53, 43, 276, 12, 93]|
+-------+---------------------+
only showing top 10 rows

