# ALS Recommendation System

### Setting up

In [2]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import isnan, when, count, col, avg, first
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, BooleanType, DateType, FloatType
import pandas as pd
from functools import reduce
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.window import Window
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession \
  .builder \
  .master('yarn') \
  .appName('spark-bigquery-demo') \
  .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
bucket = "big-data-yelp"
spark.conf.set('temporaryGcsBucket', bucket)

### Loading Business

In [5]:
table = 'red-formula-339716:gfds.yelp_business_basicdata'
df_b = spark.read.format('bigquery').option('table', table).load()

### Flitering Restaurants and Food

In [6]:
df_r = df_b.filter(df_b.state.like("OH")).filter(df_b.categories.like("%Restaurants%")).drop('int64_field_0', 'categories','review_count', 'address','postal_code','latitude', 'longitude', 'stars', 'is_open')

In [7]:
del df_b

### Loading Reviews

In [8]:
table_re = 'red-formula-339716:gfds.yelp_review'
df_re = spark.read.format('bigquery').option('table', table_re).load()

In [9]:
df_re = df_re.drop('cool','funny','useful', 'date', 'review_id', 'compliment_count')

### Joining and Group Reviews by User_ID

In [10]:
dfj = df_r.join(df_re, ['business_id'], "inner").drop("text")

In [11]:
dfj.show(5)

                                                                                

+--------------------+--------------------+------------+-----+-----+--------------------+
|         business_id|                name|        city|state|stars|             user_id|
+--------------------+--------------------+------------+-----+-----+--------------------+
|N5yvv_q8h5omlj4x5...|      Lucky's Market|    Columbus|   OH|  5.0|4l5ZtYFm_5uISmcCX...|
|HfTfTRgaS1o2yAy2R...|Beer Barrel Pizza...|    Columbus|   OH|  1.0|fVzPWqOSA6YKt8GUD...|
|LRYPakb_X3u8ACIOs...|Buffalo Wings & R...|Lewis Center|   OH|  2.0|iRila5bK4DLSxFQqB...|
|k7VuAp0zAgoIKid1Y...|         Buckeye Pho|    Columbus|   OH|  4.0|SdMVWxstaq8vML3EB...|
|pfAhZeQf-TqxqG5Dx...|Eddie Merlot's - ...|    Columbus|   OH|  5.0|KrI7JN7ep4QKo9RCV...|
+--------------------+--------------------+------------+-----+-----+--------------------+
only showing top 5 rows



In [12]:
df1 = dfj.dropDuplicates()


In [13]:
df1.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- user_id: string (nullable = true)



In [14]:
df1=df1.drop('state')

##### Changing User_id and Business_id into integers (for ALS) 

In [15]:
df1 = df1.withColumn("business_id1", F.dense_rank().over(Window.orderBy(df1.business_id)))

In [16]:
df1.show(5)

22/03/12 00:17:44 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:17:44 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:17:45 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:00 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:04 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+--------------------+---------+--------+-----+--------------------+------------+
|         business_id|     name|    city|stars|             user_id|business_id1|
+--------------------+---------+--------+-----+--------------------+------------+
|--Q3mAcX9t63f7Xcb...|The Royce|Columbus|  4.0|c0uIYXfInsCHPZEov...|           1|
|--Q3mAcX9t63f7Xcb...|The Royce|Columbus|  5.0|xfEHeJKVrUQyfsUc3...|           1|
|--Q3mAcX9t63f7Xcb...|The Royce|Columbus|  5.0|CzV02GnQyducWdyp_...|           1|
|--Q3mAcX9t63f7Xcb...|The Royce|Columbus|  2.0|yG2fIcbl_v2-58q1c...|           1|
|--Q3mAcX9t63f7Xcb...|The Royce|Columbus|  5.0|FP9xU8cFzU9agsQBl...|           1|
+--------------------+---------+--------+-----+--------------------+------------+
only showing top 5 rows



                                                                                

In [17]:
df1 = df1.withColumn("user_id1", F.dense_rank().over(Window.orderBy(df1.user_id)))

In [18]:
df1.show(5)

22/03/12 00:18:07 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:07 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:07 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:11 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:11 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+--------------------+--------------------+--------+-----+--------------------+------------+--------+
|         business_id|                name|    city|stars|             user_id|business_id1|user_id1|
+--------------------+--------------------+--------+-----+--------------------+------------+--------+
|qa4SegtG2bWMBhJgW...|          Katalina's|Columbus|  5.0|--1_pDM1pQ26cqhLx...|        3724|       1|
|32AcG_zpsPzMgo0aW...|Stack City Burger...|Columbus|  4.0|--2PnhMMH7EYoY3wy...|         257|       2|
|8lS-sVYxXqVbhV8vj...|     Hong Kong House|Columbus|  4.0|--2PnhMMH7EYoY3wy...|         656|       2|
|AEzIqFtXrJITE4toG...|   Mark Pi's Express|Columbus|  3.0|--2PnhMMH7EYoY3wy...|         758|       2|
|IHCD--427ou0ODW6J...|          Brazenhead|  Dublin|  4.0|--2PnhMMH7EYoY3wy...|        1281|       2|
+--------------------+--------------------+--------+-----+--------------------+------------+--------+
only showing top 5 rows



                                                                                

##### Final dataframe for ALS 

In [19]:
df_final = df1.drop('business_id','user_id','name','city')

In [20]:
df_final = df_final.withColumnRenamed("business_id1","business_id")
df_final = df_final.withColumnRenamed("user_id1","user_id")

In [21]:
df_final.show(5)

22/03/12 00:18:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+
|stars|business_id|user_id|
+-----+-----------+-------+
|  5.0|       3724|      1|
|  4.0|        257|      2|
|  4.0|        656|      2|
|  3.0|        758|      2|
|  4.0|       1281|      2|
+-----+-----------+-------+
only showing top 5 rows



                                                                                

In [22]:
df_final = df_final.dropDuplicates()

In [23]:
df_final.show()

22/03/12 00:18:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:29 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:29 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:18:31 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+
|stars|business_id|user_id|
+-----+-----------+-------+
|  5.0|       3724|      1|
|  4.0|        257|      2|
|  4.0|        656|      2|
|  3.0|        758|      2|
|  4.0|       1281|      2|
|  4.0|       1370|      2|
|  4.0|       1389|      2|
|  5.0|       1724|      2|
|  4.0|       1977|      2|
|  5.0|       2099|      2|
|  4.0|       2319|      2|
|  5.0|       2571|      2|
|  4.0|       2618|      2|
|  5.0|       2902|      2|
|  4.0|       3169|      2|
|  5.0|       3558|      2|
|  5.0|       3745|      2|
|  4.0|       3747|      2|
|  5.0|       3756|      2|
|  4.0|       3999|      2|
+-----+-----------+-------+
only showing top 20 rows



                                                                                

##### Business table consisting only of business_id1 (in integer), name, city to join later with Predictions

In [24]:
business = df1.drop('stars','user_id','user_id1','business_id')

In [25]:
business = business.dropDuplicates()

## ALS

In [39]:
training, test = df_final.randomSplit([0.80,0.20])

In [40]:
ALSExplicit = ALS( implicitPrefs=False, userCol="user_id", itemCol="business_id", ratingCol="stars",
          coldStartStrategy="drop")

defaultModel = ALSExplicit.fit(training)

22/03/12 00:53:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:04 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

In [41]:
paramMapExplicit = ParamGridBuilder() \
                    .addGrid(ALSExplicit.numUserBlocks, [8,12]) \
                    .addGrid(ALSExplicit.numItemBlocks, [8,12]) \
                    .addGrid(ALSExplicit.rank, [8,12]) \
                    .addGrid(ALSExplicit.maxIter, [5,10]) \
                    .addGrid(ALSExplicit.regParam, [0.1,1]) \
                    .addGrid(ALSExplicit.alpha, [1,2]) \
                    .build()

evaluatorR = RegressionEvaluator(metricName="rmse", labelCol="stars")

In [42]:
CVALSExplicit = CrossValidator(estimator=ALSExplicit,
                            estimatorParamMaps=paramMapExplicit,
                            evaluator=evaluatorR,
                           numFolds=5)


In [None]:
CVModelEXplicit = CVALSExplicit.fit(training)


22/03/12 00:53:16 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:16 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:16 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:53:16 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 00:57:35 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

In [None]:
predictions = CVModelEXplicit.bestModel.transform(test)
predictions.show(5)

22/03/12 01:16:43 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:43 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:44 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:44 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:44 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+----------+
|stars|business_id|user_id|prediction|
+-----+-----------+-------+----------+
|  1.0|          1|  33385| 1.0111362|
|  1.0|          3|   5365|  1.873271|
|  1.0|          3|  23723| 2.2786682|
|  1.0|          3|  75980| 0.8214189|
|  1.0|          4|  14959| 3.5914705|
+-----+-----------+-------+----------+
only showing top 5 rows



                                                                                

In [59]:
CVModelEXplicit.bestModel.itemFactors.show(10, truncate = False)

+---+-------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                         |
+---+-------------------------------------------------------------------------------------------------------------------------------------------------+
|8  |[-0.07405181, 0.67826533, 0.37562102, 0.2925481, 0.4087645, 0.1958106, -0.370477, -0.10894719, 0.021125065, -0.19569635, -0.7276767, 0.0481807]  |
|16 |[0.4243349, -0.08812155, 0.15378655, 1.0680585, 0.80485785, -0.024866987, -0.04728186, 0.26328558, -0.3074677, 1.056447, -0.3869509, 0.619342]   |
|24 |[0.5507934, 0.62055606, 0.44106266, 0.18943879, 1.0267031, -0.106826454, 0.33354223, 0.5315679, -0.1605001, 0.35302967, 0.41552848, 1.1175851]   |
|32 |[0.40562364, 0.6678378, 0.2087543, 0.6126864, 0.98302084, -0.07462093, -0.5532373, 

In [None]:
predictions = business.join(predictions, predictions.business_id==business.business_id1, how='inner')

In [None]:
predictions = predictions.na.drop()
predictions.show(50, truncate = False)

22/03/12 01:16:49 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:49 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:49 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:16:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+--------------------------------+---------------+------------+-----+-----------+-------+----------+
|name                            |city           |business_id1|stars|business_id|user_id|prediction|
+--------------------------------+---------------+------------+-----+-----------+-------+----------+
|The Royce                       |Columbus       |1           |1.0  |1          |33385  |1.0111362 |
|KFC                             |Hilliard       |3           |1.0  |3          |5365   |1.873271  |
|KFC                             |Hilliard       |3           |1.0  |3          |23723  |2.2786682 |
|KFC                             |Hilliard       |3           |1.0  |3          |75980  |0.8214189 |
|ZenCha Tea Cafe                 |Bexley         |4           |1.0  |4          |14959  |3.5914705 |
|Happy Wok                       |Pickerington   |5           |1.0  |5          |38133  |3.576452  |
|Morone's Italian Villa          |Columbus       |6           |1.0  |6          |24690  |1.

                                                                                

##### RMSE

In [47]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="stars",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error is {}".format(rmse))

22/03/12 01:18:21 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:21 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:21 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

Root-mean-square error is 1.4857646771760125


                                                                                

##### R2

In [48]:
evaluator = RegressionEvaluator(metricName="r2", labelCol="stars",
                                predictionCol="prediction", throughOrigin=True)
r2 = evaluator.evaluate(predictions)

print("r2 error is {}".format(r2))

22/03/12 01:18:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:29 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:18:29 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

r2 error is 0.8598392717708458


                                                                                

##### Generate top 10 business recommendations for each user


In [49]:
userRecs = CVModelEXplicit.bestModel.recommendForAllUsers(10)

In [50]:
userRecs.show(5)

22/03/12 01:23:26 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 14.0 in stage 65151.0 (TID 162155) (bigdataprojectfinal-w-0.us-central1-b.c.red-formula-339716.internal executor 15): java.lang.StackOverflowError
	at java.io.ObjectInputStream$BlockDataInputStream.peekByte(ObjectInputStream.java:3109)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1837)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2074)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1657)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2119)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1657)
	at java.io.ObjectInputStream.defau

Container id: container_1647044044622_0002_01_000021
Exit code: 50

[2022-03-12 01:23:27.728]Container exited with a non-zero exit code 50. Error file: prelaunch.err.
Last 4096 bytes of prelaunch.err :
Last 4096 bytes of stderr :
(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:52



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|     31|[{2217, 2.5845842...|
|     34|[{2217, 3.1959221...|
|     53|[{1337, 5.5929847...|
|     65|[{3988, 5.333324}...|
|     78|[{3231, 6.13207},...|
+-------+--------------------+
only showing top 5 rows



                                                                                

##### Generate top 10 user recommendations for each business

In [None]:
BusinessRecs = CVModelEXplicit.bestModel.recommendForAllItems(10)

In [51]:
BusinessRecs.show(5)



+-----------+--------------------+
|business_id|     recommendations|
+-----------+--------------------+
|         28|[{24175, 5.60053}...|
|         31|[{28777, 5.587188...|
|         34|[{83083, 5.780057...|
|         53|[{73340, 5.782814...|
|         65|[{81733, 6.457925...|
+-----------+--------------------+
only showing top 5 rows



                                                                                

##### Generate top 10 business recommendations for a specified set of users


In [56]:
users = df_final.select( CVModelEXplicit.bestModel.getUserCol()).distinct().limit(3)
userSubsetRecs = CVModelEXplicit.bestModel.recommendForUserSubset(users, 10)

userSubsetRecs.show(10, truncate=False)

22/03/12 01:30:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:30:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:30:24 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:30:24 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:30:26 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                            |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{2791, 5.7482758}, {2217, 5.485434}, {1029, 5.234603}, {627, 5.197041}, {2379, 5.079887}, {1115, 5.051845}, {129, 5.0370674}, {3370, 4.9783773}, {3549, 4.976562}, {2790, 4.96323}]       |
|3      |[{3988, 7.005713}, {2217, 6.525013}, {1953, 6.3541136}, {2859, 6.2487063}, {2977, 6.1182995}, {2036, 6.0590153}, {3231, 6.049763}, {1423, 6.0278826}, {4332, 6.0137014}, {3019, 5.9585557}]|
|2      |[

                                                                                

##### Generate top 10 user recommendations for a specified set of business


In [57]:
businesses = df_final.select(CVModelEXplicit.bestModel.getItemCol()).distinct().limit(3)
businessSubSetRecs = CVModelEXplicit.bestModel.recommendForItemSubset(businesses, 10)

businessSubSetRecs.show(10, truncate=False)

22/03/12 01:31:21 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:31:21 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:31:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:31:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/03/12 01:31:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|business_id|recommendations                                                                                                                                                                                   |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|257        |[{57986, 5.6981764}, {47788, 5.5353694}, {76830, 5.46328}, {38950, 5.4226103}, {36548, 5.411033}, {14080, 5.407138}, {15761, 5.4038715}, {1760, 5.3847013}, {80674, 5.3521786}, {7406, 5.3471394}]|
|3724       |[{67688, 5.986271}, {16075, 5.819498}, {64902, 5.6489186}, {51840, 5.6404643}, {47784, 5.6356544}, {39797, 5.6146955}, {78118, 5.610285}, {22799, 5.586

