# **IV. Yelp Review Recommendation System - Alternating Least Squares (ALS) algorithm**

In [None]:
# Keep only the business columns used downstream; the remaining columns are
# dropped because they contain missing values and add nothing to the model.
kept_columns = ['business_id', 'name', 'stars', 'review_count',
                'attributes', 'categories', 'city']
yelp_business = (
    yelp_business
    .select(*kept_columns)
    # Renamed so it cannot clash with the per-review 'stars' column after the join.
    .withColumnRenamed('stars', 'stars_restaurant')
)

# Restrict to Philadelphia businesses whose category string mentions 'Restaurants';
# 'city' is only needed for this filter, so it is dropped afterwards.
in_philadelphia = yelp_business['city'] == 'Philadelphia'
is_restaurant = yelp_business['categories'].contains('Restaurants')
yelp_business = yelp_business.where(in_philadelphia & is_restaurant).drop('city')

In [None]:
# Inner-join the reviews with the filtered business table so that only reviews
# of the selected restaurants remain; this is the input to the factorization.
yelp_restaurant_review = (
    yelp_review
    .select('*')
    .join(yelp_business, on='business_id', how='inner')
)

# Preview the three columns that form the (item, user, rating) triple.
yelp_restaurant_review.select('business_id', 'user_id', 'stars').show(5)

In [None]:
# Plotting a histogram of frequency of star ratings of restaurants from review dataset
%matplotlib inline
reviews = yelp_restaurant_review.select('stars').collect()
review_list = [reviews[i][0] for i in range(len(reviews))]
plt.hist(review_list, bins=[0.5,1.5,2.5,3.5,4.5,5.5], alpha=0.5,
         histtype='stepfilled', color='pink',
         edgecolor='none')
plt.ylabel('Frequency')
plt.xlabel('Rating')
plt.style.use('seaborn-white')

In [None]:
# Plotting a histogram of frequency of different star rating restaurants from the business dataset

# The style must be selected BEFORE drawing, otherwise it has no effect on this
# figure. matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'
# (the old names were later removed), so prefer the new name when it exists.
plot_style = ('seaborn-v0_8-white'
              if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')
plt.style.use(plot_style)

# Pull each restaurant's aggregate star rating to the driver as a Python list.
restaurant_reviews = yelp_business.select('stars_restaurant').collect()
restaurant_reviews_list = [row[0] for row in restaurant_reviews]

# Same bin edges as the review histogram above the half-integers, i.e. one bin
# per whole star. NOTE(review): if business ratings come in half-star steps,
# values exactly on a bin edge fall into the upper bin - confirm this is intended.
plt.hist(restaurant_reviews_list, bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5], alpha=0.5,
         histtype='stepfilled', color='purple',
         edgecolor='none')
plt.title('Aggregate star ratings of restaurants')
plt.ylabel('Frequency')
plt.xlabel('Rating')
plt.show()

In [None]:
# Defining the categories of restaurants
# Collect the category string of every selected restaurant to the driver.
restaurant_categories = yelp_business.select('categories').collect()
restaurant_categories_list = [row[0] for row in restaurant_categories]

# Show only the size and a short preview; printing the whole list floods the
# notebook output without adding information.
print(len(restaurant_categories_list), "restaurants; first category strings:")
print(restaurant_categories_list[:5])

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate every restaurant's category string into one document.
word = " ".join(restaurant_categories_list)

# Eliminating some common words.
# Each replacement must build on the previous result. The earlier version
# called word.replace(...) every time, so only the last term ('Entertainment')
# was actually removed. Re-check the rendered cloud (and any conclusions drawn
# from it) after re-running this cell.
# NOTE(review): 'bars' is lowercase and str.replace is case-sensitive, so it
# will not match a capitalised 'Bars' - confirm the intended spelling.
common_terms = ['Restaurants', 'bars', 'New', 'Food',
                'Planning', 'Arts', 'Entertainment']
text = word
for term in common_terms:
    text = text.replace(term, "")

# Generate a word cloud image
wordcloud = WordCloud(background_color="white").generate(text)
plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

We find that FOOD, AMERICAN, BREAKFAST and BRUNCH are the most frequently used words in the restaurant categories.

**BUILDING THE RECOMMENDER SYSTEM**


In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType

In [None]:
# ALS needs numeric user/item identifiers, so each string id column gets a
# companion "<column>_index" column produced by a StringIndexer.
id_columns = ['business_id', 'user_id']
indexer = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in id_columns
]

# Fit both indexers in one pipeline and append the index columns to the reviews.
pipeline = Pipeline(stages=indexer)
index_model = pipeline.fit(yelp_restaurant_review)
transformed = index_model.transform(yelp_restaurant_review)

# Cell output: the original ids next to their numeric encodings.
transformed.select('business_id', 'user_id', 'business_id_index', 'user_id_index')

In [None]:
# Preview the first five rows of the indexed review table.
transformed.show(n=5)

In [None]:
# Split the data into train and test data sets in 80:20 ratio.
# A fixed seed makes the split (and therefore the reported RMSE) repeatable
# across kernel restarts; without it every run evaluates on different rows.
# NOTE(review): repeatability also assumes the input DataFrame is partitioned
# and ordered the same way on each run.
SPLIT_SEED = 42
(train, test) = transformed.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [None]:
from heapq import nlargest
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

In [None]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [None]:
# Building the system on ALS algorithm

# Hyper-parameters and column bindings for the ALS estimator, gathered in one
# place so they are easy to tune.
als_settings = {
    # Which columns hold the numeric user id, numeric item id and the rating.
    "userCol": "user_id_index",
    "itemCol": "business_id_index",
    "ratingCol": "stars",
    # Model size / optimisation settings.
    "rank": 25,
    "maxIter": 5,
    "regParam": 0.09,
    "nonnegative": True,
    # Drop predictions for users/items unseen during training so that NaN
    # predictions do not end up in the evaluation metric.
    "coldStartStrategy": "drop",
}
als = ALS(**als_settings)

# Fit the factorization on the training split.
recommender = als.fit(train)

To evaluate the recommender model, we use the Root Mean Square Error (RMSE) metric.

In [None]:
# Score the held-out split with the fitted model.
predictions = recommender.transform(test)

# Compare the predicted rating with the true 'stars' value using RMSE.
evaluator = RegressionEvaluator(
    labelCol="stars",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("RMSE=" + str(rmse))

RESULTS OF RECOMMENDER SYSTEM

Checking out the results of the system for an example user, identified by the encoded index user_id_index = 952

In [None]:
# Encoded index of the user to inspect. One variable drives both the rating
# history and the recommendations, so they always describe the same user
# (the earlier version recommended for 952 but showed the history of 7313).
target_user_index = 952
num_recommendations = 20

# Stored as `user_recs`, not `test`, so the held-out test split is not
# overwritten and the evaluation cell can still be re-run afterwards.
user_recs = (
    recommender.recommendForAllUsers(num_recommendations)
    .filter(col('user_id_index') == target_user_index)
    .select("recommendations")
    .collect()
)

# The user has no row if they were absent from the training split; fall back
# to an empty list instead of failing with an IndexError.
Restaurant_recm = (
    [item.business_id_index for item in user_recs[0][0]] if user_recs else []
)

# Single-column frame of recommended restaurant indices, built with an explicit
# schema so it is well-typed even when the list is empty.
schema = StructType([StructField("business_id_index", IntegerType(), True)])
restaurants = spark.createDataFrame(
    [(business_index,) for business_index in Restaurant_recm], schema
)

# What this user actually rated. Filter first, while 'user_id_index' is still
# a column, then project to the columns to display.
transformed\
.filter(col('user_id_index') == target_user_index)\
.select(['business_id', 'user_id', 'stars', 'categories'])\
.show()

# What the model recommends: look up the recommended indices in the review
# table and show one row per restaurant name.
if Restaurant_recm:
    restaurants\
    .join(transformed, on = 'business_id_index', how = 'inner')\
    .select(['business_id', 'stars', 'categories', 'name'])\
    .drop_duplicates(subset=['name'])\
    .show()
else:
    print("No recommendations available for user_id_index=" + str(target_user_index))