#### Yelp Recommender 

In [1]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder.appName('Yelp Restaurant Recommender').getOrCreate()
sc = spark.sparkContext 

In [2]:
# read in yelp datasets 
dfBusiness = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/business.snappy.parquet')
dfReview = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/review.snappy.parquet')
dfUser = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/user.snappy.parquet') 

# read in Daisuke's sentiment analysis model output 
dfSentiment = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/aspect_sentiment.snappy.parquet')


                                                                                

In [3]:
from pyspark.sql import functions as F 

# create partition function from professor 
def displayPartitions(df): 
    num = df.rdd.getNumPartitions() 
    df.withColumn('partitionId', F.spark_partition_id()).groupby('partitionId').count().orderBy(F.asc('count')).show(num)  

In [4]:
from pyspark.sql.functions import col, desc, when, regexp_replace, regexp_extract, explode 
from pyspark.sql.types import StringType, MapType, IntegerType 
from ast import literal_eval 

# clean business data (cont.) 

# drop any unnecessary fields 
dfBusiness = dfBusiness.drop('AcceptsInsurance', 'HairSpecializesIn')

# for each newly created column show the top distinct values and make necessary adjustments 
# for colName in dfBusiness.columns[12:]: 
    # dfBusiness.groupby(colName).count().withColumnRenamed('count', 'businesses').sort(col('businesses').desc()).show(25) 


# convert different string types to same value 
for column in ['AgesAllowed', 'Alcohol', 'BYOBCorkage', 'NoiseLevel', 'RestaurantsAttire', 'Smoking', 'WiFi']: 
    dfBusiness = dfBusiness.withColumn(column, regexp_replace(column, "u'", "'")) 

# convert any None or null to unknown 
dfBusiness = dfBusiness.replace('None', None) 
dfBusiness = dfBusiness.na.fill('unknown') 

# unnest further nested columns 
dfBusinessNestedCols = ['Ambience', 'BestNights', 'BusinessParking', 'DietaryRestrictions', 'GoodForMeal', 'Music'] 
for column in dfBusinessNestedCols: 
    dfBusiness = dfBusiness.withColumn('clean{}'.format(column), regexp_replace(column, "u'", "'")) 
    dfBusiness = dfBusiness.withColumn('clean{}'.format(column), regexp_replace(column, 'unknown', "{'unknown': True}")) 
    dfBusiness = dfBusiness.withColumn(column, F.udf(literal_eval, 'map<string,string>')('clean{}'.format(column))) 
    dfBusiness = dfBusiness.drop('clean{}'.format(column)) 
    curKeys = (dfBusiness.select(explode(column)).select('key').distinct().rdd.flatMap(lambda x: x).collect()) 
    curExpressions = [col(column).getItem(k).alias('{}_{}'.format(column, k)) for k in curKeys] 
    dfBusiness = dfBusiness.select('*', *curExpressions).drop(column) 

# create open/close for each time 
dfBusinessHoursCols = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] 
for column in dfBusinessHoursCols: 
    dfBusiness = dfBusiness.select('*', F.when(col(column) != 'unknown', regexp_extract(column, r'.*?(?=\:)', 0)).otherwise(None).alias('{}_start_str'.format(column)))
    dfBusiness = dfBusiness.select('*', F.when(col(column) != 'unknown', regexp_extract(column, r'(?<=\-).+(?=\:)', 0)).otherwise(None).alias('{}_end_str'.format(column)))
    dfBusiness = dfBusiness.withColumn('{}_start'.format(column), col('{}_start_str'.format(column)).cast(IntegerType())) 
    dfBusiness = dfBusiness.withColumn('{}_end'.format(column), col('{}_end_str'.format(column)).cast(IntegerType()))
    dfBusiness = dfBusiness.drop(column, '{}_start_str'.format(column), '{}_end_str'.format(column)) 

# dfBusiness.printSchema() 


                                                                                

In [5]:
# clean reviews data 

from pyspark.sql.functions import col, to_date, to_timestamp 
from pyspark.sql.functions import year, month, dayofweek, hour 
from pyspark.sql.functions import length 

# convert date field to actual date 
dfReview = dfReview.withColumn('datetime', to_timestamp(col('date'), 'yyyy-MM-dd HH:mm:ss')) 
dfReview = dfReview.withColumn('date', to_date(col('datetime'))) 

# create date-related fields 
dfReview = dfReview.withColumn('year', year(col('date'))).withColumn('month', month(col('date'))) 
dfReview = dfReview.withColumn('day', dayofweek(col('date'))) 
dfReview = dfReview.withColumn('hour', hour(col('datetime'))) 

# create length of review field 
dfReview = dfReview.withColumn('text_length', length(col('text'))) 

# dfReview.printSchema() 

In [6]:
from pyspark.sql.functions import size, split 

# clean user data 

dfUser = dfUser.withColumn('yelping_since', to_timestamp(col('yelping_since'), 'yyyy-MM-dd HH:mm:ss')) 

dfUser = dfUser.replace('None', None) 
dfUser = dfUser.na.fill('unknown') 
dfUser = dfUser.withColumn('n_friends', F.when(col('friends') == 'unknown', 0).otherwise(size(split(col('friends'), r'\,'))))
dfUser = dfUser.drop('name', 'friends') 

# dfUser.show(5) 

In [None]:
# clean sentiment data 

# confirm counts are similar between sentiment and review 
dfSentiment.count() # 3773770
dfReview.count() # 3773770 
# good! 

# dfSentiment.show(5, truncate = False) # already good thanks to Daisuke! 

In [None]:
# repartition if necessary 

print('Default parallelism:', sc.defaultParallelism, '\n')

print('Number of dfBusiness partitions:', dfBusiness.rdd.getNumPartitions()) 
print('Number of dfReview partitions:', dfReview.rdd.getNumPartitions()) 
print('Number of dfSentiment partitions:', dfSentiment.rdd.getNumPartitions()) 
print('Number of dfUser partitions:', dfUser.rdd.getNumPartitions()) 

# good for now, but will want to repartition the final table 

In [7]:
import numpy as np 

# rename the columns in each table and join together 

dfBusinessColumns = [str(np.where(col[-3:] != '_id', 'b_{}'.format(col), col)) for col in dfBusiness.columns] 
dfBusiness = dfBusiness.toDF(*dfBusinessColumns) 
dfReviewColumns = [str(np.where(col[-3:] != '_id', 'r_{}'.format(col), col)) for col in dfReview.columns]
dfReview = dfReview.toDF(*dfReviewColumns) 
dfSentimentColumns = [str(np.where(col[-3:] != '_id', 's_{}'.format(col), col)) for col in dfSentiment.columns]
dfSentiment = dfSentiment.toDF(*dfSentimentColumns) 
dfUserColumns = [str(np.where(col[-3:] != '_id', 'u_{}'.format(col), col)) for col in dfUser.columns]
dfUser = dfUser.toDF(*dfUserColumns) 

dfFinal = dfReview.join(dfBusiness, 'business_id', 'left').join(dfSentiment, 'review_id', 'left').join(dfUser, 'user_id', 'left') 

# dfFinal.printSchema() 

In [8]:
# repartition the dataset 

print('Default parallelism:', sc.defaultParallelism, '\n')
print('Number of dfFinal partitions:', dfFinal.rdd.getNumPartitions(), '\n') 

dfFinal = dfFinal.repartition(200)
# displayPartitions(dfFinal) 

# consider creating additional features at a later point 

Default parallelism: 4 





Number of dfFinal partitions: 85 



In [9]:
del dfBusiness, dfReview, dfSentiment, dfUser 

In [None]:
from pyspark.sql.functions import collect_set 
from pyspark.ml.fpm import FPGrowth 

# mine frequent user-business combinations 

dfAssocUb = dfFinal.select('b_name', 'user_id') 
dfAssocUb = dfAssocUb.groupBy('user_id').agg(collect_set('b_name').alias('businesses')) 
# dfAssocUb.show(5) 

# count users, businesses, and ratio 
print('# users:', dfAssocUb.count())
print('# businesses:', dfFinal.select('b_name').distinct().count())
print('# reviews:', dfFinal.count()) 
# seems like the 0.0001 should be a good support level as 0.001 won't return too many records 

# fit with higher min support to minimize size of output and get strongest associations 
fpgUb = FPGrowth(itemsCol = 'businesses', minSupport = 0.0001, minConfidence = 0) 
modelFpgUb = fpgUb.fit(dfAssocUb) 
dfFreqItemsets = modelFpgUb.freqItemsets.withColumnRenamed('items', 'businesses') 
print('# itemsets:', dfFreqItemsets.count()) 
dfFreqItemsets.filter('size(businesses) > 1').orderBy('freq', ascending = False).show(truncate = False) 

# identify any association rules 
dfAssocUbRules = modelFpgUb.associationRules.orderBy('confidence', ascending = False) 
dfAssocUbRules.show(truncate = False) 

# summarize the consequents 
dfAssocUbRecommendations = modelFpgUb.transform(dfAssocUb).filter(F.size(F.col('prediction')) >= 1) 
dfAssocUbRecommendations = dfAssocUbRecommendations.select('user_id', 'businesses', explode(dfAssocUbRecommendations.prediction)).withColumnRenamed('col', 'Recommendation') 
dfAssocUbRecommendations.show() 

# takeaways: 
# clear geographic tendencies (Pat's and Geno's in Philly), (Biscuit Love and Hattie B's in Nashville) 
# others are clearly fast food/global restaurants 

In [10]:
from pyspark.ml.feature import StringIndexer
from datetime import datetime 
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator 

# create indexed dataframe 
dfALS = dfFinal.select('user_id', 'b_name', 'r_stars') 
for column in ['user_id', 'b_name']: 
    indexer = StringIndexer(inputCol = column, outputCol = 'idx_{}'.format(column)) 
    dfALS = indexer.fit(dfALS).transform(dfALS) 

# build a simple recommendation engine and tune the parameters effectively 
def modelALS(data, user, item, rating, sampleProp = 0.05, seedSamp = 1234, splitProp = 0.8, seedSplit = 5678, iters = 10, rank = 10, reg = 0.01, coldStart = 'drop', nonNeg = True): 
    samp = data.sample(sampleProp, seed = seedSamp) 
    train, test = samp.randomSplit([splitProp, (1-splitProp)]) 
    mod = ALS(maxIter = iters, regParam = reg, rank = rank, userCol = user, itemCol = item, ratingCol = rating, coldStartStrategy = coldStart, nonnegative = nonNeg) 
    modFit = mod.fit(train) 
    modPreds = modFit.transform(test) 
    evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'r_stars') 
    rmse = evaluator.evaluate(modPreds) 
    # evaluator.evaluate(modPreds, {evaluator.metricName: "accuracy"})
    return modFit, modPreds, rmse 

# run one iteration 
# start = datetime.now() 
# fit1, preds1, rmse1 = modelALS(dfALS, 'idx_user_id', 'idx_b_name', 'r_stars') 
# print('Model fit in:', datetime.now()-start) 

                                                                                

In [None]:
# evaluate initial fit 
# print('RMSE:', rmse1) 

In [None]:
import itertools 

# create a grid searching fit 
iters = [5, 10, 20] 
regs = [0.01, 0.1] 
ranks = [5, 10, 20] 
grid = [el for el in itertools.product(iters, regs, ranks)]

# iteratively fit the model over the function below 
# will train on very small subset of the data (2%) to improve runtimes 
lOutput = [] 
for i, rg, rk in grid: 
    start = datetime.now() 
    print('Iteration with {} iterations, {} regularization parameter, and {} rank started at {}'.format(i, rg, rk, start)) 
    curFit, curPreds, curRMSE = modelALS(dfALS, 'idx_user_id', 'idx_b_name', 'r_stars', sampleProp = 0.02, iters = i, rank = rk, reg = rg) 
    lOutput.append((curFit, curPreds, i, rk, rg, curRMSE)) 
    print('Iteration completed in:', datetime.now()-start) 
    print('RMSE:', curRMSE) 

Iteration with 5 iterations, 0.01 regularization parameter, and 5 rank started at 2023-05-18 04:17:01.223670


23/05/18 04:17:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:17:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:17:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:18:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:24.395951
RMSE: 4.771131906795268
Iteration with 5 iterations, 0.01 regularization parameter, and 10 rank started at 2023-05-18 04:20:25.619723


23/05/18 04:20:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:20:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:21:19 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:21:38 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:21:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:22:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:22:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:22:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:22:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:16.165201
RMSE: 2.986872199813267
Iteration with 5 iterations, 0.01 regularization parameter, and 20 rank started at 2023-05-18 04:23:41.785005


23/05/18 04:24:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:24:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:24:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:25:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:25:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:25:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:25:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:26:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:26:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:04:17.139216
RMSE: 2.4237914716253877
Iteration with 5 iterations, 0.1 regularization parameter, and 5 rank started at 2023-05-18 04:27:58.924319


23/05/18 04:28:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:28:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:28:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:38 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:29:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:07.473448
RMSE: 2.3900637519277295
Iteration with 5 iterations, 0.1 regularization parameter, and 10 rank started at 2023-05-18 04:31:06.397881


23/05/18 04:31:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:31:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:32:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:18.564725
RMSE: 2.314528139754796
Iteration with 5 iterations, 0.1 regularization parameter, and 20 rank started at 2023-05-18 04:34:24.962695


23/05/18 04:34:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:35:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:35:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:35:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:35:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:36:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:36:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:36:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:36:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:16.462479
RMSE: 2.3103115484588073
Iteration with 10 iterations, 0.01 regularization parameter, and 5 rank started at 2023-05-18 04:37:41.425253


23/05/18 04:38:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:38:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:38:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:38:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:38:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:39:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:39:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:39:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:39:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:42.691843
RMSE: 3.5876645090493025
Iteration with 10 iterations, 0.01 regularization parameter, and 10 rank started at 2023-05-18 04:41:24.117176


23/05/18 04:41:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:42:00 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:42:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:42:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:42:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:43:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:43:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:43:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:43:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:51.648325
RMSE: 2.7376874848211012
Iteration with 10 iterations, 0.01 regularization parameter, and 20 rank started at 2023-05-18 04:45:15.765618


23/05/18 04:45:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:45:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:46:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:46:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:46:38 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:46:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:47:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:47:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:47:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:45.251473
RMSE: 2.375280468596678
Iteration with 10 iterations, 0.1 regularization parameter, and 5 rank started at 2023-05-18 04:49:01.017181


23/05/18 04:49:37 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:49:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:50:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:04:53.328286
RMSE: 2.2215463964345123
Iteration with 10 iterations, 0.1 regularization parameter, and 20 rank started at 2023-05-18 04:57:38.062252


23/05/18 04:58:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:58:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:58:37 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:58:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:59:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:59:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:59:25 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:59:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 04:59:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:03:44.742503
RMSE: 2.189806000652578
Iteration with 20 iterations, 0.01 regularization parameter, and 5 rank started at 2023-05-18 05:01:22.804843


23/05/18 05:01:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:02:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:02:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:31 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:03:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:07:04.969052
RMSE: 2.8005328616039136
Iteration with 20 iterations, 0.01 regularization parameter, and 10 rank started at 2023-05-18 05:08:27.773984


23/05/18 05:09:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:09:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:09:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:10:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:07:01.379645
RMSE: 2.4848457217772286
Iteration with 20 iterations, 0.01 regularization parameter, and 20 rank started at 2023-05-18 05:15:29.153737


23/05/18 05:16:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:16:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:16:31 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:16:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:16:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:17:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:17:18 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:17:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:17:25 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:04:56.580999
RMSE: 2.1717087398943455
Iteration with 20 iterations, 0.1 regularization parameter, and 5 rank started at 2023-05-18 05:20:25.734824


23/05/18 05:20:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:21:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:21:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:21:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:21:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:22:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:22:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:22:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:22:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:04:57.821158
RMSE: 2.0988841828471374
Iteration with 20 iterations, 0.1 regularization parameter, and 10 rank started at 2023-05-18 05:25:23.556076


23/05/18 05:26:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:26:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:26:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:26:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:26:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:27:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:27:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:27:19 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:27:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:05:08.038727
RMSE: 2.102552636495326
Iteration with 20 iterations, 0.1 regularization parameter, and 20 rank started at 2023-05-18 05:30:31.594900


23/05/18 05:31:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:31:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:31:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:31:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:31:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:32:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:32:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:32:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 05:32:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

Iteration completed in: 0:04:57.928769
RMSE: 2.1620951662303542


                                                                                

In [13]:
# create pyspark dataframe 
lOutputMetrics = [t[2:] for t in lOutput] 
dfOutput = spark.createDataFrame(data = lOutputMetrics, schema = ['Iterations', 'Rank', 'Reg', 'RMSE']) 
dfOutput = dfOutput.orderBy('RMSE', ascending = False) 
dfOutput.show() 

+----------+----+----+------------------+
|Iterations|Rank| Reg|              RMSE|
+----------+----+----+------------------+
|         5|   5|0.01| 4.771131906795268|
|        10|   5|0.01|3.5876645090493025|
|         5|  10|0.01| 2.986872199813267|
|        20|   5|0.01|2.8005328616039136|
|        10|  10|0.01|2.7376874848211012|
|        20|  10|0.01|2.4848457217772286|
|         5|  20|0.01|2.4237914716253877|
|         5|   5| 0.1|2.3900637519277295|
|        10|  20|0.01| 2.375280468596678|
|         5|  10| 0.1| 2.314528139754796|
|         5|  20| 0.1|2.3103115484588073|
|        10|   5| 0.1|2.2504165539498673|
|        10|  10| 0.1|2.2215463964345123|
|        10|  20| 0.1| 2.189806000652578|
|        20|  20|0.01|2.1717087398943455|
|        20|  20| 0.1|2.1620951662303542|
|        20|  10| 0.1| 2.102552636495326|
|        20|   5| 0.1|2.0988841828471374|
+----------+----+----+------------------+



In [14]:
# fit full model 

modFit, preds, RMSE = modelALS(dfALS, 'idx_user_id', 'idx_b_name', 'r_stars', sampleProp = 0.5, iters = 20, rank = 5, reg = 0.1) 

23/05/18 06:04:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:04:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:05:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:05:31 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:05:38 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:06:00 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:06:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:06:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 54.8 MiB
23/05/18 06:06:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task 

In [None]:
# create functions to make predictions best model 

def getRecsForUser(model, subset, n = 3): 
    recs = model.recommendForUserSubset(subset, n) 
    return recs 

def getRecsForBusiness(model, subset, n = 3): 
    recs = model.recommendForItemSubset(subset, n) 
    return recs 

In [15]:
print('RMSE:', RMSE) 

RMSE: 1.1096681724421948


In [None]:
# implement a random forest classifier 
# selected because feature selection is generally not very important 