In [1]:
import pyspark.sql.functions as func
import 
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from dotenv import load_dotenv
import os

In [2]:
spark = (SparkSession.builder
                     .appName('Recommendations')
                     .config('spark.jars', 'https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar')
                     .getOrCreate()
         )
# spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
# Set Hadoop configurations to use the service account JSON key
load_dotenv()
json_key = os.getenv('auth_json')
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.enable", "true")
sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.json.keyfile", f"../keys/{json_key}")

24/03/18 03:13:22 WARN Utils: Your hostname, codespaces-b70f2a resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/03/18 03:13:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/18 03:13:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# files in bucket
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_business.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_checkin.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_review.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_tip.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_user.json

In [4]:
def load_json_files(bucket_path):
    return (spark.read.json(bucket_path))

In [5]:
df_business = load_json_files('gs://yelp_data_6740/yelp_data/yelp_academic_dataset_business.json')
df_business.show(5)

24/03/18 03:13:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{NULL, NULL, NULL...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                NULL|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{NULL, NULL, NULL...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|       Affton|{8:0-18:30, 0:0-0...|      1| 38.551126|  -90.335695|    

In [6]:
# df_checkin = load_json_files('gs://yelp_data_6740/yelp_data/yelp_academic_dataset_checkin.json')
# df_tip = load_json_files('gs://yelp_data_6740/yelp_data/yelp_academic_dataset_tip.json')
# df_user = load_json_files('gs://yelp_data_6740/yelp_data/yelp_academic_dataset_user.json')

In [7]:
df_review = load_json_files('gs://yelp_data_6740/yelp_data/yelp_academic_dataset_review.json')
df_review.show(5)

24/03/18 03:13:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|YjUWPpI6HXG530lwP...|   0|2014-02-05 20:30:30|    0|saUsX_uimxRlCVr67...|  3.0|Family diner. Had...|     0|8g_iMtfSiwikVnbP2...|
|kxX2SOes4o-D3ZQBk...|   1|2015-01-04 00:01:03|    0|AqPFMleE6RsU23_au...|  5.0|Wow!  Yummy, diff...|     1|_7bHUi9Uuf5__HHc_...|
|e4Vwtrqf-wpJfwesg...|   1|2017-01-14 20:54:15|    0|Sx8TMOWLNuJBWer-0...|  4.0|Cute inter

In [8]:
# df_business.groupBy('state') \
#            .agg(func.countDistinct('business_id').alias('unique_business')) \
#            .orderBy(func.col('unique_business').desc()) \
#            .show() 

In [9]:
def preprocessing_business(dataset,
                           city_name:str ='Philadelphia',
                           category:str = 'restaurant',
                           min_review_count:int =10):

    # filter businesses to a certain city,
    city_business = dataset.filter(
        (func.col('city') == city_name) & # select a certain city
        (func.col('review_count') >= min_review_count) & # have mininum review count
        (func.lower(func.col('categories')).contains(category)) & # business category
        (func.col('is_open')==1) # current open business
    )

    # select columns to be used, rename to avoid name collision
    city_business = city_business.select(
        'business_id', 'categories', 'name',
        'review_count', 'stars'
    ).withColumnRenamed(
        'stars', 'business_stars'
    )

    string_indexer = StringIndexer(inputCol='business_id', outputCol='business_id_encode')
    model = string_indexer.fit(city_business)
    
    city_business_num_id = model.transform(city_business) \
                                .withColumn(
                                    'business_id_encode',
                                    func.col('business_id_encode').cast(IntegerType())
                                )

    return city_business_num_id

In [10]:
preprocessing_business(dataset=df_business, category='restaurant').show(5)

                                                                                

+--------------------+--------------------+------------------+------------+--------------+------------------+
|         business_id|          categories|              name|review_count|business_stars|business_id_encode|
+--------------------+--------------------+------------------+------------+--------------+------------------+
|MTSW4McQd7CbVtyjq...|Restaurants, Food...|St Honore Pastries|          80|           4.0|              1111|
|MUTTqe8uqyMdBl186...|Sushi Bars, Resta...|          Tuna Bar|         245|           4.0|              1113|
|ROeacJQwBeh05Rqg7...| Korean, Restaurants|               BAP|         205|           4.5|              1324|
|aPNXGTDkf-4bjhyMB...|Eatertainment, Ar...|        Craft Hall|          65|           3.5|              1792|
|ppFCk9aQkM338Rgwp...|Restaurants, Auto...|              Wawa|          56|           3.0|              2577|
+--------------------+--------------------+------------------+------------+--------------+------------------+
only showi

In [11]:
def preprocess_review(dataset,
                      min_review_count:int = 10,
                      cutoff_date=None,
                      ):
    # filter out users with less than mininum review counts
    review_counts = dataset.groupBy('user_id') \
                           .agg(func.count('review_id').alias('user_review_count')) \
                           .filter(func.col('user_review_count') >= min_review_count)

    user_reviews = dataset.join(review_counts, on='user_id', how='inner')

    # select columns
    user_reviews = user_reviews.select(
        'user_id', 'business_id', 'date',
        'review_id', 'stars'
    )
    string_indexer = StringIndexer(inputCol='user_id', outputCol='user_id_encode')
    model = string_indexer.fit(user_reviews)

    user_reviews_num_id = model.transform(user_reviews) \
                               .withColumn(
                                   'user_id_encode',
                                   func.col('user_id_encode').cast(IntegerType())
                               )
    return user_reviews_num_id

In [12]:
# preprocess_review(dataset=df_review).show()

In [13]:
reviews = preprocess_review(dataset=df_review)
businesses = preprocessing_business(dataset=df_business)

business_user_review = reviews.join(businesses,
                                    on='business_id',
                                    how='inner')
business_user_review.show(5)

24/03/18 03:15:13 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
[Stage 25:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-------------------+--------------------+-----+--------------+--------------------+--------------------+------------+--------------+------------------+
|         business_id|             user_id|               date|           review_id|stars|user_id_encode|          categories|                name|review_count|business_stars|business_id_encode|
+--------------------+--------------------+-------------------+--------------------+-----+--------------+--------------------+--------------------+------------+--------------+------------------+
|D5V0Fawd6ODVgqCY8...|-073IXD_JkLK8SlRc...|2017-05-04 00:31:58|H5UqvEiyvlGlz05ya...|  5.0|         91717|Event Planning & ...|Loews Philadelphi...|         505|           3.5|               671|
|7lwe7n-Yc-V9E_HfL...|-43uAiZZ6wsGhaurB...|2010-04-20 05:10:47|qkSgpJ7fc_PyMxLX9...|  5.0|         91728|Restaurants, Pubs...|       Pub & Kitchen|         615|           3.5|               401|
|j-qtdD55OLfSqfsWu...|-43

                                                                                

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [15]:
(train, test) = business_user_review.randomSplit([0.8, 0.2], seed=123)

# make ALS model
als = ALS(userCol='user_id_encode',
          itemCol='business_id_encode',
          ratingCol='stars',
          coldStartStrategy='drop',
          nonnegative=True,
          rank=14,
          regParam=0.19
          )

evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='stars',
    predictionCol='prediction'
)

In [24]:
als_params = ParamGridBuilder().addGrid(als.rank, [12,13,14]) \
                               .addGrid(als.regParam, [0.17,0.18,0.19]) \
                               .build()        



cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=als_params,
        evaluator=evaluator
    )
# best rank: 14
# best regParam: 0.19

In [25]:
# model = cv.fit(train)

24/03/18 03:57:47 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:57:59 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:57:59 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:00 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:01 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:02 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:02 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:03 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:04 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:04 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:05 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 03:58:05 WARN DAGScheduler: Broadcasting larg

In [31]:
model = als.fit(train)

24/03/18 04:30:38 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:39 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:44 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:45 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:46 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:46 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:47 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:47 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:47 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:48 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:49 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:30:49 WARN DAGScheduler: Broadcasting larg

In [32]:
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(rmse)

24/03/18 04:32:24 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:32:24 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/18 04:33:04 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB
24/03/18 04:33:11 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB


1.2137780666040103


                                                                                

In [33]:
user_rec = model.recommendForAllUsers(2)
# https://github.com/apache/spark/blob/master/examples/src/main/python/ml/als_example.py

24/03/18 04:38:29 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB
24/03/18 04:38:36 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB


+--------------+--------------------+
|user_id_encode|     recommendations|
+--------------+--------------------+
|            12|[{420, 5.388963},...|
|            13|[{1767, 5.890468}...|
|            22|[{842, 6.3923216}...|
|            26|[{496, 4.7561226}...|
|            34|[{1098, 5.0517874...|
+--------------+--------------------+
only showing top 5 rows



                                                                                