In [1]:
import pyspark.sql.functions as func
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
json_key = os.getenv('auth_json')
spark = (SparkSession.builder
                     .appName('Recommendations')
                     .config('spark.jars', 'https://storage.googleapis.com/spark-lib/bigquery/spark-3.4-bigquery-0.34.0.jar')
                     .config('credentialsFile', f"../keys/{json_key}")
                     .config('parentProject', f'{os.getenv("project")}')
                     .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                     .config("spark.kryoserializer.buffer.max", "2047m")
                     .getOrCreate()
        )
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# Set Hadoop configurations to use the service account JSON key
# sc = spark.sparkContext
# sc._jsc.hadoopConfiguration().set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
# sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.enable", "true")
# sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.json.keyfile", f"../keys/{json_key}")

24/03/23 07:37:36 WARN Utils: Your hostname, codespaces-b70f2a resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/03/23 07:37:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/23 07:37:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# files in bucket
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_business.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_checkin.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_review.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_tip.json
# gs://yelp_data_6740/yelp_data/yelp_academic_dataset_user.json

In [4]:
def load_json_files(bucket_path):
    return (spark.read.json(bucket_path))

In [5]:
def preprocessing_business(spark,
                           city_name:str ='Philadelphia',
                           category:str = 'restaurant',
                           min_review_count:int =10):
    spark.conf.set("viewsEnabled","true")
    spark.conf.set("materializationDataset","yelp_data")


    sql_statement = f'''
        SELECT business_id, categories, name, review_count, stars AS business_stars
        FROM `{os.getenv("project")}.{os.getenv("dataset")}.business`
        WHERE is_open = 1
        AND LOWER(city) = '{city_name.lower()}'
        AND LOWER(categories) LIKE '%{category.lower()}%'
        AND review_count >= {min_review_count}
        ;
    '''
    # select columns to be used, rename to avoid name collision
    city_business = (
        spark.read
             .format('bigquery')
             .option('query', sql_statement)
             .option("materializationExpirationTimeInMinutes", 10)
             .load()
    )
    
    string_indexer = StringIndexer(inputCol='business_id', outputCol='business_id_encode')
    model = string_indexer.fit(city_business)
    
    city_business_num_id = model.transform(city_business) \
                                .withColumn(
                                    'business_id_encode',
                                    func.col('business_id_encode').cast(IntegerType())
                                )

    return city_business_num_id

In [6]:
# preprocessing_business(spark=spark, category='restaurant').show(5)

In [7]:
def preprocess_review(spark,
                      min_review_count:int = 10,
                      cutoff_date=None,
                      ):
    # https://github.com/GoogleCloudDataproc/spark-bigquery-connector/tree/master
    spark.conf.set("viewsEnabled","true")
    spark.conf.set("materializationDataset","yelp_data")
    
    sql_statement = f'''
        SELECT r.user_id, r.business_id, r.date, r.review_id, r.stars AS user_stars
        FROM `{os.getenv("project")}.{os.getenv("dataset")}.reviews` r
        INNER JOIN (
            SELECT user_id, COUNT(review_id) AS user_review_count
            FROM `{os.getenv("project")}.{os.getenv("dataset")}.reviews`
            GROUP BY user_id
            HAVING user_review_count >= {min_review_count}
        ) rc
        ON r.user_id = rc.user_id
        ;
    ''' 
    user_reviews = (
        spark.read
             .format('bigquery')
             .option('query', sql_statement)
             .option("materializationExpirationTimeInMinutes", 10)
             .load()
    )
    string_indexer = StringIndexer(inputCol='user_id', outputCol='user_id_encode')
    model = string_indexer.fit(user_reviews)

    user_reviews_num_id = model.transform(user_reviews) \
                               .withColumn(
                                   'user_id_encode',
                                   func.col('user_id_encode').cast(IntegerType())
                               )
    return user_reviews_num_id

In [8]:
# preprocess_review(spark=spark).show(5)

In [9]:
reviews = preprocess_review(spark=spark)
businesses = preprocessing_business(spark=spark)

business_user_review = reviews.join(businesses,
                                    on='business_id',
                                    how='inner')

24/03/23 07:37:49 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [11]:
(train, test) = business_user_review.randomSplit([0.8, 0.2], seed=123)

# make ALS model
als = ALS(userCol='user_id_encode',
          itemCol='business_id_encode',
          ratingCol='user_stars',
          coldStartStrategy='drop',
          nonnegative=True,
          rank=14,
          regParam=0.19
          )

evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='user_stars',
    predictionCol='prediction'
)

In [12]:
als_params = ParamGridBuilder().addGrid(als.rank, [12,13,14]) \
                               .addGrid(als.regParam, [0.17,0.18,0.19]) \
                               .build()        
cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=als_params,
        evaluator=evaluator
    )


In [13]:
# model = cv.fit(train)
# best rank: 14
# best regParam: 0.19

In [14]:
model = als.fit(train)

24/03/23 07:38:01 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
24/03/23 07:38:06 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
24/03/23 07:38:10 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:12 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:14 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:15 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:16 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:17 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:18 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/03/23 07:38:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/03/23 

In [15]:
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(rmse)

24/03/23 07:38:31 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
24/03/23 07:38:31 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:31 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:35 WARN DAGScheduler: Broadcasting large task binary with size 5.7 MiB
24/03/23 07:38:38 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:38 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB


1.135459922938278


                                                                                

In [16]:
user_rec = model.recommendForAllUsers(2)
# https://github.com/apache/spark/blob/master/examples/src/main/python/ml/als_example.py

In [17]:
user_rec.show()

24/03/23 07:38:39 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/03/23 07:38:47 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB


+--------------+--------------------+
|user_id_encode|     recommendations|
+--------------+--------------------+
|            12|[{1873, 5.177342}...|
|            13|[{2464, 5.362191}...|
|            22|[{1670, 6.202652}...|
|            26|[{1873, 4.588222}...|
|            34|[{1873, 4.77135},...|
|            44|[{1873, 5.8244104...|
|            52|[{829, 4.8981504}...|
|            65|[{1823, 1.1047455...|
|            81|[{1873, 5.3039203...|
|            91|[{1670, 4.8955626...|
|           101|[{2005, 5.2678757...|
|           103|[{1873, 5.266436}...|
|           132|[{1873, 4.647034}...|
|           140|[{1823, 4.6282005...|
|           146|[{1433, 5.57479},...|
|           148|[{611, 4.0818186}...|
|           190|[{1823, 4.743745}...|
|           192|[{2717, 4.6779294...|
|           209|[{1873, 4.4294596...|
|           211|[{2005, 4.975859}...|
+--------------+--------------------+
only showing top 20 rows



                                                                                