In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=a1cf943c1c6adf618a3754c652baf31ff8a8eadeb15e7e25720694e9645eaade
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pyspark.sql.functions as func
from pyspark.sql import Window
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, BooleanType, MapType, FloatType, ArrayType, DoubleType
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import rand
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from google.colab import files
from google.colab import drive

In [None]:
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Restaurant Recommendation")
    .getOrCreate()
)

# Runtime SQL configuration applied to the session.
spark.conf.set("spark.sql.legacy.style", "false")

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the read cells below live under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Explicit schema for the business JSON: (column name, Spark type) pairs in file order.
# `attributes` and `hours` are read as string-to-string maps and flattened in later cells.
_business_fields = [
    ("business_id", StringType()),
    ("name", StringType()),
    ("address", StringType()),
    ("city", StringType()),
    ("state", StringType()),
    ("postal_code", StringType()),
    ("latitude", FloatType()),
    ("longitude", FloatType()),
    ("stars", FloatType()),
    ("review_count", IntegerType()),
    ("is_open", IntegerType()),
    ("attributes", MapType(StringType(), StringType())),
    ("categories", StringType()),
    ("hours", MapType(StringType(), StringType())),
]
schema = StructType([StructField(field_name, field_type) for field_name, field_type in _business_fields])

In [None]:
# Load the business records with the explicit schema defined above.
# The previous path went through "Shared with me", which raised PATH_NOT_FOUND; read from the same
# My Drive project folder that the user file is loaded from instead.
# NOTE(review): presumably this needs a shortcut to the shared project folder in My Drive - confirm.
business_df = spark.read.schema(schema).json('/content/drive/My Drive/ISyE 6740 - Computational Analytics/Project/yelp_academic_dataset_business.json')

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/drive/Shared with me/ISyE 6740 - Computational Analytics/Project/yelp_academic_dataset_business.json.

In [None]:
# Load the review records; no schema is passed here, so Spark infers the column types from the file.
# NOTE(review): this folder name differs from the project folder used for the user file - confirm the path.
reviews_df = spark.read.json('/content/drive/My Drive/1rBUldylKIYJm9jll5OSUCbz3duoT4Jfb/yelp_academic_dataset_review.json')

In [None]:
# Load the user records; no schema is passed here, so Spark infers the column types from the file.
users_df = spark.read.json('/content/drive/My Drive/ISyE 6740 - Computational Analytics/Project/yelp_academic_dataset_user.json')

In [None]:
# Promote every key that occurs in the `attributes` map to its own top-level column,
# then discard the map column itself.
all_keys = (
    business_df
    .selectExpr("explode(map_keys(attributes)) as keys")
    .select("keys")
    .distinct()
    .collect()
)
select_expr = [f"attributes.{row['keys']} as {row['keys']}" for row in all_keys]
business_df = business_df.selectExpr("*", *select_expr).drop("attributes")

In [None]:
import ast


def parse_json(json_str):
    """Parse a dict-literal string into a dict.

    The nested attribute values are handled as Python-literal text (they were
    previously passed to ``eval``). ``ast.literal_eval`` accepts only literals,
    so text coming from the data file can never execute code.

    Args:
        json_str: the string to parse; may be None.

    Returns:
        The parsed dict, or an empty dict when the input is missing, is not a
        valid literal, or does not evaluate to a dict.
    """
    try:
        parsed_json = ast.literal_eval(json_str)
    except Exception:
        # Deliberate best effort: malformed or missing values become an empty map.
        return {}
    return parsed_json if isinstance(parsed_json, dict) else {}

# UDF wrapper: each nested attribute string becomes a map of string -> boolean.
parse_json_udf = func.udf(parse_json, MapType(StringType(), BooleanType()))

# Attribute columns whose values are themselves dict-like strings.
parseColumns = ["Ambience","BusinessParking","BestNights","GoodForMeal","Music"]

for map_col in parseColumns:
    # Replace the raw string with its parsed map.
    business_df = business_df.withColumn(map_col, parse_json_udf(func.col(map_col)))

    # Collect the distinct keys of that map and expose each one as `<column>_<key>`.
    keys = (
        business_df
        .selectExpr(f"explode(map_keys({map_col})) as keys")
        .select("keys")
        .distinct()
        .collect()
    )
    select_expr = [f"{map_col}.{row['keys']} as {map_col}_{row['keys']}" for row in keys]
    business_df = business_df.selectExpr("*", *select_expr).drop(map_col)

business_df.show()

In [None]:
# Flatten the `hours` map: one `hours_<key>` column per distinct key, then drop the map.
hour_keys = (
    business_df
    .selectExpr("explode(map_keys(hours)) as keys")
    .select("keys")
    .distinct()
    .collect()
)
select_expr = [f"hours.{row['keys']} as hours_{row['keys']}" for row in hour_keys]
business_df = business_df.selectExpr("*", *select_expr).drop("hours")

In [None]:
# Assign integer surrogate keys (user_number / business_number) alongside the string ids,
# for use later by the modeling functions.
# The windows have no partitionBy: rows are numbered 1..N in id order over the whole frame.
# NOTE(review): an unpartitioned window is presumably evaluated on a single partition - confirm this scales.
user_window = Window.orderBy("user_id")
business_window = Window.orderBy("business_id")

users_df = users_df.withColumn("user_number", func.row_number().over(user_window)) #May need to do this only for PA
business_df = business_df.withColumn("business_number", func.row_number().over(business_window))

# Register the three frames as temp views so the filtering cells can query them with SQL.
business_df.createOrReplaceTempView("business")
reviews_df.createOrReplaceTempView("reviews")
users_df.createOrReplaceTempView("users")

In [None]:
# Data filtering
# City to keep; compared against lower(b.city), so it must be lowercase.
city = 'philadelphia'

# Restaurants: open businesses in `city` whose categories string contains "restaurant" (case-insensitive).
restaurant_sql = f'''
        SELECT
            b.*
        FROM
            business b
        WHERE
            b.is_open=1 AND LOWER(b.categories) LIKE '%restaurant%' AND lower(b.city) = '{city}'
        ;
    '''

# Run the query and register the result as the "restaurants" view for the joins below.
restaurants = spark.sql(restaurant_sql)
restaurants.createOrReplaceTempView("restaurants")


# Reviews for restaurants: every review whose business_id is in the "restaurants" view,
# with the restaurant's city and state appended.
restaurant_reviews_sql = f'''
        SELECT
            rv.*, r.city, r.state
        FROM
            reviews rv
        INNER JOIN
            restaurants r
        ON rv.business_id = r.business_id
        ;
    '''

# Run the query and register the result as the "restaurant_reviews" view.
restaurant_reviews = spark.sql(restaurant_reviews_sql)
restaurant_reviews.createOrReplaceTempView("restaurant_reviews")

# Restaurant review volume: one row per restaurant that has at least `review_threshold` reviews.
review_threshold = 10
restaurant_review_volume_sql = f'''
        SELECT
           rv.business_id, count(*) as review_count
        FROM
            restaurant_reviews rv
        GROUP BY rv.business_id
        HAVING count(*)>={review_threshold}
        ;
    '''
restaurant_review_volume = spark.sql(restaurant_review_volume_sql)
# Register the aggregated frame itself. Previously `restaurant_reviews` was registered under this
# name, so joins against "restaurant_review_volume" never applied the review threshold.
restaurant_review_volume.createOrReplaceTempView("restaurant_review_volume")

# Restaurant review volume by state: number of restaurant reviews per state, largest first.
# The result is kept as a DataFrame only; it is not registered as a view.
restaurant_review_volume_state_sql = f'''
        SELECT
           rv.state, count(*) as review_count
        FROM
            restaurant_reviews rv
        GROUP BY rv.state ORDER BY review_count DESC
        ;
    '''

restaurant_review_volume_state = spark.sql(restaurant_review_volume_state_sql)


# Users with at least `min_review_per_user` distinct reviewed restaurants.
# The query joins users -> restaurant_reviews -> the "restaurant_review_volume" view and
# counts distinct business_ids per user.
min_review_per_user = 10

multi_restaurant_users_sql = f'''
        SELECT
            u.user_id, COUNT(DISTINCT r.business_id) AS restaurant_review_count
        FROM
            users u
        INNER JOIN
            restaurant_reviews r
        ON u.user_id = r.user_id
        INNER JOIN
           restaurant_review_volume rrv
        ON r.business_id = rrv.business_id
        GROUP BY
            u.user_id
        HAVING
            COUNT(DISTINCT r.business_id) >= {min_review_per_user}
        ;
    '''

# Run the query and register the result as the "multi_restaurant_users" view.
multi_restaurant_users = spark.sql(multi_restaurant_users_sql)
multi_restaurant_users.createOrReplaceTempView("multi_restaurant_users")


# Final modeling table: restaurant reviews written by users in "multi_restaurant_users",
# enriched with user columns (from "users") and all restaurant columns (from "restaurants").
# Columns that would clash are aliased: user_name, user_review_count, review_stars.
final_reviews_sql = f'''
        SELECT
            r.user_id, ub.user_number, ub.average_stars, ub.elite, ub.fans, ub.friends, ub.name AS user_name, ub.review_count AS user_review_count, ub.yelping_since,
            b.*,
            r.review_id, r.date, r.stars AS review_stars, r.cool, r.funny, r.useful
        FROM
            restaurant_reviews r
        INNER JOIN
            multi_restaurant_users u
        ON r.user_id = u.user_id
        INNER JOIN
            users ub
        ON r.user_id = ub.user_id
        INNER JOIN
            restaurants b
        ON r.business_id = b.business_id
        ;
    '''

final_reviews = spark.sql(final_reviews_sql)

In [None]:
# Set some default values for certain fields.
# These attribute columns come out of the string-to-string `attributes` map, so they are string
# columns. fillna() ignores subset columns whose type does not match the fill value, which made
# the previous boolean fill (False) a no-op; fill with the string "False" instead.
final_reviews = final_reviews.fillna("False", subset=["Open24Hours","DriveThru","GoodForDancing"])
final_reviews = final_reviews.fillna("no", subset=["Smoking"])
restaurants = restaurants.fillna("False", subset=["Open24Hours","DriveThru","GoodForDancing"])
restaurants = restaurants.fillna("no", subset=["Smoking"])

# Drop columns manually if not filled in sufficiently / appear irrelevant to prediction
cols_to_drop = ["is_open","longitude","latitude","address","BYOB","AcceptsInsurance","DietaryRestrictions","HairSpecializesIn",
                   "ByAppointmentOnly","RestaurantsCounterService","BYOBCorkage","CoatCheck","BusinessAcceptsBitcoin","AgesAllowed","cool","funny","useful", "state"]

final_reviews_drop = final_reviews.drop(*cols_to_drop)
restaurants_drop = restaurants.drop(*cols_to_drop)

In [None]:
# For each user, pull 70% for training and 30% for testing - done per user, so every user's
# recommendations can be tested.
train_percentage = 0.7
test_percentage = 1 - train_percentage
seed = 42

# Pseudo-random sort key per review. Previously the frame was shuffled globally and then numbered
# with a window ordered by its own partition key (user_id), which leaves the order inside each
# user arbitrary and ignores the shuffle. A seeded hash of review_id is a pure function of the
# row, so the split is reproducible even though train_data and test_data are evaluated separately.
shuffled_final = final_reviews_drop.withColumn(
    "shuffle_key", func.xxhash64(func.col("review_id"), func.lit(seed))
)

# Number each user's reviews in shuffled order (review_id breaks any hash ties).
window_spec = Window.partitionBy("user_id").orderBy("shuffle_key", "review_id")
final_reviews_with_row_number = shuffled_final.withColumn("row_number", func.row_number().over(window_spec))

# Total number of rows for each user
user_counts = final_reviews_with_row_number.groupBy("user_id").count()

# Number of rows to hold out in the TEST set for each user (rounded down)
user_test_counts = user_counts.withColumn("test_count", (func.col("count") * test_percentage).cast("int"))

# Join row number data with test count
final_with_test_counts = final_reviews_with_row_number.join(user_test_counts, "user_id", "inner")

# The first (count - test_count) rows of each user go to training, the rest to testing
train_cutoff = func.col("count") - func.col("test_count")
train_data = final_with_test_counts.filter(func.col("row_number") <= train_cutoff)
test_data = final_with_test_counts.filter(func.col("row_number") > train_cutoff)

# Drop the intermediate columns
train_data = train_data.drop("row_number", "count", "test_count", "shuffle_key")
test_data = test_data.drop("row_number", "count", "test_count", "shuffle_key")

#test_data.count()
#train_data.count()

In [None]:
def get_hit_rate(ground_truth_df, recommended_df, user_col, item_col):
    """Fraction of recommended-to users who received at least one correct recommendation.

    Args:
        ground_truth_df: DataFrame with `user_col` and `item_col` (held-out interactions).
        recommended_df: DataFrame with `user_col` and a "recommended_" + `item_col` column.
        user_col: name of the user id column shared by both frames.
        item_col: name of the item id column in the ground truth.

    Returns:
        users with >= 1 hit divided by distinct users in `recommended_df`;
        0.0 when `recommended_df` contains no users (previously a ZeroDivisionError).
    """
    recommended_col = "recommended_" + item_col

    # Denominator: distinct users for whom recommendations were made
    total_users = recommended_df.select(func.countDistinct(user_col)).collect()[0][0]
    if not total_users:
        return 0.0

    # Numerator: distinct users with a recommended item that is also in their ground truth
    users_with_hits = (
        ground_truth_df.join(recommended_df, on=user_col)
        .filter(func.col(recommended_col) == func.col(item_col))
        .select(func.countDistinct(user_col))
        .collect()[0][0]
    )

    return users_with_hits / total_users


def get_recall(ground_truth_df, recommended_df, user_col, item_col):
    """Average per-user recall of the recommendations against the ground truth.

    Per-user recall = (ground-truth rows whose item was recommended) / (ground-truth rows).
    The average is taken over users that appear in both frames. Users with no correct
    recommendation now contribute a recall of 0; previously the inner join dropped them,
    which inflated the average.

    Args:
        ground_truth_df: DataFrame with `user_col` and `item_col` (held-out interactions).
        recommended_df: DataFrame with `user_col` and a "recommended_" + `item_col` column.
        user_col: name of the user id column shared by both frames.
        item_col: name of the item id column in the ground truth.

    Returns:
        The mean recall, or None when no user appears in both frames.
    """
    recommended_col = "recommended_" + item_col

    # De-duplicate (user, recommended item) pairs so a repeated recommendation counts once.
    recs = recommended_df.select(user_col, recommended_col).distinct()

    # Correctly recommended ground-truth rows per user (only users with >= 1 hit appear here)
    hits_per_user = (
        ground_truth_df.join(recs, on=user_col)
        .filter(func.col(recommended_col) == func.col(item_col))
        .groupBy(user_col).count()
        .withColumnRenamed("count", "relevant_count")
    )

    # Total relevant rows per user, restricted to users that received recommendations
    total_relevant_items = (
        ground_truth_df.groupBy(user_col).count()
        .join(recs.select(user_col).distinct(), on=user_col, how="left_semi")
    )

    # Left join keeps zero-hit users; their missing hit count becomes 0
    recall_df = (
        total_relevant_items.join(hits_per_user, on=user_col, how="left")
        .fillna(0, subset=["relevant_count"])
        .withColumn("recall", func.col("relevant_count") / func.col("count"))
    )

    # Average recall across all evaluated users
    average_recall = recall_df.selectExpr("avg(recall)").collect()[0][0]

    return average_recall

# Collaborative Filtering

In [None]:
## Use Daemon's
# #
# als_features = ['user_number','business_number','review_stars']
# cf_train_data = train_data.select(als_features)
# cf_test_data = test_data.select(als_features)

# #Build model
# als = ALS(userCol="user_number", regParam=0.1, itemCol="business_number", ratingCol="review_stars",
#           coldStartStrategy="drop")

# #Tune parameters; limited due to memory
# # paramGrid = ParamGridBuilder() \
# #     .addGrid(als.regParam, [0.01, 0.1, 1.0]) \
# #     .build()

# evaluator = RegressionEvaluator(metricName="rmse", labelCol="review_stars", predictionCol="prediction")

# model = als.fit(cf_train_data)

# # crossval = CrossValidator(estimator=als,
# #                           estimatorParamMaps=paramGrid,
# #                           evaluator=evaluator,
# #                           numFolds=5)

# # cvModel = crossval.fit(cf_train_data)

In [None]:
# best_model = cvModel.bestModel
# best_rank = best_model.rank
# best_max_iter = best_model._java_obj.parent().getMaxIter()
# best_reg_param = best_model._java_obj.parent().getRegParam()

In [None]:
# #Evaluate the best model
# predictions = model.transform(cf_test_data)
# rmse = evaluator.evaluate(predictions)

In [None]:
# als_threshold_rating = 3.5
# filtered_predictions = predictions.filter(predictions["prediction"] >= als_threshold_rating).withColumnRenamed("business_number", "recommended_business_number")

# recall = get_recall(cf_test_data, filtered_predictions, "user_number", "business_number")
# hit_rate = get_hit_rate(cf_test_data, filtered_predictions, "user_number", "business_number")
# print("Root Mean Squared Error (RMSE):", rmse)
# print("Avg Recall:", recall)
# print("Hit Rate:", hit_rate)

Root Mean Squared Error (RMSE): 1.1654594490668244
Avg Recall: 0.7148329583417334
Total Recall: 0.689601250977326
Hit Rate: 1.0


# Content-Based

In [None]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer
from pyspark.sql.window import Window

#Pure restaurant information
#restaurants

# Training reviews rated 4 stars or higher. These are used at the end to pick each user's
# liked restaurants, not for the cosine similarity calculation itself.
# User-level columns (other than user_id) are removed, because similarity is based on
# business attributes only.
remove_user_attr = ["fans",'friends','elite',"average_stars","user_number","user_name","user_review_count","yelping_since"]
review_4 = (
    train_data
    .filter(func.col("review_stars") >= 4)
    .drop(*remove_user_attr)
)

In [None]:
# Define input columns for each type
# String-valued columns (indexed and one-hot encoded by the pipeline defined in a later cell).
string_columns = ['categories', 'city', 'WiFi', 'RestaurantsAttire', 'Alcohol', 'NoiseLevel']
# Numeric columns (assembled into one vector and normalized by the pipeline).
numeric_columns = ['stars','review_count','RestaurantsPriceRange2']
# Flag columns: the `<map>_<key>` names come from the flattened nested attribute maps,
# the remaining names are top-level attributes.
boolean_columns = ['Ambience_romantic',
    'Ambience_casual',
    'Ambience_trendy',
    'Ambience_intimate',
    'Ambience_hipster',
    'Ambience_upscale',
    'Ambience_divey',
    'Ambience_touristy',
    'Ambience_classy',
    'BusinessParking_valet',
    'BusinessParking_lot',
    'BusinessParking_validated',
    'BusinessParking_garage',
    'BusinessParking_street',
    'BestNights_sunday',
    'BestNights_thursday',
    'BestNights_monday',
    'BestNights_wednesday',
    'BestNights_saturday',
    'BestNights_friday',
    'BestNights_tuesday',
    'GoodForMeal_lunch',
    'GoodForMeal_brunch',
    'GoodForMeal_dinner',
    'GoodForMeal_latenight',
    'GoodForMeal_dessert',
    'GoodForMeal_breakfast',
    'Music_no_music',
    'Music_dj',
    'Music_live',
    'Music_karaoke',
    'Music_video',
    'Music_background_music',
    'Music_jukebox',
    'GoodForDancing','RestaurantsReservations','OutdoorSeating','HasTV','RestaurantsTakeOut','BusinessAcceptsCreditCards','Open24Hours','HappyHour','DriveThru',
    'WheelchairAccessible','Corkage','Caters','DogsAllowed','RestaurantsGoodForGroups','RestaurantsTableService','BikeParking','RestaurantsDelivery','GoodForKids']
# Working copy of the liked-review rows, used for the variance check further down in this cell.
boolean_df = review_4.select("*")


# Convert field types on the restaurant table to the appropriate type.
to_num = ["RestaurantsPriceRange2"]
to_bool = ["GoodForDancing","RestaurantsReservations","OutdoorSeating","HasTV","RestaurantsTakeOut","BusinessAcceptsCreditCards","Open24Hours","HappyHour",
           "DriveThru","WheelchairAccessible","Corkage","Caters","DogsAllowed","RestaurantsGoodForGroups","RestaurantsTableService","BikeParking","RestaurantsDelivery","GoodForKids"]

# Numeric attributes: cast to int.
for c in to_num:
    as_int = func.col(c).cast("int")
    restaurants_drop = restaurants_drop.withColumn(c, as_int)

# Flag attributes: True only when the value equals the string "True"; anything else becomes False.
for c in to_bool:
    is_true = func.col(c) == "True"
    restaurants_drop = restaurants_drop.withColumn(c, func.when(is_true, True).otherwise(False))


# Remove fields with low variance.
# The flag columns in boolean_df are a mix of real booleans (flattened maps) and "True"/"False"
# strings (top-level attributes; only restaurants_drop was converted above). Casting such a
# string straight to double yields null, so cast through boolean first: both kinds become 1.0/0.0.
boolean_df = boolean_df.select(
    [func.col(c).cast('boolean').cast('double').alias(c) for c in boolean_columns]
)

bool_variances = boolean_df.select([func.variance(func.col(c)).alias(c) for c in boolean_df.columns])
bool_variances.show()

# Columns judged too uniform to help; removed from both the liked-review rows and the restaurant table.
remove_low_var = ["DriveThru",'Open24Hours',"Music_video","Music_no_music","Music_karaoke","GoodForDancing"]
review_4 = review_4.drop(*remove_low_var)
restaurants_drop = restaurants_drop.drop(*remove_low_var)

#Keeping some lower variance fields due to their ability to set restaurants apart - niche features users may be attracted to for a restaurant
#There are features above like karaoke or GoodForDancing that may be relevant for lounges or other types of businesses, but excluding, because we're focused more on dining / restaurants

In [None]:
# Update relevant boolean column list: the original flag columns minus the low-variance ones
# dropped in the previous cell. Deriving it (instead of repeating the full list) keeps the two
# lists in sync; the resulting order is unchanged.
boolean_columns = [c for c in boolean_columns if c not in remove_low_var]

In [None]:

# Names of the intermediate columns produced for each string feature.
indexed_cols = [name + "_indexed" for name in string_columns]
encoded_cols = [name + "_encoded" for name in string_columns]

# String feature encoding: one indexer per string column
string_indexers = [
    StringIndexer(inputCol=name, outputCol=indexed, handleInvalid="keep")
    for name, indexed in zip(string_columns, indexed_cols)
]

# One-hot encoding for the indexed string features
encoder = OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols, handleInvalid="keep")

# Numeric columns are assembled into one vector ...
numeric_assembler = VectorAssembler(
    inputCols=numeric_columns,
    outputCol="numeric_features",
    handleInvalid="keep",
)

# ... which is then passed through a Normalizer
normalizer = Normalizer(inputCol="numeric_features", outputCol="normalized_numeric_features")

# Final feature vector: normalized numerics, then the flag columns, then the one-hot vectors
assembler = VectorAssembler(
    inputCols=["normalized_numeric_features"] + list(boolean_columns) + encoded_cols,
    outputCol="features",
    handleInvalid="keep",
)

# Define pipeline: indexers first, then encoder, numeric assembly, normalization, final assembly
stages = string_indexers + [encoder, numeric_assembler, normalizer, assembler]
pipeline = Pipeline(stages=stages)

In [None]:
# Turn every flag column into 0/1: 1 when the flag is true, 0 otherwise (false or null).
for flag_col in boolean_columns:
    as_int = func.when(restaurants_drop[flag_col], 1).otherwise(0)
    restaurants_drop = restaurants_drop.withColumn(flag_col, as_int)

# Remaining nulls in the numeric and flag columns default to 0.
restaurants_drop = restaurants_drop.fillna(0, subset=numeric_columns).fillna(0, subset=boolean_columns)

In [None]:
# Fit the feature pipeline on the restaurant table and build the feature vectors.
# (For a quick trial run, fit on restaurants_drop.sample(...) instead of the full table.)
model = pipeline.fit(restaurants_drop)
transformed_df = model.transform(restaurants_drop)

transformed_df.show(truncate=False)

# Compute Cartesian product to get all pairs of businesses.
# Only the id and the feature vector are needed for the similarity calculation, so project
# first: the cross join then carries two columns per side instead of every restaurant column twice.
business_features = transformed_df.select("business_id", "features")
cartesian_df = business_features.crossJoin(
    business_features
    .withColumnRenamed("business_id", "business_id2")
    .withColumnRenamed("features", "features2")
)

In [None]:
# Dot product of the two feature vectors of each pair
dot_udf = func.udf(lambda left, right: float(left.dot(right)), "double")
dot_product_df = cartesian_df.withColumn("dot_product", dot_udf("features", "features2"))

# L2 magnitude of each side's feature vector
magnitude_udf = func.udf(lambda vec: float(vec.norm(2)), "double")
magnitude_df = (
    dot_product_df
    .withColumn("magnitude", magnitude_udf("features"))
    .withColumn("magnitude2", magnitude_udf("features2"))
)

# Cosine similarity = dot product / (|a| * |b|)
magnitude_product = func.col("magnitude") * func.col("magnitude2")
cosine_similarity_df = magnitude_df.withColumn("cosine_similarity", func.col("dot_product") / magnitude_product)

# Window for ranking the candidates of each business by descending similarity
window = Window.partitionBy("business_id").orderBy(func.col("cosine_similarity").desc())

In [None]:
# Keep the pairs of different businesses whose similarity reaches the threshold
similarity_threshold = 0.7
is_other_business = func.col("business_id") != func.col("business_id2")
is_similar_enough = func.col("cosine_similarity") >= similarity_threshold
top_similar_items = (
    cosine_similarity_df
    .filter(is_other_business)
    .select("business_id", "business_id2", "cosine_similarity")
    .where(is_similar_enough)
)

In [None]:
# Show the recommendations: print the first rows of the (business_id, business_id2, cosine_similarity) pairs
top_similar_items.show()

In [None]:
# Number of business pairs that passed the similarity threshold
top_similar_items.count()

In [None]:
# Ground truth: the held-out (user, business) pairs
ground_truth_df = test_data.select("user_id", "business_id")

# Recommendations: for each restaurant a user rated 4+ in training, the restaurants similar to it
recommended_df = (
    review_4.select("user_id", "business_id")
    .join(top_similar_items, on="business_id", how="left")
    .select("user_id", 'business_id2')
    .withColumnRenamed("business_id2", "recommended_business_id")
    .dropDuplicates()
)
# recommended_df.show()

# Score the recommendations
recall = get_recall(ground_truth_df, recommended_df, "user_id", "business_id")
#total_recall = get_total_recall(ground_truth_df, recommended_df, "user_id", "business_id")
hit_rate = get_hit_rate(ground_truth_df, recommended_df, "user_id", "business_id")

print("Avg Recall:", recall)
#print("Total Recall:", total_recall)
print("Hit Rate:", hit_rate)

# Open question: predictions are made only from reviews rated >= 4 (find restaurants similar to the ones
# someone rated >= 4), but the test / ground truth currently includes everything, lower ratings too. If we only
# recommend "good places", should we filter for positive reviews up front, before the train/test split?

# Do we want to do the content-based model? If we're able to process review text, we could add keyword trends /
# sentiment analysis as part of it.

Not sure if we have enough information on the business to make an informed decision - need to unnest some of the data, because all my fields are captured under attributes. If we have enough attributes, we can do content-based filtering and kNN to find "similar" restaurants.

Have issues with memory on my computer for the model fitting... Didn't try to do CV because it's already struggling

If we use attributes, use PCA to cut down dimensions

Try to run in Google Colab

Content-based with text analysis of reviews (TFIDF)

Content-based with attributes unnested

Schedule

Step 1: Dataset Overview
The Yelp dataset includes several types of information, but for our content-based system, we'll focus on:
Businesses: Information about individual businesses, including their categories (e.g., restaurants, bars).
Reviews: Textual reviews written by users for businesses.
Step 2: Data Preprocessing
Filter Restaurants: From the businesses data, keep only the businesses in the 'Restaurants' category.
Aggregate Reviews: For each restaurant, aggregate all its reviews into a single text document. This creates a comprehensive text representation of each restaurant's reviews.
Text Preprocessing: Clean and preprocess the aggregated review texts for each restaurant. This involves steps like converting to lowercase, removing punctuation and stop words, and possibly stemming or lemmatization.
Step 3: Feature Extraction
TF-IDF: Apply Term Frequency-Inverse Document Frequency (TF-IDF) vectorization to the preprocessed review texts. This converts the textual data into a numerical format that captures the importance of words in relation to the document they appear in and the entire corpus of review texts. The result is a sparse matrix where each row represents a restaurant and each column represents a term's TF-IDF score.
Step 4: User Profile Creation
Identify User Preferences: For the target user, identify a set of restaurants they have rated highly or reviewed positively.
Aggregate Preferred Reviews: Aggregate the review texts of these preferred restaurants to create a "preference profile" for the user. This text represents the kind of language and themes the user appears to favor in their preferred dining experiences.
Vectorize User Profile: Apply the same TF-IDF vectorization used on the restaurant reviews to this aggregated preference profile text. This results in a TF-IDF vector representing the user's preferences.
Step 5: Recommendation Generation
Calculate Similarity: Compute the similarity between the user's preference vector and each restaurant's TF-IDF vector. This can be done using cosine similarity, which measures the cosine of the angle between two vectors, providing a similarity score between 0 and 1.
Rank Restaurants: Rank the restaurants based on their similarity scores relative to the user's preference vector. Higher scores indicate a closer match to the user's preferences as inferred from their favored reviews.
Step 6: Provide Recommendations
Select Top-N: Select the top N restaurants with the highest similarity scores. These are the system's recommendations to the user, as they are the most similar to the user's expressed preferences based on past reviews.
Example Output
Assume our target user has shown a preference for restaurants with words like "cozy", "artisanal coffee", and "homemade pastries" frequently appearing in their positive reviews. The system might recommend restaurants whose aggregated reviews also frequently mention these terms, indicating a potential match in ambiance and offerings.
Considerations
Performance: Depending on the size of the dataset, steps like TF-IDF vectorization and similarity calculation can be computationally expensive. Techniques like dimensionality reduction or efficient similarity search algorithms can help mitigate this.
Cold Start Problem: New users or restaurants without sufficient reviews pose a challenge, as there's little data to base recommendations on. Hybrid approaches or content-based features not reliant on user interaction can help address this.
Diversity and Serendipity: Pure content-based systems might over-specialize in recommending items too similar to what the user has already experienced. Incorporating elements to introduce diversity and serendipity can enhance user satisfaction.

Performance - Use recall for content-based AND collaborative filtering; we also have RMSE


Content-based filtering is a recommendation technique that utilizes information about the items themselves to make recommendations. When evaluating the performance of a content-based filtering system, you can use various metrics to assess its effectiveness. Here are some common performance metrics for content-based filtering:

Precision and Recall: Precision measures the proportion of recommended items that are relevant to the user, while recall measures the proportion of relevant items that are recommended to the user. These metrics are typically calculated at different levels, such as precision@k and recall@k, where k is the number of recommended items.

F1 Score: The F1 score is the harmonic mean of precision and recall and provides a single metric that balances both precision and recall. It's calculated as $F_1 = 2 \times \dfrac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$.

Mean Average Precision (MAP): MAP measures the average precision across all users. It considers both the relevance of recommended items and their rank in the list of recommendations.

Normalized Discounted Cumulative Gain (NDCG): NDCG evaluates the ranking quality of recommended items by considering both relevance and rank. It penalizes items that are ranked lower in the list of recommendations.

Mean Reciprocal Rank (MRR): MRR is the average, across users, of the reciprocal of the rank at which the first relevant item appears in the list of recommendations (1 for rank 1, 1/2 for rank 2, and so on). It provides an indication of how quickly relevant items are found.

Coverage: Coverage measures the proportion of items in the catalog that are recommended to at least one user. It provides insights into the diversity of recommendations.

Novelty: Novelty measures the degree to which recommended items are different from those that the user has interacted with before. It helps ensure that recommendations introduce users to new and interesting items.

User Satisfaction: User satisfaction can be assessed through user feedback, surveys, or other qualitative measures. It provides insights into whether users find the recommendations useful and relevant.

When evaluating a content-based filtering system, it's essential to consider a combination of these metrics to gain a comprehensive understanding of its performance. Additionally, the choice of metrics may depend on the specific goals and characteristics of the recommendation system and the domain in which it operates.

Schedule
Haley to commit data to GitHub for Daemon

Due: Apr 27

Try converting to parquet file first, but it might impact nested json

Tues @ 9 PM EST - sync; have models ready with recall calculation for each model

Paper (can start Wed and finish initial draft by Fri and final edits to submit Sat); do we want to meet Fri or Sat before submitting to review or just review / comment async?
- Preprocessing
  - Text analysis
  - Train / test split
- Analyze / compare results - does one model do better at recall for certain types of restaurants than the other; what about general performance (recall)
