In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed

DATASET_PATH = "D:\\Datasets\\MovieLens32M\\"

# disregard pandas warnings
pd.options.mode.chained_assignment = None

  from tqdm.autonotebook import tqdm


review embeddings are gathered in review_embeddings.ipynb notebook

In [2]:
ratings = pd.read_csv(DATASET_PATH + "ratings.csv").drop(columns=["timestamp"])
review_embeddings = pd.read_csv(DATASET_PATH + "reviews_embeddings.csv").rename(columns={"item_id": "movieId"})

In [3]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0
...,...,...,...
32000199,200948,79702,4.5
32000200,200948,79796,1.0
32000201,200948,80350,0.5
32000202,200948,80463,3.5


In [4]:
review_embeddings.head()

Unnamed: 0,movieId,embed
0,1,"[-0.06131595000624657, -0.026167748495936394, ..."
1,2,"[-0.03323860466480255, 0.018439408391714096, 0..."
2,3,"[-0.05272774398326874, -0.05238880217075348, 0..."
3,4,"[-0.04218399152159691, -0.039693575352430344, ..."
4,5,"[-0.06244966387748718, -0.024950290098786354, ..."


In [5]:
# filter the rating id to only include the ones that have a review
ratings = ratings[ratings["movieId"].isin(review_embeddings["movieId"])]

In [6]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0
...,...,...,...
32000199,200948,79702,4.5
32000200,200948,79796,1.0
32000201,200948,80350,0.5
32000202,200948,80463,3.5


In [7]:
def parse_embedding(embedding_str):
    return np.array(embedding_str[1:-1].split(", ")).astype(np.float32)

review_embeddings["embed"] = review_embeddings["embed"].apply(parse_embedding)

In [8]:
review_embeddings.loc[0, "embed"]

array([-6.13159500e-02, -2.61677485e-02,  5.45579642e-02, -4.91172560e-02,
       -2.68552825e-02,  4.10389788e-02,  3.97612080e-02,  2.52327379e-02,
        1.45071447e-02,  1.43586402e-03, -1.30720213e-02,  5.53099588e-02,
        3.63318026e-02,  4.30244654e-02,  1.92954149e-02,  3.60613689e-02,
        7.04567432e-02, -5.21029755e-02,  4.23779860e-02, -4.90425192e-02,
        4.40297574e-02, -1.99645553e-02,  6.27430901e-02, -1.28555410e-02,
       -4.43590209e-02,  3.73669825e-02, -1.03510545e-04, -2.79518254e-02,
       -6.40946031e-02, -3.06767169e-02,  2.54042037e-02,  5.00780717e-02,
       -9.11142910e-04, -4.24804129e-02, -3.08895260e-02,  3.50595601e-02,
        2.75797416e-02, -4.52351570e-02, -2.52276994e-02, -2.22335272e-02,
       -2.36212127e-02,  3.00305150e-02,  1.28817931e-02, -4.07569408e-02,
        1.65556874e-02, -4.29368056e-02, -2.07180865e-02, -1.05340384e-01,
        3.03175207e-02,  1.28169302e-02, -4.25188169e-02, -5.66814793e-03,
        7.89260641e-02, -

### Recommendation with item-item collaborative filtering using embeddings as similarity criteria

In [9]:
train_set, test_set = train_test_split(ratings, random_state=42)

In [10]:
# caching several variables to speed up the process
user_means = train_set.groupby("userId")["rating"].mean().to_dict()
movie_means = ratings.groupby("movieId")["rating"].mean().to_dict()
ratings_mean = ratings["rating"].mean()
movie_embeddings = review_embeddings.set_index("movieId")["embed"].to_dict()

In [17]:
def predict(row, k=10, hybrid=False):
    user_id = row["userId"]
    movie_id = row["movieId"]

    # global prediction
    user_mean = user_means.get(user_id, ratings_mean)
    movie_mean = movie_means.get(movie_id, ratings_mean)
    user_deviation = ratings_mean - user_mean
    movie_deviation = ratings_mean - movie_mean
    baseline = ratings_mean + user_deviation + movie_deviation

    # local prediction
    user_movies = train_set[train_set["userId"] == user_id]
    if len(user_movies) >= k and movie_id in movie_embeddings:
        movie_embedding = np.array(movie_embeddings[movie_id])
        user_movie_embeddings = review_embeddings[
            review_embeddings["movieId"].isin(user_movies["movieId"])
        ].copy()
        user_movie_embeddings["similarity"] = np.dot(
            np.vstack(user_movie_embeddings["embed"].values), movie_embedding
        )
        top_k_movies = user_movie_embeddings.nlargest(k, "similarity").merge(user_movies, on="movieId")
        local_prediction = np.average(top_k_movies["rating"], weights=top_k_movies["similarity"])

        if hybrid:
            prediction = 0.3 * baseline + 0.7 * local_prediction
        else:
            prediction = local_prediction
    else:
        prediction = baseline

    return prediction, row["rating"]


### Experiments

k=10, hybrid=False

In [None]:
k = 10
predictions = []
labels = []

for _, row in tqdm(test_set.iterrows(), total=len(test_set)):
    prediction, label = predict(row, k=k)
    predictions.append(prediction)
    labels.append(label)

In [14]:
print("Results:")
print("RMSE:", np.sqrt(np.mean((np.array(predictions) - np.array(labels)) ** 2)))
print("MAPE:", np.mean(np.abs(np.array(predictions) - np.array(labels)) / np.array(labels)))

Results:
RMSE: 0.8854088741832823
MAPE: 0.2947690360080843


k=10, hybrid=True

In [18]:
k = 10
predictions = []
labels = []

for _, row in tqdm(test_set.iterrows(), total=len(test_set)):
    prediction, label = predict(row, k=k, hybrid=True)
    predictions.append(prediction)
    labels.append(label)

100%|██████████| 30984/30984 [16:28<00:00, 31.35it/s]


In [19]:
print("Results:")
print("RMSE:", np.sqrt(np.mean((np.array(predictions) - np.array(labels)) ** 2)))
print("MAPE:", np.mean(np.abs(np.array(predictions) - np.array(labels)) / np.array(labels)))

Results:
RMSE: 0.9836644005799349
MAPE: 0.35131077268010785
