In [1]:
import json

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the two raw inputs: ratings from a CSV file, reviews from a JSON Lines
# file (lines=True reads one JSON object per line).
ratings = pd.read_csv(filepath_or_buffer='ml-32m/ratings.csv')
reviews = pd.read_json(path_or_buf='movie_dataset_public_final/raw/reviews.json', lines=True)

In [4]:
# Preview the ratings table (head() shows the first five rows by default)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [5]:
# Distinct movie ids that appear in the ratings table
ratings['movieId'].unique()

array([    17,     25,     29, ..., 175771, 157917, 274343])

In [6]:
# Movie ids present in BOTH datasets: reviews.item_id and ratings.movieId.
# `aa` holds that shared set and is used below to filter both tables.
aa = set(reviews.item_id.unique()).intersection(ratings.movieId.unique())

In [7]:
# Keep only the rows whose movie id is in `aa`, i.e. movies that appear in
# both the reviews and the ratings data.
ratings = ratings.loc[ratings['movieId'].isin(aa)]
reviews = reviews.loc[reviews['item_id'].isin(aa)]

In [8]:
# Persist the filtered ratings as CSV, leaving out the DataFrame index
ratings.to_csv(path_or_buf='ratings_common.csv', index=False)

In [9]:
# Preview the filtered reviews (head() shows the first five rows by default)
reviews.head()

Unnamed: 0,item_id,txt
0,172063,"one-shot record of a belly dancer; ""Carmencita..."
2,7065,unbelievable; I cannot understand how anyone c...
3,3739,I'm still starry-eyed from it; I saw this last...
4,1562,Failed on every Front; Joel Schumacher who did...
5,8977,"Poor Casting, Poor script Poor direction.; I h..."


In [10]:
# Collapse the reviews to one row per movie: `txt` becomes the list of that
# movie's review texts, truncated to its first 50 entries.
reviews = (
    reviews.groupby('item_id')['txt']
    .apply(lambda texts: texts.tolist()[:50])
    .reset_index()
)

In [11]:
# Extracting embeddings for each review and taking the average of all reviews for each movie
def get_embedding(review, model):
    """Return the mean embedding of one movie's reviews.

    Args:
        review: A non-empty list of review texts, or a single review string.
        model: Object exposing ``encode(sentences)`` that returns an array of
            embeddings, one row per sentence (here a ``SentenceTransformer``).

    Returns:
        list: Element-wise mean of the review embeddings as plain Python floats.

    Raises:
        ValueError: If ``review`` is an empty sequence, since there is nothing
            to average.
    """
    # A bare string makes encode() return a 1-D vector; averaging that over
    # axis 0 would collapse the embedding to a single scalar. Wrapping it in a
    # list keeps the result an (n_reviews, dim) matrix.
    if isinstance(review, str):
        review = [review]

    # Fail loudly instead of silently producing NaN for a movie with no text.
    if len(review) == 0:
        raise ValueError("get_embedding() needs at least one review to average")

    embeds = model.encode(review)

    # Average across reviews (axis 0) and convert to a JSON/CSV-friendly list.
    return np.mean(embeds, axis=0).tolist()

# Leave the device unset so sentence-transformers selects it automatically
# (a GPU when one is usable). Hard-coding device='cuda' makes this cell fail
# on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
from tqdm import tqdm

In [13]:
# Register tqdm's progress_apply on pandas objects, then compute one averaged
# embedding per movie from its list of review texts.
tqdm.pandas()
reviews["embed"] = reviews['txt'].progress_apply(lambda texts: get_embedding(texts, model))

100%|██████████| 46236/46236 [15:11<00:00, 50.73it/s] 


In [15]:
# Write the embeddings to CSV, leaving out the raw review text (`txt`) and
# the DataFrame index.
reviews.drop(labels='txt', axis='columns').to_csv(path_or_buf='reviews_embeddings.csv', index=False)

In [None]:
# Restore saved embeddings file and check the first 5 rows.
# CSV stores each embedding list as text, so without a converter the `embed`
# column is loaded as plain strings. json.loads turns each cell back into a
# Python list of floats.
saved_embeddings = pd.read_csv('reviews_embeddings.csv', converters={'embed': json.loads})

saved_embeddings.head(5)

Unnamed: 0,item_id,embed
0,1,"[-0.06131595000624657, -0.026167748495936394, ..."
1,2,"[-0.03323860466480255, 0.018439408391714096, 0..."
2,3,"[-0.05272774398326874, -0.05238880217075348, 0..."
3,4,"[-0.04218399152159691, -0.039693575352430344, ..."
4,5,"[-0.06244966387748718, -0.024950290098786354, ..."
