In [1]:
import os
from pathlib import Path
from tqdm.auto import tqdm
import json
import pickle as p

# Directory holding the MovieLens CSV files (ratings.csv, movies.csv, links.csv),
# taken from the environment so no machine-specific path lives in the notebook.
_dataset_dir = os.getenv("MOVIE_LENS_DATASET")
if _dataset_dir is None:
    # os.getenv returns None for an unset variable; Path(None) would then fail
    # with a TypeError that does not mention the variable name.
    raise RuntimeError(
        "Environment variable MOVIE_LENS_DATASET is not set; "
        "point it at the directory containing the MovieLens CSV files."
    )
path = Path(_dataset_dir)

In [2]:
import pandas as pd
import numpy as np

# Read the three CSV tables from the dataset directory.
ratings = pd.read_csv(path / "ratings.csv")
movies = pd.read_csv(path / "movies.csv")
links = pd.read_csv(path / "links.csv")

# Add the columns of `links` to every movie (left join keeps all movies).
movies = movies.merge(links, how="left", on="movieId")

# Per-movie mean rating, doubled, as a one-column frame indexed by movieId.
avg_rating = (
    ratings.groupby("movieId")["rating"]
    .mean()
    .mul(2)
    .to_frame("avg_rating")
)

# Default (inner) join: only movies that have an average rating remain.
movies = movies.merge(avg_rating, on="movieId")

In [3]:
# Work on a copy so the raw `ratings` frame is left untouched.
df = ratings.copy()

# Double the ratings and store them as integers; the assert guarantees that
# the integer cast does not lose information.
df["rating"] = df["rating"] * 2
assert df["rating"].eq(df["rating"].astype(int)).all()
df["rating"] = df["rating"].astype(int)

# Hyperparameter: minimum doubled rating that counts as a "like".
# Alternatives: 6, 7, 8, 10.
rating_threshold = 9

# Fraction of all ratings that reach the threshold.
rating_mask = df["rating"].ge(rating_threshold)
print("rating_mask.mean():", rating_mask.mean())

# Keep only the ratings at or above the threshold, highest rating first.
df = (
    df.loc[rating_mask]
    .sort_values("rating", ascending=False)
    .reset_index(drop=True)
)

# Number of kept ratings per movie, and which rows of `movies` have any.
num_votes = df["movieId"].value_counts()
has_vote = movies["movieId"].isin(num_votes.index)

# Fraction of movies with at least one kept rating.
has_vote.mean()

rating_mask.mean(): 0.23885428900464595


np.float64(0.5145424620670599)

In [4]:
# Attach to each movie the number of kept ratings it received (0 if none).
# `num_votes` is indexed by movieId, so mapping the movieId column through it
# aligns by value. This avoids the positional `has_vote` mask, which no longer
# lines up with `movies` once this cell has re-sorted and re-indexed the frame,
# and so keeps the cell safe to re-run.
movies["num_votes"] = movies["movieId"].map(num_votes).fillna(0).astype("int64")

# Most-voted movies first.
movies = movies.sort_values("num_votes", ascending=False).reset_index(drop=True)
# Index by movieId for label lookups while keeping movieId as a regular column.
movies = movies.set_index("movieId", drop=False)

In [5]:
# Encode users as integer category codes, one entry per row of `df`.
userId = df["userId"].astype("category").cat.codes.to_numpy(np.int64)

# Encode movies the same way; the categorical is kept so that its categories
# can be turned into a raw-movieId -> code lookup table.
movieId_cat = df["movieId"].astype("category")
movieId2idx = dict(
    zip(movieId_cat.cat.categories, range(len(movieId_cat.cat.categories)))
)
movieId = movieId_cat.cat.codes.to_numpy(np.int64)

# Sanity check for the unfiltered case only: with a threshold of 0 every
# movie in `movies` is expected to have an entry in the lookup table.
if rating_threshold == 0:
    assert all(raw_id in movieId2idx for raw_id in movies["movieId"])

# Integer ratings, row-aligned with `userId` and `movieId`.
rating = df["rating"].to_numpy(np.int64)

In [6]:
# Re-key the movie table by the integer codes used in the `movieId` array:
# translate each raw movieId through `movieId2idx` (-1 when it has no code),
# drop the untranslated rows, then order and index the rest by the code.
movies_normalized = (
    movies.copy()
    .reset_index(drop=True)
    .assign(
        movieId=lambda frame: frame["movieId"].map(
            lambda raw_id: movieId2idx.get(raw_id, -1)
        )
    )
    .loc[lambda frame: frame["movieId"] >= 0]
    .sort_values("movieId", ignore_index=True)
    .set_index("movieId")
)

# The index must be exactly 0, 1, ..., len - 1, so that row i describes code i.
assert np.array_equal(movies_normalized.index, np.arange(len(movies_normalized)))

movies_normalized

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,avg_rating,num_votes
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.787015,25417
1,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,6.556357,3531
2,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,6.342541,1673
3,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,5.736790,225
4,Father of the Bride Part II (1995),Comedy,113041,11862.0,6.153914,1364
...,...,...,...,...,...,...
42825,Night Train (2023),Action|Crime|Thriller,15727212,1007427.0,10.000000,1
42826,Farewell Mister Haffmann (2022),Drama,10545704,670243.0,9.000000,1
42827,V for Vengeance (2022),Action|Horror,15745084,982543.0,9.000000,1
42828,Wedding Season (2023),Comedy|Romance,27689885,1119748.0,8.500000,1


In [7]:
# Cache the preprocessed data as a single pickle file.
os.makedirs("../cached_data/", exist_ok=True)

# A context manager closes (and therefore flushes) the file deterministically
# instead of leaving an anonymous handle for the garbage collector.
with open("../cached_data/movie_lens_preprocessed.p", "wb") as cache_file:
    p.dump({
        # Row-aligned arrays, one entry per kept rating.
        "userId": userId,
        "movieId": movieId,
        "rating": rating,

        # Movie table indexed by the same integer codes as `movieId`.
        "movies": movies_normalized,
    }, cache_file)