### Content filter
- Items with same characteristics: watch 1 Tom Hanks movie, get more Tom Hanks movies recommended
- Requires item attribute info

### Collaborative filter:
- Similar items that user has liked: See movie about dinosaurs, see action or adventures moves recommended
- Requires user-item interaction data


In [198]:
import polars as pl

movies_df = pl.read_csv("ml-latest/movies.csv")

ratings_df = pl.read_csv("ml-latest/ratings.csv")

tags_df = pl.read_csv("ml-latest/tags.csv")

movies_df.shape, ratings_df.shape, tags_df.shape

((86537, 3), (33832162, 4), (2328315, 4))

In [None]:

movies_df = (
    movies_df.with_columns(
        
        pl.col("genres").str.replace_all(r"\|", ", "),          # Replace pipe "|" separators with commas (no list splitting)
        pl.col("title").str.extract_groups(
            r'^(?P<title>.*?)\s*\((?P<year>\d{4})\)\s*$'        # Extract title and year from the "title" column
        )
    )
    .unnest("title")                                            # Split into "title" and "year" columns
    .with_columns(

        pl.col("year").cast(pl.Int32, strict=False)
    )
)


movies_df = movies_df[:-17]                                     # cut off tail to be merged with other dfs

movies_df

movieId,title,year,genres
i64,str,i32,str
1,"""Toy Story""",1995,"""Adventure, Animation, Children…"
2,"""Jumanji""",1995,"""Adventure, Children, Fantasy"""
3,"""Grumpier Old Men""",1995,"""Comedy, Romance"""
4,"""Waiting to Exhale""",1995,"""Comedy, Drama, Romance"""
5,"""Father of the Bride Part II""",1995,"""Comedy"""
…,…,…,…
288923,"""Maximum Truth""",2023,"""Comedy"""
288927,,,"""Comedy"""
288931,"""V for Vengeance""",2022,"""Action, Horror"""
288935,"""Kosovolove""",2000,"""(no genres listed)"""


In [None]:
tags_df = tags_df.with_columns(pl.from_epoch(pl.col("timestamp"), time_unit="s"))


tags_df = (
    tags_df
    .group_by("movieId")
    .agg(

        pl.col("tag").str.concat(", ").alias("tag_list")
    )
    .sort("movieId")
)


tags_df = tags_df[:-1]                                                          # cut off to better merge with other dfs

tags_df

movieId,tag_list
i64,str
1,"""animation, friendship, toys, a…"
2,"""animals, based on a book, fant…"
3,"""sequel, moldy, old, old age, o…"
4,"""characters, chick flick, girl …"
5,"""family, pregnancy, wedding, 4t…"
…,…
288753,"""deep web, horror, thriller"""
288765,"""post-apocalyptic, survival, tw…"
288779,"""Don Camillo Series"""
288849,"""addiction, animation, short fi…"


In [None]:

ratings_df = (
    ratings_df
    .group_by("movieId")
    .agg(
        pl.col("rating").mean().alias("average_rating")
    )
    .sort("movieId")
)

ratings_df   

movieId,average_rating
i64,f64
1,3.893508
2,3.278179
3,3.171271
4,2.868395
5,3.076957
…,…
288967,3.5
288971,0.5
288975,4.0
288977,3.0


In [None]:
combined_df = (
    tags_df
    .join(ratings_df, on="movieId")
    .join(movies_df, on="movieId")
    .drop_nulls()
    .sort("movieId")
)

combined_df = combined_df.filter(pl.col("genres") != "(no genres listed)")

In [204]:
def text_representation(row: dict) -> str:
    return (
        f"""Title: {row['title']},
Year: {row['year']}, 
Genre: {row['genres']}, 
Tags: {row['tag_list']},
Ratings: {row['average_rating']}"""
    )

In [None]:
combined_df = combined_df.with_columns([
    pl.struct(["tag_list", "average_rating", "title", "year", "genres"])
      .map_elements(lambda x: text_representation(x))  
      .alias("embedding_text")
])

  combined_df = combined_df.with_columns([


In [None]:

combined_df


combined_df.write_csv("clean_data.csv")


In [None]:
combined_df = combined_df.filter((pl.col("year") >= 1950) & (pl.col("average_rating") >= 3.0) )

combined_df.shape


(27646, 7)

In [208]:
combined_df["embedding_text"][3]

"Title: Father of the Bride Part II,\nYear: 1995, \nGenre: Comedy, \nTags: family, pregnancy, wedding, 4th wall, aging, baby, daughter, Diane Keaton, family, father, father - child relationship, fourth wall, growing old, gynecologist, heartwarming, humorous, midlife crisis, narration, parent child relationship, pregnancy, regret, seen 2021, seen more than once, sentimental, sequel, Steve Martin, humorous, steve martin, childhood classics, it thought it was funny but it wasn't, watched under duress, worst movies ever, Diane Keaton, family, sequel, Steve Martin, wedding, steve martin, steve martin, childhood classics, steve martin, Comedy, Diane Keaton, family, pregnancy, steve martin, Touching, sequel fever, CLV, Steve Martin, aging, baby, confidence, contraception, daughter, gynecologist, midlife crisis, parent child relationship, pregnancy, pregnancy, remake, Comedy, Touching, steve martin, Steve Martin, family, Steve Martin, Fantasy,\nRatings: 3.0769571546104677"

In [209]:
import faiss
import numpy as np

dim = 384
index = faiss.IndexFlatL2(dim)
X = np.zeros((len(combined_df["embedding_text"]), dim), dtype="float32")



In [None]:
from sentence_transformers import SentenceTransformer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"                     # Enable GPU acceleration
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

model = model.half()                                                        # Reduces memory usage by 50%

batch_size = 128                                                           


loader_params = {
    "batch_size": batch_size,
    "num_workers": 12,        
    "pin_memory": True,      
    "persistent_workers": True
}


embeddings = model.encode(
    combined_df["embedding_text"].to_numpy(),
    **loader_params,
    convert_to_numpy=True,   
    show_progress_bar=True,
    normalize_embeddings=True  
)
# Add to index
index.add(embeddings)

Batches: 100%|██████████| 216/216 [00:13<00:00, 16.39it/s]


In [211]:
# Save vector database

faiss.write_index(index, "index-vdb")



In [212]:
# import faiss

# # Load vector database

# index = faiss.read_index("index-vdb")



In [213]:
df = combined_df

df.filter(pl.col("title").str.contains("Shutter"))

movieId,tag_list,average_rating,title,year,genres,embedding_text
i64,str,f64,str,i32,str,str
45003,"""easily confused with other mov…",3.423658,"""Shutter""",2004,"""Fantasy, Horror, Mystery, Thri…","""Title: Shutter, Year: 2004, G…"
74458,"""Leonardo DiCaprio, psychologic…",4.002853,"""Shutter Island""",2010,"""Drama, Mystery, Thriller""","""Title: Shutter Island, Year: 2…"
127632,"""curse, fire, lighthouse, mill,…",3.0,"""The Shuttered Room""",1967,"""Horror""","""Title: The Shuttered Room, Yea…"
149026,"""car driver, city, friendship, …",3.6,"""Shutter""",2013,"""Thriller""","""Title: Shutter, Year: 2013, G…"


In [214]:
fav_movie = df.filter(pl.col("movieId") == 74458)
fav_movie.glimpse()

Rows: 1
Columns: 7
$ movieId        <i64> 74458
$ tag_list       <str> "Leonardo DiCaprio, psychological, twist ending, asylum, mystery, story, twist ending, mystery, thought-provoking, twist ending, plot twist, psychological, twist ending, clever, mindfuck, stylized, thought-provoking, twist ending, cinematography, ending twist, Leonardo DiCaprio, Martin Scorsese, mindfuck, shocking ending, thought-provoking, twist ending, World War II, action, ending twist, Predictable, psychological, story, too long, Leonardo DiCaprio, Martin Scorsese, mental illness, mystery, plot twist, predictable, asylum, atmospheric, cinematography, clever, ending twist, insanity, intense, Leonardo DiCaprio, Mental Institution, mentali illness, mindfuck, mystery, plot twist, psychological, Psychological Thriller, reality or imagination?, story, stylized, thought-provoking, too long, twist ending, World War II, psychological, twist ending, twist ending, acting, asylum, atmospheric, Ben Kingsley, child killing, c

In [None]:

embedding = model.encode(
    [fav_movie["embedding_text"].item()],  
    convert_to_tensor=False
).astype('float32')



D, I = index.search(embedding, 5)



In [None]:

best_indices = I.flatten().tolist()


best_matches = df.select(
    pl.col("embedding_text").gather(best_indices)
).to_series().to_numpy()

In [217]:
for match in best_matches:
    print("NEXT MOVIE")
    print(match)
    print()

NEXT MOVIE
Title: Shutter Island,
Year: 2010, 
Genre: Drama, Mystery, Thriller, 
Tags: Leonardo DiCaprio, psychological, twist ending, asylum, mystery, story, twist ending, mystery, thought-provoking, twist ending, plot twist, psychological, twist ending, clever, mindfuck, stylized, thought-provoking, twist ending, cinematography, ending twist, Leonardo DiCaprio, Martin Scorsese, mindfuck, shocking ending, thought-provoking, twist ending, World War II, action, ending twist, Predictable, psychological, story, too long, Leonardo DiCaprio, Martin Scorsese, mental illness, mystery, plot twist, predictable, asylum, atmospheric, cinematography, clever, ending twist, insanity, intense, Leonardo DiCaprio, Mental Institution, mentali illness, mindfuck, mystery, plot twist, psychological, Psychological Thriller, reality or imagination?, story, stylized, thought-provoking, too long, twist ending, World War II, psychological, twist ending, twist ending, acting, asylum, atmospheric, Ben Kingsley, c

In [218]:
ddd = pl.read_csv("normalized_tags.csv")
ddd = ddd["cleaned_tags"].unique()
dd = pl.col("cleaned_tags").is_not_null()
dd



# dd.write_csv("normalized_tags.csv")


In [221]:
df.write_csv("clean_data.csv")