In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movies_path = 'dataset/movies_5000.csv'
ratings_path = 'dataset/ratings_5000.csv'

movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('distilbert-base-nli-mean-tokens').to(device)

In [7]:
names_and_genres = movies['title'].astype(str) + ',' + movies['AllGenres'].astype(str)
embedding = model.encode(names_and_genres, show_progress_bar=True)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [9]:
emb_arr = np.array(embedding)
cos_similarity = pd.DataFrame(cosine_similarity(emb_arr))

In [8]:
def recommend_similar_movies(cos_similarity, movie_index, k):
    all_index = cos_similarity.iloc[movie_index, :].sort_values(ascending=False).index.to_list()
    rec_index = all_index[1: k+1]
    rec_movie = movies['title'].iloc[rec_index]

    print(f'Watched movie: {movies.title.iloc[movie_index]} \t Genres: {movies.AllGenres.iloc[movie_index]}')
    print(f'Top {k} recommendations:')

    for m, movie in enumerate(rec_movie):
        print(f'{m+1}. {movie} \t Genres: {movies.AllGenres.iloc[rec_index[m]]}')

In [20]:
def recommend_with_history(history, model, movies, embeddings, k):
    
    movies_history = movies[movies['title'].isin(history)]
    movies_remaining = movies[~movies['title'].isin(history)]
    
    movies_history_emb = np.array([model.encode(title + ',' + genres) for title, genres in zip(movies_history['title'], movies_history['AllGenres'])])

    combined_emb = np.mean(movies_history_emb, axis=0)
    
    movies_remaining_emb = embeddings[movies_remaining.index]
    
    cos_simi = cosine_similarity([combined_emb], movies_remaining_emb)
    
    top_indices = np.argsort(-cos_simi[0])[:k]
    recommend_movies = movies_remaining.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    
    for index, movie in recommend_movies.iterrows():
        print(f'{index+1}. {movie["title"]} \t Genres: {movie["AllGenres"]}')

In [21]:
recommend_similar_movies(cos_similarity, movie_index=2500, k=5)

Watched movie: Behind Enemy Lines III: Colombia 	 Genres: War,Action,Thriller
Top 5 recommendations:
1. Bloodsport III 	 Genres: Action,Thriller
2. Man Hunt 	 Genres: Drama,Thriller,War
3. Echoes Of War 	 Genres: Western,Drama,Thriller
4. Brothers 	 Genres: Drama,Thriller,War
5. War of the Arrows 	 Genres: Drama,Action,History,Thriller


In [22]:
history = ['Bloodsport III', 'Man Hunt', 'Echoes Of War', 'Brothers', 'War of the Arrows']

recommend_with_history(history, model, movies, emb_arr, 5)

Top 5 recommendations:
3557. Zaytoun 	 Genres: Drama,War,Adventure,Thriller
2495. Battleground 	 Genres: Drama,Action,Thriller
1808. Merrill's Marauders 	 Genres: Action,Drama,War
2795. Kajaki 	 Genres: Thriller,War,Adventure,Drama
3440. Trigger Man 	 Genres: Action,Drama,Thriller,Horror
