In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
from scipy.sparse import save_npz

In [3]:
# Load the pre-processed movie table used for content-based recommendations
# and preview its first rows as the cell output.
PROCESSED_MOVIES_PATH = r'C:\Projects\Recomender System\data\processed\movies_for_content_based.parquet'
df_movies = pd.read_parquet(PROCESSED_MOVIES_PATH)
df_movies.head()

Unnamed: 0,movieId,clean_title,clean_genres,text
0,296,Pulp Fiction,Comedy Crime Drama Thriller,Pulp Fiction Comedy Crime Drama Thriller
1,306,Three Colors: Red (Trois couleurs: Rouge),Drama,Three Colors: Red (Trois couleurs: Rouge) Drama
2,307,Three Colors: Blue (Trois couleurs: Bleu),Drama,Three Colors: Blue (Trois couleurs: Bleu) Drama
3,665,Underground,Comedy Drama War,Underground Comedy Drama War
4,899,Singin' in the Rain,Comedy Musical Romance,Singin' in the Rain Comedy Musical Romance


In [4]:
# Vectorise each movie's `text` column with TF-IDF: English stop words are
# removed and the vocabulary is limited to 5000 features.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['text'])

# Cell output: (rows in df_movies, vocabulary size).
tfidf_matrix.shape

(59047, 5000)

In [5]:
# Lookup table: movieId -> row position in df_movies / tfidf_matrix.
# Positions (not index labels) are stored because the result is used with
# positional indexing (`tfidf_matrix[idx]`, `df_movies.iloc[...]`).
indices = pd.Series(np.arange(len(df_movies)), index=df_movies['movieId'])

# `Series.drop_duplicates()` would compare the *values* (already unique row
# positions) and remove nothing; de-duplicate on the movieId labels instead so
# that `indices[movieId]` always yields a single scalar (first occurrence wins).
indices = indices[~indices.index.duplicated(keep='first')]

In [6]:
def recommend_movies(movieId, top_n=10):
    """Recommend the movies most similar to `movieId` by TF-IDF cosine similarity.

    Relies on the notebook-level objects `indices` (movieId -> row lookup),
    `tfidf_matrix` (one TF-IDF row per movie) and `df_movies`.

    Parameters
    ----------
    movieId : hashable
        Identifier looked up in `indices`.
    top_n : int or None, default 10
        Maximum number of recommendations. Negative values are treated as 0;
        `None` returns every other movie, ranked.

    Returns
    -------
    pandas.DataFrame or str
        The `movieId`, `clean_title` and `clean_genres` columns of the most
        similar movies, most similar first. If `movieId` is unknown, a
        human-readable message string is returned instead (kept for backward
        compatibility with existing callers).
    """
    if movieId not in indices:
        return f"MovieId {movieId} not found in the dataset."

    # If the same movieId occurs more than once, the lookup yields several
    # rows instead of a scalar; use the first match so the steps below always
    # work with a single row position.
    idx = int(np.ravel(indices[movieId])[0])

    # Similarity of the query movie against every movie (including itself).
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]

    # Rank all rows from most to least similar, then drop the query itself.
    ranked = np.argsort(sim_scores)[::-1]
    ranked = ranked[ranked != idx]

    # A negative top_n would slice from the end and return almost the whole
    # catalogue; clamp it so the result is simply empty.
    if top_n is not None:
        top_n = max(top_n, 0)
    top_indices = ranked[:top_n]

    return df_movies.iloc[top_indices][['movieId', 'clean_title', 'clean_genres']]

In [7]:
# Demo: the ten titles most similar to movieId 1.
recommend_movies(movieId=1, top_n=10)

Unnamed: 0,movieId,clean_title,clean_genres
174,3114,Toy Story 2,Adventure Animation Children Comedy Fantasy
6648,201588,Toy Story 4,Adventure Animation Children Comedy
652,78499,Toy Story 3,Adventure Animation Children Comedy Fantasy IMAX
9272,120474,Toy Story That Time Forgot,Animation Children
13336,106022,Toy Story of Terror,Animation Children Comedy
13458,4929,"Toy, The",Comedy
15,2161,"NeverEnding Story, The",Adventure Children Fantasy
30762,169092,The Story of the Voyages,Adventure Children Fantasy
49516,153234,Toy Reanimator,Fantasy Sci-Fi
15680,115879,Toy Story Toons: Small Fry,Adventure Animation Children Comedy Fantasy


In [9]:
# Persist everything needed to serve recommendations later: the fitted
# vectoriser (pickle), the sparse TF-IDF matrix (.npz) and the movie table
# (parquet, written without the index).
VECTORIZER_PATH = r"C:\Projects\Recomender System\models\tfidf_vectorizer.pkl"
MATRIX_PATH = r"C:\Projects\Recomender System\models\tfidf_matrix.npz"
SAVED_MOVIES_PATH = r'C:\Projects\Recomender System\models\movies_for_content_based.parquet'

with open(VECTORIZER_PATH, "wb") as f:
    pickle.dump(tfidf, f)

save_npz(MATRIX_PATH, tfidf_matrix)
df_movies.to_parquet(SAVED_MOVIES_PATH, index=False)