In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import joblib

In [2]:
df = pd.read_csv('movies_with_ratings.csv')
df.head()

Unnamed: 0,movieId,title,genres,avg_rating,num_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7.0
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49.0


In [4]:
tfidf = TfidfVectorizer(token_pattern=r'[a-zA-Z]+')
genre_matrix = tfidf.fit_transform(df['genres'])

In [5]:
genre_similarity = cosine_similarity(genre_matrix)

In [8]:
# test
def recommend_movies(movie_title, df=df):
    # movie index
    movie_idx = df[df['title'].str.contains(movie_title, case=False, regex=False)].index
    if len(movie_idx) == 0:
        return f"No movies found for '{movie_title}'."
    movie_idx = movie_idx[0]

    sim_scores = linear_kernel(genre_matrix[movie_idx], genre_matrix).flatten()
    top_indices = sim_scores.argsort()[-6:-1][::-1]

    recommendations = [df.iloc[i]['title'] for i in top_indices]
    return recommendations

In [9]:
recommend_movies("Toy Story")

["Emperor's New Groove, The (2000)",
 'Antz (1998)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Wild, The (2006)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)']

In [11]:
joblib.dump(tfidf, 'models/tfidf_vectorizer.joblib')

['models/tfidf_vectorizer.joblib']

In [12]:
joblib.dump(genre_matrix, 'models/genre_matrix.joblib')

['models/genre_matrix.joblib']