Content-Based Filtering

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display
import ipywidgets as widgets

# ---------------------------------------------------------------------------
# Configuration: input locations and the catalogue-size cap.
# ---------------------------------------------------------------------------
CONTENT_CSV = "/content/cleaned_movies_content.csv"
RATINGS_CSV = "/content/cleaned_ratings.csv"
MAX_MOVIES = 5000  # cosine_sim below is dense, at most MAX_MOVIES x MAX_MOVIES

content_df = pd.read_csv(CONTENT_CSV)
ratings_df = pd.read_csv(RATINGS_CSV)

# Keep only movies whose `id` occurs as a `movieId` in the ratings table, take
# the first MAX_MOVIES of them, and renumber rows 0..n-1 so that DataFrame
# labels coincide with row positions in the TF-IDF and similarity matrices.
rated_ids = ratings_df['movieId'].unique()
content_df = content_df[content_df['id'].isin(rated_ids)].copy()
content_df = content_df.head(MAX_MOVIES).reset_index(drop=True)

# Missing descriptions become empty documents.
content_df['content'] = content_df['content'].fillna('')

tfidf = TfidfVectorizer(
    stop_words='english',   # drop common English words
    max_df=0.8,             # ignore terms present in more than 80% of documents
    ngram_range=(1, 1),     # unigrams only
    max_features=10000,     # cap the vocabulary size
    norm='l2'               # unit-length rows
)

# One TF-IDF row per movie, then the similarity of every movie to every other.
tfidf_matrix = tfidf.fit_transform(content_df['content'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Case-insensitive, whitespace-trimmed title -> row position lookup.
content_df['title_clean'] = content_df['title'].str.strip().str.lower()
title_to_index = pd.Series(content_df.index, index=content_df['title_clean'])
# Titles are not guaranteed to be unique (or present). A repeated key would make
# `title_to_index[key]` return a Series instead of a single position, so keep
# only the first occurrence of each non-missing title.
title_to_index = title_to_index[
    title_to_index.index.notna() & ~title_to_index.index.duplicated(keep='first')
]

def get_content_recommendations(title, top_n=10):
    """Return the movies most similar to `title` by content cosine similarity.

    Args:
        title: Movie title; matched after stripping whitespace and lowercasing
            against the keys of `title_to_index`.
        top_n: Maximum number of recommendations to return. Values <= 0 yield
            an empty result.

    Returns:
        A DataFrame with a single `title` column (fresh 0..k-1 index) ordered
        from most to least similar, or an error-message string when the title
        is not present in `title_to_index`.
    """
    title_clean = title.strip().lower()
    if title_clean not in title_to_index:
        return f"❌ Movie '{title}' not found in dataset."

    idx = title_to_index[title_clean]
    # When several rows share a title the lookup yields a Series of positions;
    # fall back to the first one instead of indexing cosine_sim with a Series.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Stable descending sort keeps equal scores in their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the query movie explicitly rather than assuming it is ranked first:
    # a tie with another movie, or a zero self-similarity from an empty
    # document, would otherwise let it appear in its own recommendations.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:max(top_n, 0)]
    return content_df[['title']].iloc[movie_indices].reset_index(drop=True)

# Free-text box for the movie title, pre-filled with an example.
# continuous_update=False makes the widget sync `value` only once the text is
# committed, instead of on every keystroke, so observers are not invoked for
# each partially typed title.
movie_input = widgets.Text(
    value='The Dark Knight',
    placeholder='Enter a movie title',
    description='Movie:',
    disabled=False,
    continuous_update=False
)

# Output area the recommendations are rendered into.
output = widgets.Output()

def on_submit(change):
    """Render recommendations for the text box's new value into `output`.

    Args:
        change: Change notification whose `.new` attribute holds the title to
            look up.
    """
    # wait=True defers the clear until new content arrives, avoiding flicker.
    output.clear_output(wait=True)
    with output:
        recommendations = get_content_recommendations(change.new)
        if isinstance(recommendations, str):
            # The not-found message is a plain string; display() would show its
            # quoted repr, so print it instead.
            print(recommendations)
        else:
            display(recommendations)

# Call on_submit whenever the text box's `value` trait changes, then show the
# input box followed by the output area.
movie_input.observe(handler=on_submit, names='value')
display(movie_input, output)

Text(value='The Dark Knight', description='Movie:', placeholder='Enter a movie title')

Output()

Collaborative-Based Filtering

In [2]:
# Sanity check: report whether each artifact name is defined in this namespace.
for artifact_name in ("content_df", "tfidf", "cosine_sim"):
    print(f"{artifact_name} exists:", artifact_name in locals())

content_df exists: True
tfidf exists: True
cosine_sim exists: True


In [3]:
import pickle
from google.colab import files

# Pickle each artifact to the working directory and pass the file to Colab's
# download helper, one file at a time and in this order.
artifacts = (
    ("content_df.pkl", content_df),
    ("tfidf_vectorizer.pkl", tfidf),
    ("cosine_similarity_matrix.pkl", cosine_sim),
)
for pkl_name, artifact in artifacts:
    with open(pkl_name, "wb") as f:
        pickle.dump(artifact, f)
    files.download(pkl_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>