<a href="https://colab.research.google.com/github/harsh-154/Movie-Recommendation/blob/main/MovieRecommendationCinematch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under that mount point (Colab-only step).
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Location of the movies table inside the mounted Drive folder.
moviesPath="/content/drive/MyDrive/ml-25m/movies.csv"
# Movie metadata; later cells read its movieId, title and genres columns.
movies = pd.read_csv(moviesPath)

In [None]:
# Preview the first rows to check that the file loaded as expected.
movies.head()

In [None]:
import re

def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space.

    Punctuation, brackets and accented letters are dropped outright (they are
    not replaced by anything), so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [None]:
# Add a normalised copy of each title (see clean_title) for the text search.
# Note: this adds the clean_title column to `movies` in place.
movies["clean_title"] = movies["title"].apply(clean_title)


In [None]:
# Display the frame to confirm the new clean_title column is present.
movies

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned titles, using single words and two-word sequences
# (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on every cleaned title and keep the resulting matrix; search() compares
# query vectors against it and maps the row positions back onto `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is normalised with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if k is out of range.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last k positions hold the k largest
    # values, in no particular order, so rank that small slice explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[top]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query plus an output area that shows the matches.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the match list when the text box value changes.

    ``data`` is the change notification delivered through
    ``movie_input.observe``; its "new" entry is read as the current text.
    Queries of five characters or fewer only clear the output.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))


movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

In [None]:
# Example movie id used by the step-by-step recommendation cells below.
movie_id = 89745

# Row(s) of `movies` whose movieId equals movie_id.
movie = movies[movies["movieId"] == movie_id]

In [None]:
# Location of the ratings table inside the mounted Drive folder.
ratingsPath="/content/drive/MyDrive/ml-25m/ratings.csv"
# User ratings; later cells read its userId, movieId and rating columns.
ratings = pd.read_csv(ratingsPath)

In [None]:
# Inspect the column dtypes pandas inferred for the ratings table.
ratings.dtypes

In [None]:
# import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Step 1: Filter the ratings dataset (USE THIS, not original `ratings`)
# Users with more than 200 ratings.
ratings_per_user = ratings['userId'].value_counts()
active_users = ratings_per_user.loc[ratings_per_user > 200].index

# Movies with more than 500 ratings.
ratings_per_movie = ratings['movieId'].value_counts()
popular_movies = ratings_per_movie.loc[ratings_per_movie > 500].index

# Keep a rating only when both its user and its movie passed the thresholds.
from_active_user = ratings['userId'].isin(active_users)
for_popular_movie = ratings['movieId'].isin(popular_movies)
filtered_ratings = ratings[from_active_user & for_popular_movie]

# Step 2: Build a manageable pivot table
# One row per user, one column per movie; missing ratings become 0.
user_movie_matrix = (
    filtered_ratings
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
print(f"Matrix shape: {user_movie_matrix.shape}")


In [None]:
from sklearn.neighbors import NearestNeighbors

# Use sparse matrix if still large (optional)
from scipy.sparse import csr_matrix
# Sparse (CSR) copy of the user-by-movie rating matrix.
user_movie_sparse = csr_matrix(user_movie_matrix.values)

# Fit KNN: brute-force neighbour search over users with cosine distance.
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_movie_sparse)

# Tunables for this cell.
N_NEIGHBORS = 5          # similar users the recommendations are based on
N_RECOMMENDATIONS = 10   # rows shown in the final table

# Choose a user
target_user_index = 0  # positional row in user_movie_matrix (first user)
target_user_id = user_movie_matrix.index[target_user_index]

# Find similar users. Query with the same sparse row the model was fitted on,
# and request one extra neighbour because the target user is part of the
# fitted data and normally comes back as their own nearest neighbour.
distances, indices = knn.kneighbors(
    user_movie_sparse[target_user_index], n_neighbors=N_NEIGHBORS + 1
)

# Drop the target by id rather than by position: when distances tie, the
# target is not guaranteed to be the first hit.
neighbor_ids = user_movie_matrix.index[indices.flatten()]
similar_users = neighbor_ids[neighbor_ids != target_user_id][:N_NEIGHBORS]

# Recommend based on similar users: take their ratings and discard every
# movie the target user has already rated.
similar_users_ratings = filtered_ratings[filtered_ratings['userId'].isin(similar_users)]
rated_by_target = filtered_ratings[filtered_ratings['userId'] == target_user_id]['movieId'].tolist()

unseen_movies = similar_users_ratings[~similar_users_ratings['movieId'].isin(rated_by_target)]

# Rank unseen movies by their mean rating among the similar users and attach
# the titles.
recommendations = (unseen_movies.groupby('movieId')['rating']
                   .mean()
                   .sort_values(ascending=False)
                   .head(N_RECOMMENDATIONS)
                   .reset_index()
                   .merge(movies, on='movieId')[['title', 'rating']])

print(f"Top {N_RECOMMENDATIONS} movie recommendations:")
# Leave the frame as the cell's last expression so it is rendered as a table.
recommendations


In [None]:
# Users who rated the chosen movie (movie_id) above 4.
# Note: this rebinds similar_users, which the KNN cell above also assigns.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [None]:
# movieId of every rating above 4 given by those users (one entry per rating).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [None]:
# Share of the similar users who rated each movie above 4.
# Note: this rebinds similar_user_recs, so re-running this cell on its own
# would apply value_counts to the shares instead of to the movie ids.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [None]:
# Ratings above 4, from any user, for the shortlisted movies.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [None]:
# Per shortlisted movie: number of above-4 ratings divided by the number of
# distinct users appearing in all_users.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:
# Put the two shares side by side (aligned on the movie-id index) and give
# the columns descriptive names.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [None]:
# Inspect the combined table.
rec_percentages

In [None]:
# Ratio of the two shares: above 1 means the movie is liked more often among
# the similar users than among all users in the comparison set.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [None]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [None]:
# Attach the movie metadata to the ten highest-scoring movies.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that are liked by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Each candidate movie is scored as the share of similar
    users who liked it divided by the share of all comparison users who
    liked it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        A candidate must be liked by more than this fraction of the similar
        users to be considered.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user liked ``movie_id``.
    """
    # Filter to liked ratings once; every step below works on this subset.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the given movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base recommendations on; also avoids dividing by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of those users who liked each movie, keeping the common ones.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of everyone who liked any candidate movie that liked each one.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Second search box: typing a title shows recommendations based on the first
# row returned by search(), rather than the raw search hits.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


# Note: this rebinds the module-level name on_type, which the first search
# box's cell also defines.
def on_type(data):
    """Refresh the recommendations when the text box value changes.

    ``data`` is the change notification delivered through
    ``movie_name_input.observe``; its "new" entry is read as the current
    text. Queries of five characters or fewer only clear the output.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)