In [83]:
!pip install scikit-surprise

import pandas as pd

movies = pd.read_csv(r"movies.csv")

Access is denied.


In [62]:
# Preview the loaded movie catalogue.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [63]:
import re 

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as the parentheses around a release year is dropped,
    so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [64]:
# Store a punctuation-free copy of every title alongside the original.
movies["clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Index single words and adjacent word pairs (unigrams and bigrams).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix,
# one row per entry of movies["clean_title"].
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [66]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the ``top_n`` movies whose titles best match ``title``, best first.

    Parameters
    ----------
    title : str
        Free-text query; it is normalised with ``clean_title`` before lookup.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered by decreasing cosine similarity between
        the query vector and the rows of ``tfidf``.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist (argpartition raises otherwise).
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # largest scores, in arbitrary order; sort those few explicitly so the
    # best match really comes first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [None]:
from sklearn.neighbors import NearestNeighbors
import joblib
from scipy.sparse import csr_matrix

# Ensure the TF-IDF matrix is in CSR layout before indexing it.
tfidf_sparse = csr_matrix(tfidf)

# BallTree supports neither the cosine metric nor sparse input, so
# algorithm='ball_tree' makes this raise; brute-force search is the
# backend that handles cosine distance on a sparse matrix.
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_model.fit(tfidf_sparse)

def search_optimized(title):
    """Return the movies nearest to ``title`` according to ``nn_model``, best first.

    The query is normalised with ``clean_title`` and vectorised with the
    fitted ``vectorizer``; the number of rows returned is the
    ``n_neighbors`` the model was built with.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from the closest match to the farthest.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    # kneighbors returns neighbours ordered by increasing distance, i.e. the
    # closest match is already first, so the rows must not be reversed.
    _, indices = nn_model.kneighbors(query_vec)
    return movies.iloc[indices.flatten()]

In [None]:
# Write the fitted neighbour model to disk, then load it back from the same file.
joblib.dump(value=nn_model, filename='nn_model.pkl')
nn_model = joblib.load(filename='nn_model.pkl')

In [67]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types the movie title into.
movie_input = widgets.Text(
    value="",
    description="Movie Title:",
    disabled=False
)

# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed title.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry is the current text. The previous results are always
    cleared, and nothing new is displayed until the text is longer than
    five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Render the input box together with the results area.
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [68]:
# User ratings, read from the notebook's working directory.
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [None]:
tags = pd.read_csv("tags.csv")

# Build one space-separated tag string per movie and attach it to the
# catalogue. Rows with a missing tag are dropped and every tag is cast to
# str first, because ' '.join raises TypeError on NaN / non-string values.
# The left merge keeps every movie, including those without any tag.
movies_with_tags = movies.merge(
    tags.dropna(subset=['tag'])
        .astype({'tag': str})
        .groupby('movieId')['tag']
        .apply(' '.join)
        .reset_index(),
    on='movieId',
    how='left',
)
# Movies without tags get an empty string so the vectorizer only sees text.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on the column selection, which is deprecated chained
# assignment in recent pandas versions.
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

tag_vectorizer = TfidfVectorizer()
tag_tfidf = tag_vectorizer.fit_transform(movies_with_tags['tag'])

def search_with_tags(title, top_n=5):
    """Return the ``top_n`` movies that best match ``title`` by title and tags, best first.

    The query is scored twice with cosine similarity: against the title
    TF-IDF matrix (``tfidf``) and against the tag TF-IDF matrix
    (``tag_tfidf``). The two scores are averaged per movie.

    Parameters
    ----------
    title : str
        Free-text query; it is normalised with ``clean_title`` first.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered by decreasing combined score.
    """
    title_clean = clean_title(title)

    # Each query vector must be compared with the matrix produced by the
    # same vectorizer: the title and tag vocabularies differ, so mixing them
    # fails with a feature-dimension mismatch.
    title_sim = cosine_similarity(vectorizer.transform([title_clean]), tfidf).flatten()
    tag_sim = cosine_similarity(tag_vectorizer.transform([title_clean]), tag_tfidf).flatten()

    # NOTE(review): assumes movies_with_tags has the same rows, in the same
    # order, as movies (left merge on a unique movieId) - confirm.
    combined_scores = (title_sim + tag_sim) / 2

    # Never ask for more rows than exist (argpartition raises otherwise).
    top_n = min(top_n, combined_scores.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition leaves the top_n candidates unordered; rank them explicitly.
    candidates = np.argpartition(combined_scores, -top_n)[-top_n:]
    ranked = candidates[np.argsort(combined_scores[candidates])[::-1]]
    return movies.iloc[ranked]

In [69]:
# Inspect the column dtypes of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [70]:
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who gave ``movie_id`` a top rating.

    Steps:
      1. Collect the users who rated ``movie_id`` 5 or higher.
      2. For every movie, compute the share of those users who rated it
         above 4, and keep the movies where that share exceeds 10%.
      3. For the kept movies, compute the same share among all users who
         rated any of them above 4.
      4. Score each movie as the ratio of the two shares.

    Returns a DataFrame with the ``score``, ``title`` and ``genres`` of the
    ten highest-scoring movies.
    """
    rated_highly = ratings["rating"] > 4

    # Users who gave the seed movie a top rating.
    is_fan = (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)
    similar_users = ratings.loc[is_fan, "userId"].unique()

    # Share of those users that rated each movie highly.
    fan_likes = ratings.loc[ratings["userId"].isin(similar_users) & rated_highly, "movieId"]
    similar_users_recs = fan_likes.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > .1]

    # Same share, measured over everyone who rated any candidate highly.
    all_users = ratings[ratings["movieId"].isin(similar_users_recs.index) & rated_highly]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].unique().size

    rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    top_ten = rec_percentages.sort_values("score", ascending=False).head(10)
    with_titles = top_ten.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]
    

In [None]:
from surprise import Dataset, Reader, SVD

# Tell Surprise the lowest and highest possible rating value.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Train on every available rating (no hold-out split).
trainset = data.build_full_trainset()

# SVD initialises its factors randomly; fixing random_state makes the fitted
# model, and therefore the recommendations, reproducible across re-runs.
algo = SVD(random_state=42)
algo.fit(trainset)

def find_similar_movies_hybrid(movie_id, user_id=None):
    """Return up to ten recommended titles for ``movie_id``, best first.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    user_id : int, optional
        When given, the primary recommendations are the movies with the
        highest rating predicted by ``algo`` for this user; otherwise they
        come from ``find_similar_movies(movie_id)``.

    Returns
    -------
    list of str
        Primary recommendations followed by up to five titles that share a
        genre with the seed movie, de-duplicated and capped at ten.

    Raises
    ------
    IndexError
        If ``movie_id`` is not present in ``movies``.
    """
    # Validate the seed movie up front so an unknown id fails before the
    # expensive recommendation step instead of after it.
    seed = movies[movies['movieId'] == movie_id]
    if seed.empty:
        raise IndexError(f"movieId {movie_id!r} not found in movies")

    # Compare against None explicitly so a falsy id such as 0 is still
    # treated as a real user.
    if user_id is not None:
        predictions = [algo.predict(user_id, mid) for mid in movies['movieId']]
        top_preds = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:10]
        recs = [movies[movies['movieId'] == pred.iid]['title'].values[0] for pred in top_preds]
    else:
        recs = find_similar_movies(movie_id)['title'].tolist()

    # Genres are matched as literal text: escape them so characters such as
    # the parentheses in "(no genres listed)" are not read as regex syntax.
    movie_genres = seed['genres'].values[0].split('|')
    genre_pattern = '|'.join(re.escape(genre) for genre in movie_genres)
    genre_recs = movies[movies['genres'].str.contains(genre_pattern, na=False)]['title'].head(5).tolist()

    # dict.fromkeys de-duplicates while keeping the ranking order; a set
    # would put the titles in arbitrary order before the cut to ten.
    return list(dict.fromkeys(recs + genre_recs))[:10]

In [None]:
from surprise.model_selection import cross_validate

# Cross-validate a fresh SVD instance. cross_validate fits the estimator it
# is handed on each training fold, so passing `algo` would overwrite the
# model fitted on the full trainset with one trained on only part of the data.
results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=3, verbose = True)
print(f"Average RMSE: {results['test_rmse'].mean()}") 

In [None]:
# Free-text box for the title the user wants recommendations for.
movie_name_input = widgets.Text(value="", description="Movie Title:", disabled=False)

# Genre filter: "All" followed by every distinct label found in the
# pipe-separated `genres` column.
genre_dropdown = widgets.Dropdown(
    options=['All', *movies['genres'].str.split('|').explode().unique()],
    value='All',
    description='Genre:',
)

# Area the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type_with_filter(data):
    """Refresh ``recommendation_list`` for the current title and genre filter.

    Registered as the observer of both ``movie_name_input`` and
    ``genre_dropdown``. ``data`` is the change notification; it is not used
    to read the title because, when the dropdown fires, its ``"new"`` entry
    holds the selected genre rather than the typed text.

    Nothing is displayed until the title is longer than five characters or
    when no search result survives the genre filter.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        # Read both controls directly so the handler behaves the same no
        # matter which widget triggered it.
        title = movie_name_input.value
        genre = genre_dropdown.value
        if len(title) <= 5:
            return

        results = search_optimized(title)
        if genre != 'All':
            # Literal match: labels such as "(no genres listed)" contain
            # regex metacharacters.
            results = results[results['genres'].str.contains(genre, na=False, regex=False)]
        if len(results) > 0:
            # Seed the recommendations with the first remaining search hit.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies_hybrid(movie_id))
            
# Recompute the recommendations when either the title or the genre changes.
movie_name_input.observe(on_type_with_filter, names="value")
genre_dropdown.observe(on_type_with_filter, names="value")
# Render both controls together with the recommendations area.
display(movie_name_input, genre_dropdown, recommendation_list)

Text(value='', description='Movie Title:')

Output()