In [1]:
import pandas as pd
import numpy as np
import re
import ipywidgets as widgets
from IPython.display import display, clear_output
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


# Cleaning titles for the engine

In [4]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama,State of Siege Temple Attack 2021
86533,288971,Ouija Japan (2021),Action|Horror,Ouija Japan 2021
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary,The Men Who Made the Movies Howard Hawks 1973
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller,Skinford Death Sentence 2023


# Creating tfidf matrix

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer =  TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

# Search function


In [8]:

from sklearn.metrics.pairwise import cosine_similarity
#import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()  #compare the queries and return similarity
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [9]:
#testing
title = "Skinford: Death Sentence"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices].iloc[::-1]
results

Unnamed: 0,movieId,title,genres,clean_title
31395,137974,Death Sentence (1974),Crime|Drama|Mystery|Thriller,Death Sentence 1974
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller,Skinford Death Sentence 2023
84555,282317,Death Sentence (1968),Western,Death Sentence 1968
11793,54787,Death Sentence (2007),Drama|Thriller,Death Sentence 2007
29280,133237,The Last Sentence (2012),Drama,The Last Sentence 2012


# Search Box

In [10]:
#pip install ipywidgets


In [11]:
# Create the input text box and output widget
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_output = widgets.Output()


def on_type(data):
    with movie_output:
        clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_output.clear_output()  
            display(results)


movie_input.observe(on_type, names='value')
display(movie_input, movie_output)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
# Define the movie_id you want to find similar users for
movie_id = 1  # For example, Toy Story

# Find similar users
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)


In [13]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.083155
318,0.424204,0.256839
260,0.385136,0.152899
356,0.357989,0.166611
296,0.345989,0.192767
...,...,...
284297,0.000039,0.000003
282967,0.000039,0.000003
282727,0.000039,0.000007
279054,0.000039,0.000010


In [14]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
81534,0.000118,1e-05,12.02577,273747,Full Time (2022),Drama,Full Time 2022
74453,0.000118,1e-05,12.02577,243402,A Fairly Odd Summer (2014),Children|Comedy|Fantasy,A Fairly Odd Summer 2014
10381,0.000118,1e-05,12.02577,40404,Al otro lado (2004),Drama,Al otro lado 2004
78805,0.000118,1e-05,12.02577,263283,15 Minutes of Shame (2021),Documentary,15 Minutes of Shame 2021
24958,0.000118,1e-05,12.02577,122577,The Devil's Brother (1933),Comedy,The Devils Brother 1933
69860,0.000118,1e-05,12.02577,224743,All In: The Fight for Democracy (2020),Documentary,All In The Fight for Democracy 2020
37583,0.000118,1e-05,12.02577,152175,Ghosts (1997),Horror,Ghosts 1997
41063,0.000118,1e-05,12.02577,159944,The Dresser (2015),Drama,The Dresser 2015
47739,0.000118,1e-05,12.02577,173933,The Son of Bigfoot (2017),Animation|Children,The Son of Bigfoot 2017
13230,0.000118,1e-05,12.02577,67929,Sanshiro Sugata (Judo Saga) (Sugata Sanshirô) ...,Action|Adventure|Drama,Sanshiro Sugata Judo Saga Sugata Sanshir 1943


In [15]:
def find_similar_movies(movie_id):
    similar_movies = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_movies)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_movies)
    return similar_user_recs

In [16]:
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

# Recommendation Engine

In [25]:
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            # Calculate a dummy score based on 'movieId' and 'title'
            results["score"] = results["movieId"] % 100  # Just an example calculation
            display(results[['score', 'movieId', 'title', 'genres']])

movie_input.observe(on_type, names='value')

display(movie_input, recommendation_list)




Text(value='Toy Story', description='Movie Title:')

Output()