In [3]:
import pandas as pd
import numpy as np
import re
import os

print(os.getcwd()) #current os directory so we know where to read files from

c:\Users\admin\Desktop\Movie\notebooks


In [4]:
movies = pd.read_csv('../data/movies.csv') # to read movies csv file from notebooks directory

In [5]:
def clean_title(title):
    """Return `title` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped outright
    (not replaced by a space), so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    unwanted_chars = r'[^a-zA-Z0-9 ]'
    cleaned = re.sub(unwanted_chars, '', title)
    return cleaned
    

In [6]:
movies["clean_title"] = movies["title"].apply(clean_title) #create a new column with cleaned titles

In [7]:
movies.head() #display top 5 rows of the dataframe

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [8]:
# SK LEARN PART -------------------------------------------------------------- 

In [9]:
# TF-IDF (Term Frequency - Inverse Document Frequency): a way to convert text data into numbers so that the computer can work with it.
# Words that appear often in one title but not in many other titles get higher weights.
# Common words (e.g. "the") get lower importance.

from sklearn.feature_extraction.text import TfidfVectorizer # this import converts text into numeric form

vectorizer = TfidfVectorizer(ngram_range=(1,2)) # ngram_range=(1,2) tells the vectorizer to use both single words and pairs of words as features; the vectorizer is an object with several methods

tfidf = vectorizer.fit_transform(movies['clean_title']) # learns the vocabulary of all movie titles and converts them into numeric TF-IDF values

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title, n_candidates=10, min_ratings=50, top_n=5):
    """Find the movies whose titles best match a free-text query.

    The query is cleaned with `clean_title`, projected into the fitted TF-IDF
    space and compared against every title by cosine similarity. The closest
    `n_candidates` titles are annotated with the number of ratings they have,
    thinly rated titles are dropped, and the survivors are ranked by
    similarity and then by rating count.

    Relies on the notebook-level names `vectorizer`, `tfidf`, `movies` and
    `ratings`; all four must be defined before the first call.

    Parameters
    ----------
    title : str
        Free-text query, e.g. "Toy Story".
    n_candidates : int, default 10
        How many of the closest titles to consider before filtering.
    min_ratings : int, default 50
        Candidates with fewer ratings than this are discarded.
    top_n : int, default 5
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies` (fresh 0..k-1 index) with two extra
        columns, `rating_count` and `similarity`. Empty when no title shares
        a term with the query or no candidate has enough ratings.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises when asked for more elements than exist, so cap the
    # candidate count at the number of titles.
    k = min(n_candidates, similarity.size)
    if k <= 0:
        indices = np.array([], dtype=int)
    else:
        indices = np.argpartition(similarity, -k)[-k:]

    # A similarity of 0 means the query shares no term with the title; without
    # this filter an unmatched query would return arbitrary movies.
    indices = indices[similarity[indices] > 0]

    results = movies.iloc[indices].reset_index(drop=True)

    # Count ratings only for the candidates instead of grouping the whole
    # ratings table on every call.
    candidate_ratings = ratings.loc[ratings['movieId'].isin(results['movieId']), 'movieId']
    rating_counts = candidate_ratings.value_counts()
    results['rating_count'] = results['movieId'].map(rating_counts).fillna(0)

    # Rows are still in `indices` order here, so the scores line up one-to-one.
    results['similarity'] = similarity[indices]

    # Filter out movies with very few ratings
    results = results[results['rating_count'] >= min_ratings]

    # Sort by similarity first, then rating count
    results = results.sort_values(['similarity', 'rating_count'], ascending=[False, False])

    return results.head(top_n)

In [11]:
# Interactive Widget --------------------------------------------------------------

import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False

)

movie_list = widgets.Output()

def on_type(data):
    """Re-run the title search each time the text box value changes.

    `data` is the change payload handed to the observer; only its 'new' entry
    (the text currently in the box) is read. Queries of three characters or
    fewer just clear the output area.
    """
    with movie_list:
        movie_list.clear_output()  # wipe the results from the previous keystroke
        query = data['new']
        if len(query) <= 3:
            return  # too short to search on
        display(search(query))

movie_input.observe(on_type, names='value') # Whenever the movie_input widget's value changes, on_type is called

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
#Rating Widget --------------------------------------------------------------

# NOTE: search() (defined in an earlier cell) reads `ratings`, so this cell must
# have been run before anything is typed into the search widget.
ratings = pd.read_csv('../data/ratings.csv') # read ratings csv file from notebooks directory

In [13]:
movie_id = 1  # example movie to explore (the movies.head() output shows movieId 1 is Toy Story (1995))

# Users who rated this movie 5 or higher. Note that find_similar_movies() below
# uses a different threshold (rating > 4).
similar_users = ratings[(ratings['movieId'] == movie_id)  & (ratings['rating'] >= 5)]["userId"].unique()
similar_users # user ids of the people who liked the same movie as us

array([    36,     75,     86, ..., 162518, 162519, 162530],
      shape=(13506,))

In [14]:
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]
# Keep only ratings made by the similar users, and only those strictly above 4 (a rating of exactly 4 is excluded)

In [15]:
similar_user_recs

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998388,162530,3706,5.0,989809041
24998389,162530,3735,5.0,989808150
24998391,162530,3763,5.0,989809659
24998392,162530,4187,5.0,989809274


In [16]:
# Keep only the movies that more than 10 percent of the similar users rated above 4.
# NOTE: the next line rebinds `similar_user_recs` from the filtered rating rows to a
# Series of per-movie shares, so this cell cannot simply be run a second time;
# re-run the cell that builds `similar_user_recs` first.
similar_user_recs = similar_user_recs['movieId'].value_counts() / len(similar_users)
popular_movies = similar_user_recs[similar_user_recs > 0.1]
similar_user_recs

movieId
1         1.000000
318       0.414556
260       0.404561
356       0.347253
296       0.342663
            ...   
4152      0.000074
26985     0.000074
58870     0.000074
176349    0.000074
178311    0.000074
Name: count, Length: 16797, dtype: float64

In [17]:
popular_movies # Movies rated above 4 by more than 10 percent of the similar users

movieId
1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: count, Length: 92, dtype: float64

In [18]:
all_users = ratings[(ratings["movieId"].isin(popular_movies.index)) & (ratings['rating'] > 4)]
#all users that have rated the popular movies highly

In [19]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000055,162541,4973,4.5,1240950790
25000057,162541,4993,5.0,1240952610
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


In [20]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
#finding the percentage of all users who rated each movie highly

In [21]:
all_users_recs

movieId
318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: count, Length: 92, dtype: float64

In [22]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1, join= 'inner')
rec_percentages.columns = ["similar", "all"]

In [23]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [24]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]


In [25]:
rec_percentages = rec_percentages.sort_values("score", ascending= False)

In [26]:
rec_percentages #ratio of similar users who liked the movie to all users who liked the movie

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [27]:
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


In [28]:
#FINAL Recommendation Function
def find_similar_movies(movie_id, min_rating=4, min_share=0.05, top_n=10):
    """Recommend movies favoured by the users who liked `movie_id`.

    "Liked" means a rating strictly greater than `min_rating`, so with the
    default a rating of exactly 4 does not count. For every movie the function
    compares the share of similar users who liked it with the share among all
    users who liked at least one of the candidate movies; the ratio of the two
    is the score. A score above 1 means the movie is liked more often by the
    similar users than by that wider group.

    The queried movie itself is not excluded, so it normally appears in the
    result (every similar user liked it).

    Relies on the notebook-level DataFrames `ratings` and `movies`.

    Parameters
    ----------
    movie_id : int
        `movieId` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.05
        A movie becomes a candidate only if more than this fraction of the
        similar users liked it.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns `score`, `title` and `genres`, highest score first. Empty when
        no user rated `movie_id` above `min_rating`.
    """
    # Apply the rating threshold once; every step below works on liked ratings only.
    liked = ratings[ratings['rating'] > min_rating]

    similar_users = liked.loc[liked['movieId'] == movie_id, 'userId'].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: there is nothing to base recommendations on.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of the similar users who liked each movie.
    similar_user_recs = (
        liked.loc[liked['userId'].isin(similar_users), 'movieId'].value_counts()
        / len(similar_users)
    )
    popular_movies = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (among those who liked any candidate) who liked each candidate.
    all_users = liked[liked['movieId'].isin(popular_movies.index)]
    all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    # The inner join keeps only the candidate movies, since all_users_recs contains nothing else.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1, join='inner')
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [29]:
#Interactive Widget for Final Recommendation Function
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

recommendation_list = widgets.Output()

# Callback for the recommendation widget. It reuses the name `on_type` from the
# search widget cell; the earlier callback was registered before this
# definition replaced the name.
def on_type(data):
    """Show recommendations for the best title match of the typed text.

    `data` is the change payload handed to the observer; only its 'new' entry
    (the text currently in the box) is read. Text of five characters or fewer
    only clears the output. Otherwise the top row returned by `search` is
    passed to `find_similar_movies`, and a message is printed when the search
    returns no rows.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            results = search(title)
            print(f"Search results for '{title}':")

            if len(results) > 0:
                # Read from the column rather than from the row: a row mixes
                # strings and numbers, whereas the column keeps its own dtype.
                movie_id = results["movieId"].iloc[0]
                print(f"\nUsing movieId: {movie_id}")
                recs = find_similar_movies(movie_id)
                display(recs)
            else:
                # Previously the header above was printed and nothing followed.
                print("No matching movie with enough ratings was found.")

movie_input_name.observe(on_type, names='value') # Whenever the movie_input_name widget's value changes, on_type is called

In [30]:
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()