In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import pickle

In [2]:
movies = pd.read_csv('movie_data_with_urls.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres,Poster URL
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://image.tmdb.org/t/p/original/qJU6rfil5x...
4,5,Father of the Bride Part II (1995),Comedy,https://image.tmdb.org/t/p/original/rj4LBtwQ0u...


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
movies.isna().sum()

movieId       0
title         0
genres        0
Poster URL    0
dtype: int64

In [6]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Data Preprocessing

In [7]:
# covert movie genres to list 
movList = list(movies['genres'])
sp_list = []

for mov in movList:
    sp_list.append(mov.split('|'))

movies['genres'] = sp_list
movies

Unnamed: 0,movieId,title,genres,Poster URL
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",https://image.tmdb.org/t/p/original/qJU6rfil5x...
4,5,Father of the Bride Part II (1995),[Comedy],https://image.tmdb.org/t/p/original/rj4LBtwQ0u...
...,...,...,...,...
9747,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",https://image.tmdb.org/t/p/original/4jU2Bdk1MB...
9748,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",https://image.tmdb.org/t/p/original/cCBB6BGRj5...
9749,193585,Flint (2017),[Drama],https://image.tmdb.org/t/p/original/iPzkjNWpK7...
9750,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",https://image.tmdb.org/t/p/original/vy8dW9kDuX...


In [8]:
# converting movie title and genres to lowercase

movies['genres'] = movies['genres'].apply(lambda genre_list: [genre.lower() for genre in genre_list])
movies['title'] = movies['title'].str.lower()
movies

Unnamed: 0,movieId,title,genres,Poster URL
0,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,2,jumanji (1995),"[adventure, children, fantasy]",https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,3,grumpier old men (1995),"[comedy, romance]",https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,4,waiting to exhale (1995),"[comedy, drama, romance]",https://image.tmdb.org/t/p/original/qJU6rfil5x...
4,5,father of the bride part ii (1995),[comedy],https://image.tmdb.org/t/p/original/rj4LBtwQ0u...
...,...,...,...,...
9747,193581,black butler: book of the atlantic (2017),"[action, animation, comedy, fantasy]",https://image.tmdb.org/t/p/original/4jU2Bdk1MB...
9748,193583,no game no life: zero (2017),"[animation, comedy, fantasy]",https://image.tmdb.org/t/p/original/cCBB6BGRj5...
9749,193585,flint (2017),[drama],https://image.tmdb.org/t/p/original/iPzkjNWpK7...
9750,193587,bungo stray dogs: dead apple (2018),"[action, animation]",https://image.tmdb.org/t/p/original/vy8dW9kDuX...


In [9]:
display(movies.head(), ratings.head())

Unnamed: 0,movieId,title,genres,Poster URL
0,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,2,jumanji (1995),"[adventure, children, fantasy]",https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,3,grumpier old men (1995),"[comedy, romance]",https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,4,waiting to exhale (1995),"[comedy, drama, romance]",https://image.tmdb.org/t/p/original/qJU6rfil5x...
4,5,father of the bride part ii (1995),[comedy],https://image.tmdb.org/t/p/original/rj4LBtwQ0u...


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [10]:
# merging the movies and ratings datasets and remove the timestamp column.
mov_ratings = movies.merge(ratings, how='inner', on='movieId')
mov_ratings.drop('timestamp', axis=1, inplace=True)
mov_ratings.head()

Unnamed: 0,movieId,title,genres,Poster URL,userId,rating
0,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,7,3.0
1,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,9,4.0
2,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,13,5.0
3,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,15,2.0
4,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,19,3.0


### Get Top Movies

In [11]:
# Creates a new DataFrame by removing the userId, movieId, and genres columns from the mov_ratings
df_getTopMov = mov_ratings.drop(['userId', 'movieId', 'genres'], axis=1)
# Removes any duplicate rows from the df_getTopMov DataFrame.
df_getTopMov.drop_duplicates(inplace=True)
df_getTopMov

Unnamed: 0,title,Poster URL,rating
0,toy story (1995),https://image.tmdb.org/t/p/original/uXDfjJbdP4...,3.0
1,toy story (1995),https://image.tmdb.org/t/p/original/uXDfjJbdP4...,4.0
2,toy story (1995),https://image.tmdb.org/t/p/original/uXDfjJbdP4...,5.0
3,toy story (1995),https://image.tmdb.org/t/p/original/uXDfjJbdP4...,2.0
5,toy story (1995),https://image.tmdb.org/t/p/original/uXDfjJbdP4...,3.5
...,...,...,...
97339,mike & dave need wedding dates (2016),https://image.tmdb.org/t/p/original/rK0UwpiE3P...,4.0
97340,piper (2016),https://image.tmdb.org/t/p/original/rfEkkVzmrM...,4.0
97341,kingsglaive: final fantasy xv (2016),https://image.tmdb.org/t/p/original/xGMgspge6u...,3.0
97342,body (2015),https://image.tmdb.org/t/p/original/dw0xE56whf...,1.0


In [12]:
# calculates the number of ratings each movie has received and stores the result in a new DataFrame
num_rating_df = df_getTopMov.groupby('title').count()['rating'].reset_index()
num_rating_df

Unnamed: 0,title,rating
0,"'burbs, the (1989)",7
1,'hellboy': the seeds of creation (2004),1
2,'night mother (1986),1
3,'round midnight (1986),2
4,'salem's lot (2004),1
...,...,...
7057,zootopia (2016),5
7058,zulu (1964),1
7059,zulu (2013),1
7060,¡three amigos! (1986),8


In [13]:
# calculates the average rating for each movie
avg_rating_df = df_getTopMov.groupby('title')['rating'].mean().reset_index()
# renames the result to avg_rating
avg_rating_df.rename(columns={'rating':'avg_rating'},inplace=True)
avg_rating_df

Unnamed: 0,title,avg_rating
0,"'burbs, the (1989)",3.0000
1,'hellboy': the seeds of creation (2004),2.0000
2,'night mother (1986),5.0000
3,'round midnight (1986),2.2500
4,'salem's lot (2004),3.5000
...,...,...
7057,zootopia (2016),3.6000
7058,zulu (1964),4.0000
7059,zulu (2013),1.5000
7060,¡three amigos! (1986),3.0625


In [14]:
# merge num_rating_df and avg_rating_df
popular_df = num_rating_df.merge(avg_rating_df,on='title')
popular_df

Unnamed: 0,title,rating,avg_rating
0,"'burbs, the (1989)",7,3.0000
1,'hellboy': the seeds of creation (2004),1,2.0000
2,'night mother (1986),1,5.0000
3,'round midnight (1986),2,2.2500
4,'salem's lot (2004),1,3.5000
...,...,...,...
7057,zootopia (2016),5,3.6000
7058,zulu (1964),1,4.0000
7059,zulu (2013),1,1.5000
7060,¡three amigos! (1986),8,3.0625


In [15]:
# filters and sorts the popular_df DataFrame based on the number of ratings and average ratings
popular_df = popular_df[popular_df['rating'] > 0].sort_values('avg_rating', ascending=False)
popular_df.size

21186

In [16]:
# merges the popular_df DataFrame with the movies DataFrame to include the poster URLs for each movie
popular_df = popular_df.merge(movies,on='title').drop_duplicates('title')[['title', 'Poster URL', 'avg_rating']]
popular_df

Unnamed: 0,title,Poster URL,avg_rating
0,and the ship sails on (e la nave va) (1983),https://image.tmdb.org/t/p/original/kAJEaXgZ5s...,5.0
1,"trip, the (2002)",https://image.tmdb.org/t/p/original/h8Pfn0FFCB...,5.0
2,something borrowed (2011),https://image.tmdb.org/t/p/original/ePBBKnOkxs...,5.0
3,maelström (2000),https://image.tmdb.org/t/p/original/knhWhSDOEP...,5.0
4,dr. jekyll and mr. hyde (1941),https://image.tmdb.org/t/p/original/mZtQORzQWk...,5.0
...,...,...,...
7069,"angel at my table, an (1990)",https://image.tmdb.org/t/p/original/sDICHxQRDk...,0.5
7070,daddy day camp (2007),https://image.tmdb.org/t/p/original/fhuhSVb3j8...,0.5
7071,arthur 2: on the rocks (1988),https://image.tmdb.org/t/p/original/oHrdTClgUy...,0.5
7072,pokemon 4 ever (a.k.a. pokémon 4: the movie) (...,Not found,0.5


In [17]:
# saves the popular_df DataFrame to a file using the pickle module
with open('../PKL_Files/popular_movies_df', 'wb') as file:
    pickle.dump(popular_df, file)

In [18]:
popular_df.head()

Unnamed: 0,title,Poster URL,avg_rating
0,and the ship sails on (e la nave va) (1983),https://image.tmdb.org/t/p/original/kAJEaXgZ5s...,5.0
1,"trip, the (2002)",https://image.tmdb.org/t/p/original/h8Pfn0FFCB...,5.0
2,something borrowed (2011),https://image.tmdb.org/t/p/original/ePBBKnOkxs...,5.0
3,maelström (2000),https://image.tmdb.org/t/p/original/knhWhSDOEP...,5.0
4,dr. jekyll and mr. hyde (1941),https://image.tmdb.org/t/p/original/mZtQORzQWk...,5.0


In [19]:
popular_year_df = popular_df.copy()

In [20]:
# Function to extract year from title
def extract_year(title):
    return title.split('(')[-1].replace(')', '')

# Add a 'year' column to the DataFrame
popular_year_df['year'] = popular_year_df['title'].apply(extract_year)

In [21]:
popular_year_df.head()

Unnamed: 0,title,Poster URL,avg_rating,year
0,and the ship sails on (e la nave va) (1983),https://image.tmdb.org/t/p/original/kAJEaXgZ5s...,5.0,1983
1,"trip, the (2002)",https://image.tmdb.org/t/p/original/h8Pfn0FFCB...,5.0,2002
2,something borrowed (2011),https://image.tmdb.org/t/p/original/ePBBKnOkxs...,5.0,2011
3,maelström (2000),https://image.tmdb.org/t/p/original/knhWhSDOEP...,5.0,2000
4,dr. jekyll and mr. hyde (1941),https://image.tmdb.org/t/p/original/mZtQORzQWk...,5.0,1941


In [22]:
# saves the popular_df by year DataFrame to a file using the pickle module
with open('../PKL_Files/popular_movies_year_df', 'wb') as file:
    pickle.dump(popular_year_df, file)

### Content Based Filtering

In [23]:
mov_ratings[mov_ratings['title'] == 'toy story (1995)']

Unnamed: 0,movieId,title,genres,Poster URL,userId,rating
0,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,7,3.0
1,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,9,4.0
2,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,13,5.0
3,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,15,2.0
4,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,19,3.0
...,...,...,...,...,...,...
242,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,660,2.5
243,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,663,4.0
244,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,664,3.5
245,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,670,4.0


In [24]:
# creating new df
new_df = mov_ratings[['movieId', 'title', 'genres', 'Poster URL']]
new_df

Unnamed: 0,movieId,title,genres,Poster URL
0,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
2,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
3,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
4,1,toy story (1995),"[adventure, animation, children, comedy, fantasy]",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
...,...,...,...,...
97339,160567,mike & dave need wedding dates (2016),[comedy],https://image.tmdb.org/t/p/original/rK0UwpiE3P...
97340,160718,piper (2016),[animation],https://image.tmdb.org/t/p/original/rfEkkVzmrM...
97341,161594,kingsglaive: final fantasy xv (2016),"[action, adventure, animation, drama, fantasy,...",https://image.tmdb.org/t/p/original/xGMgspge6u...
97342,161830,body (2015),"[drama, horror, thriller]",https://image.tmdb.org/t/p/original/dw0xE56whf...


In [25]:
new_df['genres'] = new_df['genres'].apply(lambda x: ' '.join(x))
new_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['genres'] = new_df['genres'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.drop_duplicates(inplace=True)


In [26]:
new_df.shape

(7063, 4)

In [27]:
new_df = new_df.reset_index()
new_df.head()

Unnamed: 0,index,movieId,title,genres,Poster URL
0,0,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,247,2,jumanji (1995),adventure children fantasy,https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,354,3,grumpier old men (1995),comedy romance,https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,413,4,waiting to exhale (1995),comedy drama romance,https://image.tmdb.org/t/p/original/qJU6rfil5x...
4,426,5,father of the bride part ii (1995),comedy,https://image.tmdb.org/t/p/original/rj4LBtwQ0u...


In [28]:
# To transform given text into a vector on the basis of frequency count
from sklearn.feature_extraction.text import CountVectorizer # convert a collection of text documents into a matrix of token counts
cv = CountVectorizer(max_features=23, stop_words='english')

In [29]:
vectors = cv.fit_transform(new_df['genres']).toarray()
vectors

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
# each column represents a unique word and 7279 are the no of documents.
# this represents the count of every tokenized genre word in each document 
vectors.shape

(7063, 23)

In [31]:
from nltk.stem.porter import PorterStemmer #stemming words in text data
pt = PorterStemmer()

In [32]:
# function to stem all the words in the genres column
def stem(text):
    y = []
    for i in text.split():
        y.append(pt.stem(i))
    
    return " ".join(y)

In [33]:
new_df['genres'] = new_df['genres'].apply(stem)
new_df['genres'][0]

'adventur anim children comedi fantasi'

In [34]:
# Save the array to a pickle file
with open('../PKL_Files/stemmed_df_content_based', 'wb') as file:
    pickle.dump(new_df, file)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [36]:
# Save the array to a pickle file
with open('../PKL_Files/similarity_content_based', 'wb') as file:
    pickle.dump(similarity, file)

In [37]:
def content_based_recommand(movie):
    # load files
    # Load the array from the pickle file
    with open('../PKL_Files/stemmed_df_content_based', 'rb') as file:
        new_df = pickle.load(file)

        # Load the array from the pickle file
    with open('../PKL_Files/similarity_content_based', 'rb') as file:
        similarity = pickle.load(file)

    mov_list = []
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movie_list:
        d = dict()
        d['title'] = new_df.iloc[i[0]].title
        d['url'] = new_df.iloc[i[0]]['Poster URL']
        mov_list.append(d)
    
    return mov_list

In [38]:
recommended_mov_list = content_based_recommand('toy story (1995)')
for m in recommended_mov_list:
    print(m)

{'title': 'antz (1998)', 'url': 'https://image.tmdb.org/t/p/original/lWPjxbUMpAHFkJpZHHNWhQaRsax.jpg'}
{'title': 'toy story 2 (1999)', 'url': 'https://image.tmdb.org/t/p/original/2MFIhZAW0CVlEQrFyqwa4U6zqJP.jpg'}
{'title': 'adventures of rocky and bullwinkle, the (2000)', 'url': 'https://image.tmdb.org/t/p/original/xCFSsftt2rglC81I6QLWcZSTCBM.jpg'}
{'title': "emperor's new groove, the (2000)", 'url': 'https://image.tmdb.org/t/p/original/wwbgkXQBEKtnyIJapk6gUgWkVw8.jpg'}
{'title': 'monsters, inc. (2001)', 'url': 'https://image.tmdb.org/t/p/original/wFSpyMsp7H0ttERbxY7Trlv8xry.jpg'}


In [84]:



def find_actual_relevant_movies_by_genre(movie_name, df):
    
    # Get the genres of the specified movie
    movie_genres = df[df['title'] == movie_name]['genres'].values[0]
    
    # Find movies that share at least one genre with the specified movie
    relevant_movies = df[df['genres'].apply(lambda x: any(genre in x for genre in movie_genres))]

    # Exclude the original movie
    relevant_movies = relevant_movies[relevant_movies['title'] != movie_name]

    return relevant_movies['title'].tolist()

with open('../PKL_Files/stemmed_df_content_based', 'rb') as file:
    new_df = pickle.load(file)

movie_name = 'zulu (1964)'  # The movie you want to find relevant movies for
actual_relevant_movies = find_actual_relevant_movies_by_genre(movie_name, new_df)

print(f"Actual relevant movies for '{movie_name}': {actual_relevant_movies}")


Actual relevant movies for 'zulu (1964)': ['toy story (1995)', 'jumanji (1995)', 'grumpier old men (1995)', 'waiting to exhale (1995)', 'father of the bride part ii (1995)', 'heat (1995)', 'sabrina (1995)', 'tom and huck (1995)', 'sudden death (1995)', 'goldeneye (1995)', 'american president, the (1995)', 'dracula: dead and loving it (1995)', 'balto (1995)', 'nixon (1995)', 'cutthroat island (1995)', 'casino (1995)', 'sense and sensibility (1995)', 'four rooms (1995)', 'ace ventura: when nature calls (1995)', 'money train (1995)', 'get shorty (1995)', 'copycat (1995)', 'assassins (1995)', 'powder (1995)', 'leaving las vegas (1995)', 'othello (1995)', 'now and then (1995)', 'persuasion (1995)', 'city of lost children, the (cité des enfants perdus, la) (1995)', 'shanghai triad (yao a yao yao dao waipo qiao) (1995)', 'dangerous minds (1995)', 'twelve monkeys (a.k.a. 12 monkeys) (1995)', 'babe (1995)', 'dead man walking (1995)', 'it takes two (1995)', 'clueless (1995)', 'cry, the beloved c

In [90]:
def calculate_recall(recommended_mov_list, actual_relevant_movies):
    recommended_titles = [item['title'] for item in recommended_mov_list] if isinstance(recommended_mov_list[0], dict) else recommended_mov_list
    
    
    recommended_set = set(recommended_titles)
    relevant_set = set(actual_relevant_movies)
    
    hits = len(recommended_set & relevant_set)  
    
    # Calculate recall
    recall = hits / len(relevant_set) if relevant_set else 0  
    
    return recall

recall_value = calculate_recall(recommended_mov_list, actual_relevant_movies)
print(recall_value)


0.000708114997875655


In [91]:
def calculate_precision(recommended_mov_list, actual_relevant_movies):
    recommended_titles = [item['title'] for item in recommended_mov_list] if isinstance(recommended_mov_list[0], dict) else recommended_mov_list
    
    # Convert recommended titles to set for easier calculation
    recommended_set = set(recommended_titles)
    
    # Find the number of relevant items recommended (intersection)
    hits = len(recommended_set & set(actual_relevant_movies))  
    precision = hits / len(recommended_set) if recommended_set else 0  
    return precision


# Calculate Precision
precision_value = calculate_precision(recommended_mov_list, actual_relevant_movies)

print(f"Precision: {precision_value:.2f}")


Precision: 1.00


In [101]:
def f1_score(precision, recall):
    if precision + recall == 0:  
        return 0
    return 2 * (precision * recall) / (precision + recall)
f1 = f1_score(precision_value, recall_value)
print(f"F1 Score: {f1:.1f}")


F1 Score: 0.0


### Colaborative filtering

In [39]:
mov_ratings[mov_ratings['title'] == 'nothing in common (1986)']

Unnamed: 0,movieId,title,genres,Poster URL,userId,rating
49851,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,30,3.0
49852,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,306,3.0
49853,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,358,4.0
49854,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,518,3.0
49855,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,529,4.0
49856,2418,nothing in common (1986),[comedy],https://image.tmdb.org/t/p/original/m2G6Idl1CK...,564,4.0


In [40]:
# remove square brackets from genres column
pd.set_option('display.max_colwidth', None)

mov_ratings['genres'] = mov_ratings['genres'].apply(lambda x: ' '.join(x))
mov_ratings.drop_duplicates(inplace=True)

In [41]:
mov_ratings.head()

Unnamed: 0,movieId,title,genres,Poster URL,userId,rating
0,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,7,3.0
1,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,9,4.0
2,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,13,5.0
3,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,15,2.0
4,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,19,3.0


In [76]:
# Assuming mov_ratings DataFrame exists and 'genres' column contains multiple genres separated by commas

# Split the genres and flatten the list
unique_genres = mov_ratings['genres'].str.split(',').explode().str.strip()

# Count the unique genres
num_unique_genres = unique_genres.nunique()

# Print the result
print(f"Number of unique genres: {num_unique_genres}")

# Optionally, you can also display the unique genres
print("Unique genres are:", unique_genres.unique())


Number of unique genres: 819
Unique genres are: ['adventure animation children comedy fantasy'
 'adventure children fantasy' 'comedy romance' 'comedy drama romance'
 'comedy' 'action crime thriller' 'adventure children' 'action'
 'action adventure thriller' 'comedy horror'
 'adventure animation children' 'drama' 'action adventure romance'
 'crime drama' 'drama romance' 'action comedy crime drama thriller'
 'comedy crime thriller' 'crime drama horror mystery thriller'
 'drama sci-fi' 'children drama' 'adventure drama fantasy mystery sci-fi'
 'mystery sci-fi thriller' 'children comedy' 'drama war'
 'action crime drama' 'action adventure fantasy' 'comedy drama thriller'
 'mystery thriller' 'animation children drama musical romance'
 'crime mystery thriller' 'adventure drama' 'drama thriller'
 'comedy crime' 'action sci-fi thriller' 'action comedy horror thriller'
 'comedy drama' 'documentary' 'action crime drama thriller'
 'crime drama romance' 'action adventure drama' 'action thriller'
 

In [42]:
# Save the array to a pickle file
with open('../PKL_Files/movie_rating_collaborative', 'wb') as file:
    pickle.dump(mov_ratings, file)

In [43]:
# users who have given more than 100 ratings are considered
x = mov_ratings.groupby('userId').count()['rating'] > 100
users = x[x].index
users

Index([  4,   8,  15,  17,  19,  21,  22,  23,  26,  30,
       ...
       647, 648, 652, 654, 655, 656, 659, 664, 665, 671],
      dtype='int64', name='userId', length=257)

In [44]:
#new DataFrame to retain only the rows corresponding to users who have rated 100 movies
filtered_rating = mov_ratings[mov_ratings['userId'].isin(users)]
filtered_rating

Unnamed: 0,movieId,title,genres,Poster URL,userId,rating
3,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,15,2.0
4,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,19,3.0
6,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,23,3.0
7,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,26,5.0
8,1,toy story (1995),adventure animation children comedy fantasy,https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,30,4.0
...,...,...,...,...,...,...
97339,160567,mike & dave need wedding dates (2016),comedy,https://image.tmdb.org/t/p/original/rK0UwpiE3PSdGahfDZLCummxMwd.jpg,15,4.0
97340,160718,piper (2016),animation,https://image.tmdb.org/t/p/original/rfEkkVzmrMYqGezNLl02mVyJCP2.jpg,547,4.0
97341,161594,kingsglaive: final fantasy xv (2016),action adventure animation drama fantasy sci-fi,https://image.tmdb.org/t/p/original/xGMgspge6uYoypRiasAmlluDlXH.jpg,73,3.0
97342,161830,body (2015),drama horror thriller,https://image.tmdb.org/t/p/original/dw0xE56whflv0OgfEw3lwdWxLRD.jpg,624,1.0


In [45]:
# selecting only the movies that have received more than 50 ratings 
y = filtered_rating.groupby('title').count() > 50
famous_movies = y[y].index
famous_movies

Index([''burbs, the (1989)', ''hellboy': the seeds of creation (2004)',
       ''night mother (1986)', ''round midnight (1986)', ''salem's lot (2004)',
       ''til there was you (1997)', '(500) days of summer (2009)',
       '*batteries not included (1987)', '...and justice for all (1979)',
       '1-900 (06) (1994)',
       ...
       'zombeavers (2014)',
       'zombie (a.k.a. zombie 2: the dead are among us) (zombi 2) (1979)',
       'zombieland (2009)', 'zoolander (2001)', 'zoom (2006)',
       'zootopia (2016)', 'zulu (1964)', 'zulu (2013)',
       '¡three amigos! (1986)', 'à nous la liberté (freedom for us) (1931)'],
      dtype='object', name='title', length=6931)

In [46]:
# filters the filtered_rating DataFrame to retain only the ratings for movies that have been identified as "famous"
final_ratings = filtered_rating[filtered_rating['title'].isin(famous_movies)]

In [47]:
#organize the data in a matrix format Each row corresponds to a movie title / Each column corresponds to a userID / The values are the ratings
pt = final_ratings.pivot_table(index='title',columns='userId',values='rating')
pt.fillna(0,inplace=True)
pt

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, the (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
'hellboy': the seeds of creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'round midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'salem's lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zootopia (2016),0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zulu (1964),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zulu (2013),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡three amigos! (1986),0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Save the array to a pickle file
with open('../PKL_Files/pivot_table_collaborative', 'wb') as file:
    pickle.dump(pt, file)

In [49]:
pt.columns


Index([  4,   8,  15,  17,  19,  21,  22,  23,  26,  30,
       ...
       647, 648, 652, 654, 655, 656, 659, 664, 665, 671],
      dtype='int64', name='userId', length=257)

In [50]:
# calculates the cosine similarity between the movies in the pivot table
similarity_scores = cosine_similarity(pt)
similarity_scores

array([[1.        , 0.248708  , 0.23931939, ..., 0.        , 0.43275917,
        0.        ],
       [0.248708  , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.23931939, 0.        , 1.        , ..., 0.        , 0.2445998 ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.18156826,
        0.        ],
       [0.43275917, 0.        , 0.2445998 , ..., 0.18156826, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [51]:
# Save the array to a pickle file
with open('../PKL_Files/similarity_scores_collaborative', 'wb') as file:
    pickle.dump(similarity_scores, file)

In [52]:
def collaborative_recommend(movie_name):
    # Load files
    with open('../PKL_Files/movie_rating_collaborative', 'rb') as file:
        mov_ratings = pickle.load(file)

    with open('../PKL_Files/similarity_scores_collaborative', 'rb') as file:
        similarity_scores = pickle.load(file)

    with open('../PKL_Files/pivot_table_collaborative', 'rb') as file:
        pt = pickle.load(file)

    # index fetch
    index = np.where(pt.index==movie_name)[0][0]

    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[1:5]
    # simillar items from 1 to 4
    
    data = []
    for i in similar_items:
        item = []
        d = dict()
        temp_df = mov_ratings[mov_ratings['title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('title')['title'].values))

        d['title'] = item[0]
        d['url'] = temp_df['Poster URL'].values.tolist()[0]
        data.append(d)
    
    return data

In [53]:
collaborative_recommend('zulu (1964)')

[{'title': 'art school confidential (2006)',
  'url': 'https://image.tmdb.org/t/p/original/eV6UL22Zlpsq5XybUupWh0rBOp0.jpg'},
 {'title': "blind spot: hitler's secretary (im toten winkel - hitlers sekretärin) (2002)",
  'url': 'https://image.tmdb.org/t/p/original/4FyWb8uQsC2qj8o7rMiJgicZvy0.jpg'},
 {'title': 'cinemania (2002)',
  'url': 'https://image.tmdb.org/t/p/original/yDxDCCxuyDx6MrJkLqZwcvPWamq.jpg'},
 {'title': 'dark blue (2003)',
  'url': 'https://image.tmdb.org/t/p/original/9RTiJx3oD1s3RAMtjSo7ThZmcxw.jpg'}]

In [81]:
def got_relevant_movies(movie_name, mov_ratings, similarity_scores, k):
    
    # Find index of the movie in the similarity score matrix
    index = np.where(mov_ratings['title'] == movie_name)[0][0]
    
    # Get top-k similar movies (excluding the first item which is the movie itself)
    similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:k+1]
    
    relevant_movies = [mov_ratings.iloc[i[0]]['title'] for i in similar_items]
    return relevant_movies

In [82]:
# Example usage
movie_name = 'zulu (1964)'  # Replace with the actual movie name
relevant_movies = got_relevant_movies(movie_name, mov_ratings, similarity_scores, 5)
print(f"Relevant Movies for '{movie_name}': {relevant_movies}")

IndexError: index 75458 is out of bounds for axis 0 with size 6931

In [54]:
def collaborative_recommend_for_user(user_id, top_n=5):
    # Load files
    with open('../PKL_Files/movie_rating_collaborative', 'rb') as file:
        mov_ratings = pickle.load(file)

    with open('../PKL_Files/similarity_scores_collaborative', 'rb') as file:
        similarity_scores = pickle.load(file)

    with open('../PKL_Files/pivot_table_collaborative', 'rb') as file:
        pt = pickle.load(file)

    # Get the user's ratings from the user-movie rating matrix
    user_ratings = pt[user_id]

    # Create an empty list to store recommended movies
    recommended_movies = []

    for movie_index, similarity in enumerate(similarity_scores):
        item = []

        # Skip movies the user has already rated
        if user_ratings[movie_index] > 0:
            continue

        temp_df = mov_ratings[mov_ratings['title'] == pt.index[movie_index]]
        item.extend(list(temp_df.drop_duplicates('title')['title'].values))

        d = dict()
        d['title'] = item[0]
        d['url'] = temp_df['Poster URL'].values.tolist()[0]

        recommended_movies.append(d)

    return recommended_movies[:5]

In [55]:
# Example usage:
user_id = 200
user_recommendations = collaborative_recommend_for_user(user_id)
user_recommendations

  if user_ratings[movie_index] > 0:


[{'title': "'burbs, the (1989)",
  'url': 'https://image.tmdb.org/t/p/original/vrVPAcv2njVdnkqhBwGBc7UxCjz.jpg'},
 {'title': "'hellboy': the seeds of creation (2004)",
  'url': 'https://image.tmdb.org/t/p/original/358FTzyn2TusjvdqoW0lLMr7KTY.jpg'},
 {'title': "'night mother (1986)",
  'url': 'https://image.tmdb.org/t/p/original/5khUZ1QWNUNd9Ryq56kRbDU1959.jpg'},
 {'title': "'round midnight (1986)",
  'url': 'https://image.tmdb.org/t/p/original/a731XTWiMHOwzxyKPNmlx1PZn5z.jpg'},
 {'title': "'salem's lot (2004)",
  'url': 'https://image.tmdb.org/t/p/original/j7ncdqBVufydVzVtxmXu8Ago4ox.jpg'}]

In [63]:
def get_relevant_items(user_id, ratings_df, threshold=4):
    
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    relevant_items = user_ratings[user_ratings['rating'] >= threshold]['title'].tolist()
    return relevant_items

In [64]:
user_id = 123  # Replace with the actual user ID
relevant_movies = get_relevant_items(user_id, mov_ratings)
print(f"Relevant Movies for User {user_id}: {relevant_movies}")


Relevant Movies for User 123: ['exotica (1994)', 'natural born killers (1994)', 'in the mouth of madness (1995)', 'kingpin (1996)', 'breakfast club, the (1985)', 'friday the 13th part 3: 3d (1982)', 'gremlins (1984)', 'faculty, the (1998)', 'office space (1999)', 'sixth sense, the (1999)', 'stir of echoes (1999)', 'fight club (1999)', 'bone collector, the (1999)', 'creepshow (1982)', 'dead calm (1989)', 'pacific heights (1990)', 'final destination (2000)', 'hollow man (2000)', 'heartbreakers (2001)', 'critters (1986)']


In [66]:
def recall_at_k(recommended_items, relevant_movies, k):
    recommended_k = relevant_movies[:k]
    hits = sum([1 for item in recommended_k if item in relevant_movies])
    return hits / len(relevant_movies)

# Example usage
recall = recall_at_k(user_recommendations, relevant_movies, 5)
print("Recall@5:", recall)


Recall@5: 0.25


In [94]:
def precision_at_k(recommended_items, relevant_movies, k):
    recommended_k = recommended_items[:k] 
    hits = sum([1 for item in recommended_k if item in relevant_movies])  
    return hits / len(recommended_k) if recommended_k else 0  



# Calculate Precision@K
precision = precision_at_k(user_recommendations, relevant_movies, 5)
print("Precision@5:", precision)

Precision@5: 0.0


In [95]:

def f1_score(precision, recall):
    if precision + recall == 0:  # Avoid division by zero
        return 0
    return 2 * (precision * recall) / (precision + recall)
f1 = f1_score(precision, recall)
print("F1 Score:", f1)

F1 Score: 0.0


#### Hybrid Approch

In [56]:
# considering the union
def get_hybrid_recommendations(user_id, movie, outputs):
    # Get content-based and collaborative-based recommendations
    content_based = content_based_recommand(movie) 
    collaborative_based = collaborative_recommend_for_user(user_id) 
    
    unique_movies = {}
    
    # Add content-based recommendations to the dictionary
    for item in content_based:
        unique_movies[item['title']] = item 
    
    # Add collaborative-based recommendations to the dictionary
    for item in collaborative_based:
        unique_movies[item['title']] = item 
    
    # Convert the dictionary values to a list
    hybrid_recommendations = list(unique_movies.values())
    
    # Return only the top 'outputs' number of recommendations
    return hybrid_recommendations[:outputs]

In [57]:
user_id = 200
movie = "zulu (1964)"
outputs = 5

recommendations  = get_hybrid_recommendations(user_id,movie,outputs)

print(f"Hybrid recommendations for user {user_id} based on {movie}")
recommendations

  if user_ratings[movie_index] > 0:


Hybrid recommendations for user 200 based on zulu (1964)


[{'title': 'apocalypse now (1979)',
  'url': 'https://image.tmdb.org/t/p/original/gQB8Y5RCMkv2zwzFHbUJX3kAhvA.jpg'},
 {'title': 'boot, das (boat, the) (1981)',
  'url': 'https://image.tmdb.org/t/p/original/hNfAYNCMRthp9iruF5Q6S1z4bVA.jpg'},
 {'title': 'all quiet on the western front (1930)',
  'url': 'https://image.tmdb.org/t/p/original/2IRjbi9cADuDMKmHdLK7LaqQDKA.jpg'},
 {'title': 'saving private ryan (1998)',
  'url': 'https://image.tmdb.org/t/p/original/uqx37cS8cpHg8U35f9U5IBlrCV3.jpg'},
 {'title': 'thin red line, the (1998)',
  'url': 'https://image.tmdb.org/t/p/original/seMydAaoxQP6F0xbE1jOcTmn5Jr.jpg'}]