# **Recommender Systems using Python**

# 01. Simple Recommenders

In [44]:
# Import Pandas
import pandas as pd #Pandas library to load and manipulate data

# Load Movies Metadata using the pd.read_csv() function and assigns it to a variable called metadata.
#low_memory=False parameter is used to Enables full file read into memory.
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


Calculate mean vote across the whole report (C)

In [45]:
#lets calculate C
C = metadata['vote_average'].mean() #code assigns the mean of the 'vote_average' column to the variable 'C'.
print(C)#prints the value of 'C'

6.104142554693013


calculates the minimum number of votes(m)

In [46]:
# Calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)#quantile() function - calculate the 90th percentile of the vote_count column in the metadata dataframe
print(m) #print the value of M

383.0


Filtering out all qualified movies into a new DataFrame.

In [47]:
#creates a copy of the metadata DataFrame using the copy() method.
#loc method to select rows where the value in the vote_count column is greater than or equal to m.

q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
#The shape attribute is used to display the dimensions of the q_movies DataFrame.
q_movies.shape

(1421, 24)

simply returning the dimensions of the metadata object

In [48]:
#Python Tuple Structure Overview
#Tuples are immutable data structures in Python.
#They hold multiple values, similar to lists.
#In this case, they represent a data shape with 45466 rows and 24 columns.

metadata.shape

(14171, 24)

Calculating the value by applying this function to your DataFrame of qualified movies

In [49]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):  #function called weighted_rating that takes in a parameter x.function also uses two global variables m and C
# m - minimum number of votes
# C -  mean vote across the whole report
    v = x['vote_count'] # V - number of votes for the movie
    R = x['vote_average'] # R - average rating of the movie
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [50]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
#new feature called 'score' for a DataFrame called 'q_movies'
#value of this new feature is calculated using the 'weighted_rating()' function, which is applied to each row of the DataFrame using the 'apply()' method with the 'axis=1' parameter.
#The 'weighted_rating()' function likely takes into account various factors such as the popularity and rating of a movie to calculate a score that represents its overall quality.
#The resulting 'score' feature can be used to rank and compare movies in the DataFrame.
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

To print the top 15 movies

In [51]:
#This code sorts a DataFrame called q_movies based on a column called 'score' in descending order (highest to lowest).
#Then, it prints the top 20 movies from the sorted DataFrame, displaying their title, vote count, vote average, and score.
#The sort_values() method is used to sort the DataFrame based on the 'score' column.
#The ascending=False parameter is used to sort the values in descending order.
#The head() method is used to display the top 20 rows of the sorted DataFrame.
#The [['title', 'vote_count', 'vote_average', 'score']] syntax is used to select only the columns 'title', 'vote_count', 'vote_average', and 'score' from the DataFrame.

#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.395022
834,The Godfather,6024.0,8.5,8.35678
12481,The Dark Knight,12269.0,8.3,8.233527
2843,Fight Club,9678.0,8.3,8.216409
292,Pulp Fiction,8670.0,8.3,8.207101
522,Schindler's List,4436.0,8.3,8.12548
5481,Spirited Away,3968.0,8.3,8.106708
351,Forrest Gump,8147.0,8.2,8.105895
2211,Life Is Beautiful,3643.0,8.3,8.091104
1178,The Godfather: Part II,3418.0,8.3,8.078739


# 02. Content Based Recommender

In [52]:
#This code snippet is written in Python.
# The code is using the head() method to print the plot overviews of the first 5 movies in the metadata dataframe.
#The metadata dataframe is assumed to have a column named overview which contains the plot overviews of the movies.
#The head() method is used to display the first 5 rows of the overview column.
#This will give an idea of the plot of the first 5 movies in the dataframe.

#Print plot overviews of the first 5 movies.
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [53]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer #convert a collection of raw documents into a matrix of TF-IDF features.

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english') #This means that common English stop words such as 'the' and 'a' will be removed from the text data during vectorization.

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('') #The metadata dataframe is then loaded and any missing values in the 'overview' column are replaced with an empty string

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])#The TF-IDF matrix is then constructed by fitting and transforming the 'overview' column of the metadata dataframe using the TfidfVectorizer object.

#Output the shape of tfidf_matrix
tfidf_matrix.shape #the shape of the tfidf_matrix is outputted to the console to show the number of rows and columns in the matrix.

(14171, 38921)

In [54]:
#The AttributeError you're encountering suggests that the #TfidfVectorizer object does not have the get_feature_names method.
#In versions of scikit-learn 0.22 and later, this method has been #replaced by get_feature_names_out.
#To resolve this error, you should use the get_feature_names_out #method instead. Here's how you can modify your code:
#tfidf.get_feature_names()[5000:5010]

#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

array(['bungles', 'bungling', 'bunker', 'bunkmate', 'bunnies', 'bunny',
       'bunyan', 'bunz', 'buonarroti', 'buoy'], dtype=object)

In [55]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [56]:
cosine_sim.shape


(14171, 14171)

In [57]:
cosine_sim[1]


array([0.01669742, 1.        , 0.04884192, ..., 0.        , 0.        ,
       0.        ])

In [58]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()


In [59]:
indices[:10]


title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [60]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


In [61]:
get_recommendations('Toy Story')


2997                Toy Story 2
10301    The 40 Year Old Virgin
8327                  The Champ
1071      Rebel Without a Cause
11399    For Your Consideration
1932                  Condorman
3057            Man on the Moon
485                      Malice
11606              Factory Girl
5797              Class of 1984
Name: title, dtype: object

In [62]:
get_recommendations('Grumpier Old Men')


9207    An Extremely Goofy Movie
443                     Fearless
235                A Goofy Movie
4101               Heartbreakers
4545                         Tap
1617                        Bent
456                      Go Fish
9116            Wishful Thinking
2282                    Rushmore
7970       The City of No Limits
Name: title, dtype: object

#03. Credits, Genres, and Keywords Based Recommender

In [64]:
# Load keywords and credits
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Remove rows with bad IDs.
metadata = metadata.drop([1000, 1000, 1000])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')


In [65]:
# Print the first two movies of your newly merged metadata
metadata.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [66]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [67]:
# Import Numpy
import numpy as np

In [68]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [69]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [70]:
# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)


In [71]:
# Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)


Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [72]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


In [73]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)


In [74]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


In [75]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)


In [76]:
metadata[['soup']].head(2)


Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [77]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])


In [78]:
count_matrix.shape


(14263, 27248)

In [79]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [80]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])


In [82]:
get_recommendations('Toy Story', cosine_sim2)


3011                                     Toy Story 2
3323                               Creature Comforts
11168                                  Monster House
11641                      The Ugly Duckling and Me!
12421                                     Fallen Art
13590     Kung Fu Panda: Secrets of the Furious Five
1729                                Meet the Deedles
5653     The Looney, Looney, Looney Bugs Bunny Movie
1122                              The Wrong Trousers
1428                                 Jungle 2 Jungle
Name: title, dtype: object

In [83]:
get_recommendations('The Godfather', cosine_sim2)


1925            The Godfather: Part III
1190             The Godfather: Part II
7964     The Night of the Following Day
7735                           Mitchell
1177                     Apocalypse Now
1639                   Ill Gotten Gains
3474         Jails, Hospitals & Hip-Hop
3999                   Gardens of Stone
5284                        The Gambler
11684          The Consequences of Love
Name: title, dtype: object