# Content-Based Movie Recommender

##### The project comprises two parts. In the first part, the system uses TF-IDF vectors derived from movie overviews (plot summaries) to find movies with similar content. In the second part, the system recommends movies based on their genres: by comparing TF-IDF vectors built from each movie's genre labels, the recommender identifies movies with similar categorizations.



In [232]:
import json
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [233]:
# Build the dataset path from the current working directory.
file_name = "movie_data_2.csv"
current_dir = os.getcwd()
file_path = os.path.join(current_dir, file_name)

# Load the raw movie table and preview its first three rows.
data_indexed = pd.read_csv(filepath_or_buffer=file_path)
data_indexed.head(n=3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [234]:
# Keep only the columns that the rest of the notebook works with.
# The explicit copy makes the result an independent frame, so the later
# assignment to its "genres" column is unambiguous and cannot be reported by
# pandas as a write to a slice of the original table.
data_indexed = data_indexed[
    ["title", "budget", "genres", "id", "overview", "popularity", "vote_average", "vote_count"]
].copy()

In [235]:
# Processing movie genres

def genres(info):
    """Flatten a JSON-encoded list of genre records into one string.

    Parameters
    ----------
    info : str
        JSON text holding a list of objects that each carry a ``"name"``
        key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    str
        The ``"name"`` values joined by single spaces, in input order.
        An empty list or a non-string input (for example a NaN placeholder
        for a missing cell) gives ``""``.

    Raises
    ------
    ValueError
        If ``info`` is a string that is not valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Anything that is not text cannot carry genre records.
    if not isinstance(info, str):
        return ""

    # Parse the JSON properly instead of slicing at fixed character offsets,
    # so key order and whitespace inside the records do not matter.
    records = json.loads(info)
    return " ".join(record["name"] for record in records)

# Swap each row's raw genre payload for the string produced by `genres`.
data_indexed["genres"] = data_indexed["genres"].apply(func=genres)

# Re-key the table by movie title.
data = data_indexed.set_index(keys="title")

In [236]:
data.head(3)

Unnamed: 0_level_0,budget,genres,id,overview,popularity,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Avatar,237000000,Action Adventure Fantasy Science Fiction,19995,"In the 22nd century, a paraplegic Marine is di...",150.437577,7.2,11800
Pirates of the Caribbean: At World's End,300000000,Adventure Fantasy Action,285,"Captain Barbossa, long believed to be dead, ha...",139.082615,6.9,4500
Spectre,245000000,Action Adventure Crime,206647,A cryptic message from Bond’s past sends him o...,107.376788,6.3,4466


In [237]:
# Show the number of missing values per column before removing them; print is
# needed because only the last expression of a cell is displayed automatically.
print(data.isnull().sum())

# Drop every row that has at least one missing value.
data = data.dropna()

In [238]:
# Single-column frame with each movie's genre string.
data_genres = data.loc[:, ["genres"]]

In [239]:
# Single-column frame with each movie's overview text.
data_overview = data.loc[:, ["overview"]]



### Find similar movies based on movie overviews.

In [240]:
# Build the TF-IDF representation of the movie overviews.

# min_df=3: a term is kept only if it occurs in at least 3 overviews.
overview_tvf = TfidfVectorizer(min_df=3)
overview_tfidf_matrix = overview_tvf.fit_transform(data_overview["overview"])

dense_array = overview_tfidf_matrix.toarray()

# Label rows with the movie index and columns with the vocabulary terms, then
# transpose so that every movie is a column; the recommender correlates
# columns with each other.
overview_tfidf = pd.DataFrame(
    dense_array,
    index=data_overview.index,
    columns=overview_tvf.get_feature_names_out(),
).T

overview_tfidf.head()

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
# Recommends similar movies based on their overviews.

def overview_based_recom(name_, top_n=10):
    """Recommend movies whose overview TF-IDF vector correlates best with one movie.

    Reads the module-level ``overview_tfidf`` frame (one column per movie) and,
    for positional lookups, the module-level ``data_indexed`` frame.

    Parameters
    ----------
    name_ : str or int
        A movie title (a column label of ``overview_tfidf``), or an integer
        row position in ``data_indexed`` that is first resolved to the title
        stored in that row.
    top_n : int, default 10
        Number of recommendations to print and return.

    Returns
    -------
    pandas.Index or None
        Titles ordered from the highest to the lowest correlation. ``None``
        is returned, after printing ``"movie is not in database"``, when the
        position is out of range or the title has no column in
        ``overview_tfidf``.
    """
    # Resolve a row position to a title. The valid range comes from the frame
    # itself rather than from a hard-coded row count.
    if isinstance(name_, int):
        try:
            name_ = data_indexed.iloc[name_].loc["title"]
        except IndexError:
            print("movie is not in database")
            return None

    # Only a failed lookup means "unknown movie"; any other error propagates
    # instead of being hidden behind the same message.
    try:
        movie_1 = overview_tfidf[name_]
    except KeyError:
        print("movie is not in database")
        return None

    # A title shared by several movies selects a DataFrame; use its first column.
    if isinstance(movie_1, pd.DataFrame):
        movie_1 = movie_1.iloc[:, 0]

    print(f"Recommendation for \"{name_}\":\n")

    # Correlate every movie column with the selected movie.
    similar = overview_tfidf.corrwith(movie_1)

    # Remove the queried title by label: its own correlation is not guaranteed
    # to sort first when other movies tie with it.
    similar = similar.drop(labels=name_).sort_values(ascending=False)
    similar = similar.head(top_n).index

    for rank, title in enumerate(similar, start=1):
        print(f"{rank}: {title}")

    return similar
        



In [247]:
# Example: recommend movies whose overviews resemble that of "You've Got Mail".
overview_based_recom("You've Got Mail") 

Recommendation for "You've Got Mail":

1: The Salon
2: They Came Together
3: Déjà Vu
4: Solitary Man
5: The Number 23
6: The Ninth Gate
7: The NeverEnding Story
8: Chasing Amy
9: Miss Potter
10: The SpongeBob Movie: Sponge Out of Water


Index(['The Salon', 'They Came Together', 'Déjà Vu', 'Solitary Man',
       'The Number 23', 'The Ninth Gate', 'The NeverEnding Story',
       'Chasing Amy', 'Miss Potter',
       'The SpongeBob Movie: Sponge Out of Water'],
      dtype='object', name='title')



### Find similar movies based on movie genres.

In [243]:
# Build the TF-IDF representation of the movie genre strings.

genre_tvf = TfidfVectorizer()
genre_tfidf_matrix = genre_tvf.fit_transform(data_genres["genres"])

# One row per movie, one column per genre token.
genre_tfidf = pd.DataFrame(
    genre_tfidf_matrix.toarray(),
    index=data_genres.index,
    columns=genre_tvf.get_feature_names_out(),
)

genre_tfidf.head()

Unnamed: 0_level_0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,fiction,...,horror,movie,music,mystery,romance,science,thriller,tv,war,western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,0.358097,0.414005,0.0,0.0,0.0,0.0,0.0,0.0,0.505749,0.471479,...,0.0,0.0,0.0,0.0,0.0,0.471479,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,0.480499,0.555516,0.0,0.0,0.0,0.0,0.0,0.0,0.678619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,0.513218,0.593344,0.0,0.0,0.620121,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,0.506796,0.0,0.0,0.0,0.612362,0.0,0.3631,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.486136,0.0,0.0,0.0
John Carter,0.415098,0.479905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546528,...,0.0,0.0,0.0,0.0,0.0,0.546528,0.0,0.0,0.0,0.0


In [244]:
# Put the movies on the columns so that column-wise correlation compares movies.
# The frame is rebuilt from the fitted matrix instead of flipping the existing
# variable in place, so re-running this cell cannot undo the transpose.
genre_tfidf = pd.DataFrame(
    genre_tfidf_matrix.toarray(),
    index=data_genres.index,
    columns=genre_tvf.get_feature_names_out(),
).T
genre_tfidf.head()

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
action,0.358097,0.480499,0.513218,0.506796,0.415098,0.480499,0.0,0.415098,0.0,0.480499,...,0.0,0.0,0.0,0.0,0.0,0.543919,0.0,0.0,0.0,0.0
adventure,0.414005,0.555516,0.593344,0.0,0.479905,0.555516,0.0,0.479905,0.51141,0.555516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
animation,0.0,0.0,0.0,0.0,0.0,0.0,0.778901,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.453937,0.0,0.0,0.0,0.0,0.602843,0.184621,0.0,0.0
crime,0.0,0.0,0.620121,0.612362,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.657217,0.0,0.0,0.0,0.0


In [248]:
# Recommends similar movies based on their genres.

def genre_based_recom(movie_name, top_n=10):
    """Recommend movies whose genre TF-IDF vector correlates best with one movie.

    Reads the module-level ``genre_tfidf`` frame, which holds one column per
    movie.

    Parameters
    ----------
    movie_name : str
        Title of the reference movie; must be a column label of
        ``genre_tfidf``.
    top_n : int, default 10
        Number of recommendations to print and return.

    Returns
    -------
    pandas.Index
        Titles ordered from the highest to the lowest correlation.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``genre_tfidf``.
    """
    # Look the movie up before printing anything; an unknown title raises KeyError.
    target = genre_tfidf[movie_name]

    # A title shared by several movies selects a DataFrame; use its first column.
    if isinstance(target, pd.DataFrame):
        target = target.iloc[:, 0]

    print(f"Recommendation for \"{movie_name}\":\n")

    # Correlate every movie column with the selected movie.
    similar = genre_tfidf.corrwith(target)

    # Movies with identical genre vectors tie with the queried movie, so it is
    # not guaranteed to sort first; remove it by label rather than by position.
    similar = similar.drop(labels=movie_name).sort_values(ascending=False)
    similar = similar.head(top_n).index

    for rank, title in enumerate(similar, start=1):
        print(f"{rank}: {title}")

    return similar



In [249]:
# Example: recommend movies whose genre profile is closest to that of "Avatar".
genre_based_recom("Avatar")

Recommendation for "Avatar":

1: Superman Returns
2: Jupiter Ascending
3: X-Men: Days of Future Past
4: Superman II
5: Beastmaster 2: Through the Portal of Time
6: Man of Steel
7: Superman
8: The Wolverine
9: Superman III
10: Mystery Men


Index(['Superman Returns', 'Jupiter Ascending', 'X-Men: Days of Future Past',
       'Superman II', 'Beastmaster 2: Through the Portal of Time',
       'Man of Steel', 'Superman', 'The Wolverine', 'Superman III',
       'Mystery Men'],
      dtype='object', name='title')