In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Loading the data and data cleaning

In [2]:
# Load the movie dataset from the CSV file in the local data/ folder
df = pd.read_csv('data/Top_Movies.csv')
# Preview the first rows to see which columns are available
df.head()

Unnamed: 0,Movie Name,Rating,Runtime,Genre,Metascore,Plot,Directors,Stars,Votes,Gross,Link
0,The Godfather,9.2,175 min,"Crime, Drama",100.0,"Don Vito Corleone, head of a mafia family, dec...",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1914405,134966411,https://www.imdb.com/title/tt0068646/
1,The Shawshank Redemption,9.3,142 min,Drama,82.0,"Over the course of several years, two convicts...",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2751997,28341469,https://www.imdb.com/title/tt0111161/
2,Shichinin no samurai,8.6,207 min,"Action, Drama",98.0,Farmers from a village exploited by bandits hi...,Akira Kurosawa,"Toshirô Mifune, Takashi Shimura, Keiko Tsushim...",353392,269061,https://www.imdb.com/title/tt0047478/
3,Cidade de Deus,8.6,130 min,"Crime, Drama",79.0,"In the slums of Rio, two kids' paths diverge a...",Fernando Meirelles,"Kátia Lund, Alexandre Rodrigues, Leandro Firmi...",772169,7563397,https://www.imdb.com/title/tt0317248/
4,The Godfather Part II,9.0,202 min,"Crime, Drama",90.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall, Dian...",1303664,57300000,https://www.imdb.com/title/tt0071562/


Check for missing values and choose the relevant columns for the analysis

In [3]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie Name  502 non-null    object 
 1   Rating      502 non-null    float64
 2   Runtime     502 non-null    object 
 3   Genre       502 non-null    object 
 4   Metascore   480 non-null    float64
 5   Plot        502 non-null    object 
 6   Directors   502 non-null    object 
 7   Stars       502 non-null    object 
 8   Votes       502 non-null    int64  
 9   Gross       458 non-null    object 
 10  Link        502 non-null    object 
dtypes: float64(2), int64(1), object(8)
memory usage: 43.3+ KB


There are no missing values in the relevant columns (`Movie Name` and `Plot`), so I keep only those two columns, which are the ones needed for this analysis, and rename them.

In [4]:
# Keep only the title and plot columns and give them shorter names.
# Chaining .rename() on the selection (instead of inplace=True) returns a fresh
# frame, so pandas has no column subset to mutate and cannot emit a
# SettingWithCopyWarning here.
df = df[['Movie Name', 'Plot']].rename(
    columns={'Movie Name': 'title', 'Plot': 'description'}
)
df.head()

Unnamed: 0,title,description
0,The Godfather,"Don Vito Corleone, head of a mafia family, dec..."
1,The Shawshank Redemption,"Over the course of several years, two convicts..."
2,Shichinin no samurai,Farmers from a village exploited by bandits hi...
3,Cidade de Deus,"In the slums of Rio, two kids' paths diverge a..."
4,The Godfather Part II,The early life and career of Vito Corleone in ...


Check whether the data contains any duplicate titles, and remove the duplicates if there are any.

In [5]:
# Count distinct titles; a result below the row count means some titles repeat
df['title'].nunique()

450

In [6]:
# Collapse repeated titles; drop_duplicates keeps the first occurrence by default
df = df.drop_duplicates(subset=['title'])
df.head()

Unnamed: 0,title,description
0,The Godfather,"Don Vito Corleone, head of a mafia family, dec..."
1,The Shawshank Redemption,"Over the course of several years, two convicts..."
2,Shichinin no samurai,Farmers from a village exploited by bandits hi...
3,Cidade de Deus,"In the slums of Rio, two kids' paths diverge a..."
4,The Godfather Part II,The early life and career of Vito Corleone in ...


### Vectorizing the data

In [7]:
# TF-IDF model that ignores common English stop words
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the plot descriptions and encode them in one pass
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['description'])

# Dimensions of the encoded matrix
tfidf_matrix.shape


(450, 3352)

In [8]:
# Function to recommend movies based on user input
def recommend_movies(query, top_n=5):
    """Return the movies whose descriptions are most similar to a text query.

    The query is transformed with the already fitted module-level
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` using
    cosine similarity. Titles are read from the module-level ``df``.

    Parameters
    ----------
    query : str
        Free-text description of what the user would like to watch.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of tuple
        ``(title, similarity)`` pairs ordered from most to least similar.
        At most ``top_n`` pairs are returned.

    Raises
    ------
    ValueError
        If ``top_n`` is negative (a negative slice bound would silently
        select the wrong rows instead of the best matches).
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Convert query to a TF-IDF vector in the same space as the descriptions
    query_vec = vectorizer.transform([query])

    # One similarity score per row of tfidf_matrix, flattened to 1-D
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Indices of the highest scores, best first, truncated to top_n
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Look titles up by position (.iloc) because the indices are row positions
    # in tfidf_matrix; selecting the column once avoids building a full row
    # Series for every recommendation.
    titles = df['title']
    return [(titles.iloc[idx], similarities[idx]) for idx in top_indices]


### Computing the similarity

In [14]:
# Example free-text query used to exercise the recommender
test_query = 'I love thrilling action movies set in space, with a comedic twist.'
recommendations = recommend_movies(test_query)

# Display the (title, similarity score) pairs returned for the query
recommendations

[('Gravity', np.float64(0.1874574829193349)),
 ('Clerks', np.float64(0.15149353815534727)),
 ('Blade Runner', np.float64(0.1365436067298417)),
 ('The Incredibles', np.float64(0.13616673588442266)),
 ('WALL·E', np.float64(0.1266038266002731))]

### Try it out yourself!

To try out the recommender yourself, change **query_description** to anything you want and then run the `recommend_movies` function.

In [12]:
# Free-text description of the kind of movie you are looking for; edit this string
query_description = "I love romance comedy movies."

In [13]:
# Recommend movies for the description stored in query_description
recommend_movies(query_description)

[('Crazy Heart', np.float64(0.16980078153858147)),
 ('Clerks', np.float64(0.15536001039605882)),
 ('Todo sobre mi madre', np.float64(0.14592684132320902)),
 ('The Apartment', np.float64(0.1411897503252735)),
 ('Roman Holiday', np.float64(0.07080155570344575))]