In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# 0. Notebook description

In this notebook, we load our cleaned dataset and build a content-based recommender from the `Overview`, `Genre`, `Star1`, `Star2`, `Star3`, and `Star4` columns.


# 1. Load dataset

In [2]:
# Read the cleaned IMDb top-1000 table. low_memory=False turns off chunked
# parsing, so each column's dtype is inferred from the whole file at once.
movies_df = pd.read_csv(
    'datasets/imdb_top_1000_cleaned.csv',
    low_memory=False,
)

# Preview the six text columns the recommender is built from.
print(
    movies_df.loc[:, ['Overview', 'Genre', 'Star1', 'Star2', 'Star3', 'Star4']].head()
)

                                            Overview                 Genre  \
0  Two imprisoned men bond over a number of years...                 Drama   
1  An organized crime dynasty's aging patriarch t...          Crime, Drama   
2  When the menace known as the Joker wreaks havo...  Action, Crime, Drama   
3  The early life and career of Vito Corleone in ...          Crime, Drama   
4  A jury holdout attempts to prevent a miscarria...          Crime, Drama   

            Star1           Star2          Star3           Star4  
0     Tim Robbins  Morgan Freeman     Bob Gunton  William Sadler  
1   Marlon Brando       Al Pacino     James Caan    Diane Keaton  
2  Christian Bale    Heath Ledger  Aaron Eckhart   Michael Caine  
3       Al Pacino  Robert De Niro  Robert Duvall    Diane Keaton  
4     Henry Fonda     Lee J. Cobb  Martin Balsam    John Fiedler  


# 2. Prepare tf-idf model for the `Overview`, `Genre`, `Star1`, `Star2`, `Star3`, and `Star4` columns

We concatenate the values of these columns into one string per movie (stored in a new `Combined` column) and then transform that text into numeric vector representations using TF-IDF.

In [3]:
# Build one text document per movie from the six content columns.
# Missing values are replaced with '' *before* the string conversion: casting a
# NaN with astype(str) yields the literal text 'nan', which TF-IDF would then
# treat as a real word shared by every movie with a missing field.
movies_df['Combined'] = (
    movies_df[['Overview', 'Genre', 'Star1', 'Star2', 'Star3', 'Star4']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)  # join the six values of each row with single spaces
)

In [4]:
# Sanity check: show the first five concatenated documents.
print(movies_df.loc[:, 'Combined'].head(5))

0    Two imprisoned men bond over a number of years...
1    An organized crime dynasty's aging patriarch t...
2    When the menace known as the Joker wreaks havo...
3    The early life and career of Vito Corleone in ...
4    A jury holdout attempts to prevent a miscarria...
Name: Combined, dtype: object


In [5]:
# TF-IDF over the concatenated text. English stop words are removed, and terms
# that occur in more than 80% of the documents (max_df=0.8) or in fewer than
# two documents (min_df=2) are left out of the vocabulary.
vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.8,
    stop_words='english',
)
tfidf_combined = vectorizer.fit_transform(movies_df['Combined'])

# One row per movie, one column per vocabulary term.
print(f'Matrix contains {tfidf_combined.shape[0]} movies and {tfidf_combined.shape[1]} words')

Matrix contains 999 movies and 3428 words


### Inspect the tf-idf model

Print the first 100 feature names (terms) from the combined vocabulary.

In [6]:
# Terms of the fitted vocabulary, one per column of the TF-IDF matrix.
columns = vectorizer.get_feature_names_out()

# Only the first 100 are shown to keep the output short.
print(columns[0:100])

['000' '007' '10' '12' '1920s' '1925' '1930s' '1936' '1940s' '1950'
 '1950s' '1960s' '1962' '1969' '1970s' '1980' '1980s' '1984' '1985' '1990'
 '20' '21' '24' '40' 'aamir' 'aaron' 'abandoned' 'abducted' 'abhay'
 'abilities' 'able' 'aboard' 'abraham' 'abuse' 'accepts' 'accident'
 'accidental' 'accidentally' 'accused' 'accuses' 'achieve' 'act' 'acting'
 'action' 'activist' 'activists' 'actor' 'actress' 'acts' 'actually'
 'adam' 'adams' 'adil' 'adolf' 'adopted' 'adrien' 'advanced' 'adventure'
 'adventures' 'adventurous' 'advisor' 'adèle' 'affair' 'affections'
 'affleck' 'affluent' 'africa' 'african' 'age' 'aged' 'agent' 'agents'
 'aging' 'ago' 'agree' 'agrees' 'ahmed' 'aid' 'aided' 'aiello' 'aimée'
 'akbag' 'akhtar' 'akira' 'akshay' 'al' 'alabama' 'alan' 'albert'
 'alcohol' 'alcoholic' 'aldo' 'alec' 'aleksandr' 'aleksey' 'alex'
 'alexander' 'alexandra' 'alexandre' 'ali']


# 3. Find similar movies

To find similar movies, we run a k-nearest-neighbours (KNN) search over the TF-IDF vectors, using **cosine distance** (1 − cosine similarity) as the metric.

In [7]:
def get_content_based_recommendation_combined(title, top_n=10, metric='cosine'):
    """Return the ``top_n`` movies most similar to ``title``.

    Similarity is computed between rows of the module-level ``tfidf_combined``
    matrix, so the result depends on whichever matrix that name is bound to
    at the time of the call.

    Parameters
    ----------
    title : str
        Movie title, matched case-insensitively against ``Series_Title``.
        If several rows share the title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.
    metric : str, default 'cosine'
        Distance metric forwarded to ``NearestNeighbors``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies_df`` for the nearest neighbours, closest first,
        never including the queried movie itself.

    Raises
    ------
    IndexError
        If no movie matches ``title``.
    """
    # Locate the movie by *position*, not by index label: both the TF-IDF
    # matrix rows and the final ``.iloc`` lookup are positional.
    is_match = (movies_df['Series_Title'].str.lower() == title.lower()).to_numpy(
        dtype=bool, na_value=False
    )
    positions = is_match.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f'No movie titled {title!r} found in movies_df')
    idx = int(positions[0])

    # Ask for one extra neighbour to leave room for the query itself, but
    # never for more neighbours than there are rows in the matrix.
    n_rows = tfidf_combined.shape[0]
    model = NearestNeighbors(n_neighbors=min(top_n + 1, n_rows), metric=metric)
    model.fit(tfidf_combined)

    # Find similar movies (row positions, closest first)
    neighbours = model.kneighbors(tfidf_combined[idx], return_distance=False)[0]

    # Drop the query by position instead of assuming it is ranked first:
    # another row with an identical vector ties with it at distance 0.
    neighbours = neighbours[neighbours != idx][:top_n]

    # Return the top recommendations
    return movies_df.iloc[neighbours]

In [8]:
# Recommendations for 'The Godfather', restricted to the descriptive columns.
get_content_based_recommendation_combined('The Godfather').loc[
    :,
    ['Series_Title', 'Genre', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes'],
]

Unnamed: 0,Series_Title,Genre,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
973,The Godfather: Part III,"Crime, Drama",Al Pacino,Diane Keaton,Andy Garcia,Talia Shire,7.6,359809
3,The Godfather: Part II,"Crime, Drama",Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,9.0,1129952
397,Scent of a Woman,Drama,Al Pacino,Chris O'Donnell,James Rebhorn,Gabrielle Anwar,8.0,263918
304,On the Waterfront,"Crime, Drama, Thriller",Marlon Brando,Karl Malden,Lee J. Cobb,Rod Steiger,8.1,142107
648,The Insider,"Biography, Drama, Thriller",Russell Crowe,Al Pacino,Christopher Plummer,Diane Venora,7.8,159886
446,A Streetcar Named Desire,Drama,Vivien Leigh,Marlon Brando,Kim Hunter,Karl Malden,8.0,99182
822,Glengarry Glen Ross,"Crime, Drama, Mystery",Al Pacino,Jack Lemmon,Alec Baldwin,Alan Arkin,7.7,95826
108,Scarface,"Crime, Drama",Al Pacino,Michelle Pfeiffer,Steven Bauer,Mary Elizabeth Mastrantonio,8.3,740911
848,Serpico,"Biography, Crime, Drama",Al Pacino,John Randolph,Jack Kehoe,Biff McGuire,7.7,109941
53,Capharnaüm,Drama,Zain Al Rafeea,Yordanos Shiferaw,Boluwatife Treasure Bankole,Kawsar Al Haddad,8.4,62635


In [9]:
# Recommendations for 'The Dark Knight', restricted to the descriptive columns.
get_content_based_recommendation_combined('The Dark Knight').loc[
    :,
    ['Series_Title', 'Genre', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes'],
]

Unnamed: 0,Series_Title,Genre,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
154,Batman Begins,"Action, Adventure",Christian Bale,Michael Caine,Ken Watanabe,Liam Neeson,8.2,1308302
63,The Dark Knight Rises,"Action, Adventure",Christian Bale,Tom Hardy,Anne Hathaway,Gary Oldman,8.4,1516346
772,Brokeback Mountain,"Drama, Romance",Jake Gyllenhaal,Heath Ledger,Michelle Williams,Randy Quaid,7.7,323103
36,The Prestige,"Drama, Mystery, SciFi",Christian Bale,Hugh Jackman,Scarlett Johansson,Michael Caine,8.5,1190259
777,The Machinist,"Drama, Thriller",Christian Bale,Jennifer Jason Leigh,Aitana Sánchez-Gijón,John Sharian,7.7,358432
240,Kill Bill: Vol. 1,"Action, Crime, Drama",Uma Thurman,David Carradine,Daryl Hannah,Michael Madsen,8.1,1000639
33,Joker,"Crime, Drama, Thriller",Joaquin Phoenix,Robert De Niro,Zazie Beetz,Frances Conroy,8.5,939252
831,Empire of the Sun,"Action, Drama, History",Christian Bale,John Malkovich,Miranda Richardson,Nigel Havers,7.7,115677
290,La battaglia di Algeri,"Drama, War",Brahim Hadjadj,Jean Martin,Yacef Saadi,Samia Kerbash,8.1,53089
952,American Psycho,"Comedy, Crime, Drama",Christian Bale,Justin Theroux,Josh Lucas,Bill Sage,7.6,490062


# 2. a) Alternative approach - Prepare tf-idf model for the `Overview`, `Genre`, `Star1`, `Star2`, `Star3`, and `Star4` columns

Instead of concatenating all six columns into one big string, we use a separate vectorizer for each feature group (`Overview`, `Genre`, and the actors).

First, we combine the actor columns into a single column and then transform them into numeric vector representations using TF-IDF.

In [10]:
# Merge the four Star columns into one space-separated string per movie.
# Missing names become '' so they contribute no text of their own.
movies_df['Actors'] = (
    movies_df[['Star1', 'Star2', 'Star3', 'Star4']]
    .fillna('')
    .agg(' '.join, axis=1)
)

We use different instances of vectorizers and store the feature names for each column separately. Then, we concatenate the TF-IDF matrices horizontally.

In [11]:
# Separate vectorizers for each feature group, so each group gets its own
# vocabulary and its own IDF weights.
# NOTE(review): the English stop-word filter and min_df=2 are applied to genre
# labels and actor names as well — confirm that is intended for those columns.
vectorizer_overview = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)
vectorizer_genre = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)
vectorizer_actors = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)

# Transform each column
tfidf_overview = vectorizer_overview.fit_transform(movies_df['Overview'])
tfidf_genre = vectorizer_genre.fit_transform(movies_df['Genre'])
tfidf_actors = vectorizer_actors.fit_transform(movies_df['Actors'])

# Concatenate the TF-IDF matrices horizontally.
# CSR is requested explicitly because the result is row-indexed later
# (`tfidf_combined[idx]`); without `format`, hstack picks the output format
# itself, and a COO result does not support indexing.
# NOTE: this rebinds `tfidf_combined`, replacing the matrix that was built
# from the `Combined` column earlier in the notebook.
tfidf_combined = hstack([tfidf_overview, tfidf_genre, tfidf_actors], format='csr')

# Feature names for each column
features_overview = vectorizer_overview.get_feature_names_out()
features_genre = vectorizer_genre.get_feature_names_out()
features_actors = vectorizer_actors.get_feature_names_out()

print(f'Matrix contains {tfidf_combined.shape[0]} movies and {tfidf_combined.shape[1]} words')


Matrix contains 999 movies and 3447 words


### Inspect the tf-idf model

Print sample feature names from each column.

In [12]:
# Show the first ten vocabulary terms learned by each per-column vectorizer.
print("Overview features:", features_overview[0:10])
print("Genre features:", features_genre[0:10])
print("Actors features:", features_actors[0:10])

Overview features: ['000' '007' '10' '12' '1920s' '1925' '1930s' '1936' '1940s' '1950']
Genre features: ['action' 'adventure' 'animation' 'biography' 'comedy' 'crime' 'drama'
 'family' 'fantasy' 'film']
Actors features: ['aamir' 'aaron' 'abhay' 'abraham' 'adam' 'adams' 'adil' 'adrien' 'adèle'
 'affleck']


# 3. Alternative approach - Find similar movies

To find similar movies, we run a k-nearest-neighbours (KNN) search over the TF-IDF vectors, using **cosine distance** (1 − cosine similarity) as the metric.

In [13]:
def get_content_based_recommendation_combined_alternative(title, top_n=10, metric='cosine'):
    """Return the ``top_n`` movies most similar to ``title``.

    Similarity is computed between rows of the module-level ``tfidf_combined``
    matrix, so the result depends on whichever matrix that name is bound to
    at the time of the call.

    Parameters
    ----------
    title : str
        Movie title, matched case-insensitively against ``Series_Title``.
        If several rows share the title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.
    metric : str, default 'cosine'
        Distance metric forwarded to ``NearestNeighbors``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies_df`` for the nearest neighbours, closest first,
        never including the queried movie itself.

    Raises
    ------
    IndexError
        If no movie matches ``title``.
    """
    # Locate the movie by *position*, not by index label: both the TF-IDF
    # matrix rows and the final ``.iloc`` lookup are positional.
    is_match = (movies_df['Series_Title'].str.lower() == title.lower()).to_numpy(
        dtype=bool, na_value=False
    )
    positions = is_match.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f'No movie titled {title!r} found in movies_df')
    idx = int(positions[0])

    # A horizontally stacked sparse matrix may be in a format that cannot be
    # row-indexed (e.g. COO), so normalise to CSR before slicing the query row.
    features = tfidf_combined.tocsr() if hasattr(tfidf_combined, 'tocsr') else tfidf_combined

    # Ask for one extra neighbour to leave room for the query itself, but
    # never for more neighbours than there are rows in the matrix.
    n_rows = features.shape[0]
    model = NearestNeighbors(n_neighbors=min(top_n + 1, n_rows), metric=metric)
    model.fit(features)

    # Find similar movies (row positions, closest first)
    neighbours = model.kneighbors(features[idx], return_distance=False)[0]

    # Drop the query by position instead of assuming it is ranked first:
    # another row with an identical vector ties with it at distance 0.
    neighbours = neighbours[neighbours != idx][:top_n]

    # Return the top recommendations
    return movies_df.iloc[neighbours]

In [14]:
# Recommendations for 'The Godfather' from the per-column TF-IDF model.
get_content_based_recommendation_combined_alternative('The Godfather').loc[
    :,
    ['Series_Title', 'Genre', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes'],
]

Unnamed: 0,Series_Title,Genre,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
973,The Godfather: Part III,"Crime, Drama",Al Pacino,Diane Keaton,Andy Garcia,Talia Shire,7.6,359809
3,The Godfather: Part II,"Crime, Drama",Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,9.0,1129952
108,Scarface,"Crime, Drama",Al Pacino,Michelle Pfeiffer,Steven Bauer,Mary Elizabeth Mastrantonio,8.3,740911
638,Lilja 4-ever,"Crime, Drama",Oksana Akinshina,Artyom Bogucharskiy,Pavel Ponomaryov,Lyubov Agapova,7.8,42673
464,Manbiki kazoku,"Crime, Drama",Lily Franky,Sakura Andô,Kirin Kiki,Mayu Matsuoka,7.9,62754
164,Casino,"Crime, Drama",Robert De Niro,Sharon Stone,Joe Pesci,James Woods,8.2,466276
298,Les quatre cents coups,"Crime, Drama",Jean-Pierre Léaud,Albert Rémy,Claire Maurier,Guy Decomble,8.1,105291
668,Boyz n the Hood,"Crime, Drama",Cuba Gooding Jr.,Laurence Fishburne,Hudhail Al-Amir,Lloyd Avery II,7.8,126082
71,Once Upon a Time in America,"Crime, Drama",Robert De Niro,James Woods,Elizabeth McGovern,Treat Williams,8.4,311365
22,Cidade de Deus,"Crime, Drama",Kátia Lund,Alexandre Rodrigues,Leandro Firmino,Matheus Nachtergaele,8.6,699256


In [15]:
# Recommendations for 'The Dark Knight' from the per-column TF-IDF model
# (Genre is listed first here).
get_content_based_recommendation_combined_alternative('The Dark Knight').loc[
    :,
    ['Genre', 'Series_Title', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes'],
]

Unnamed: 0,Genre,Series_Title,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
773,"Action, Crime, Drama",3:10 to Yuma,Russell Crowe,Christian Bale,Ben Foster,Logan Lerman,7.7,288797
240,"Action, Crime, Drama",Kill Bill: Vol. 1,Uma Thurman,David Carradine,Daryl Hannah,Michael Madsen,8.1,1000639
154,"Action, Adventure",Batman Begins,Christian Bale,Michael Caine,Ken Watanabe,Liam Neeson,8.2,1308302
900,"Action, Crime, Drama",End of Watch,Jake Gyllenhaal,Michael Peña,Anna Kendrick,America Ferrera,7.6,228132
967,"Action, Crime, Drama",Falling Down,Michael Douglas,Robert Duvall,Barbara Hershey,Rachel Ticotin,7.6,171640
63,"Action, Adventure",The Dark Knight Rises,Christian Bale,Tom Hardy,Anne Hathaway,Gary Oldman,8.4,1516346
223,"Action, Crime, Drama",A Wednesday,Anupam Kher,Naseeruddin Shah,Jimmy Sheirgill,Aamir Bashir,8.1,73891
887,"Action, Crime, Drama",Baby Driver,Ansel Elgort,Jon Bernthal,Jon Hamm,Eiza González,7.6,439406
673,"Action, Crime, Drama",Dip huet seung hung,Yun-Fat Chow,Danny Lee,Sally Yeh,Kong Chu,7.8,45624
576,"Action, Crime, Drama",Udta Punjab,Shahid Kapoor,Alia Bhatt,Kareena Kapoor,Diljit Dosanjh,7.8,27175
