In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# 0. Notebook description

In this notebook, we load our cleaned dataset and perform a content-based recommendation based on the `Star1`, `Star2`, `Star3`, and `Star4` columns, which contain the actors of each movie.


# 1. Load dataset

In [21]:
movies_df = pd.read_csv('datasets/imdb_top_1000_cleaned.csv', low_memory=False)

print(movies_df[['Star1', 'Star2', 'Star3', 'Star4']].head())

            Star1           Star2          Star3           Star4
0     Tim Robbins  Morgan Freeman     Bob Gunton  William Sadler
1   Marlon Brando       Al Pacino     James Caan    Diane Keaton
2  Christian Bale    Heath Ledger  Aaron Eckhart   Michael Caine
3       Al Pacino  Robert De Niro  Robert Duvall    Diane Keaton
4     Henry Fonda     Lee J. Cobb  Martin Balsam    John Fiedler


# 2. Prepare tf-idf model for the `Genre` column

We combine the actor columns into a single column and then transform them into numeric vector representations using TF-IDF.

In [22]:
# Combine the Star columns into a single string
movies_df['Actors'] = (
        movies_df['Star1'].fillna('') + ' ' +
        movies_df['Star2'].fillna('') + ' ' +
        movies_df['Star3'].fillna('') + ' ' +
        movies_df['Star4'].fillna('')
)

In [23]:
# Print the first 10 rows of the new DataFrame
print(movies_df.head(10))

                                         Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   
5  https://m.media-amazon.com/images/M/MV5BNzA5ZD...   
6  https://m.media-amazon.com/images/M/MV5BNGNhMD...   
7  https://m.media-amazon.com/images/M/MV5BNDE4OT...   
8  https://m.media-amazon.com/images/M/MV5BMjAxMz...   
9  https://m.media-amazon.com/images/M/MV5BMmEzNT...   

                                    Series_Title Released_Year Certificate  \
0                       The Shawshank Redemption          1994           A   
1                                  The Godfather          1972           A   
2                                The Dark Knight          2008          UA   
3                         The Godfather: Part II          1974         

In [24]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)

tfidf_actors = vectorizer.fit_transform(movies_df['Actors'])

print(f'Matrix contains {tfidf_actors.shape[0]} movies and {tfidf_actors.shape[1]} words')

Matrix contains 1000 movies and 1377 words


# 3. Find similar movies

To find similar movies, we use the KNN algorithm with **cosine similarity** as a distance metric to find the nearest neighbours.

In [25]:
def get_content_based_recommendation_actors(title, top_n=10, metric='cosine'):
    # Find the index of the movie that matches the title
    idx = movies_df[movies_df.Series_Title.str.lower() == title.lower()].index[0]

    # Build the KNN model
    model = NearestNeighbors(n_neighbors=top_n+1, metric=metric)
    model.fit(tfidf_actors)

    # Find similar movies
    similar_movies = model.kneighbors(tfidf_actors[idx], return_distance=False)[0]
    similar_movies = similar_movies[1:]  # remove the first item (the movie itself)

    # Return the top recommendations
    return movies_df.iloc[similar_movies]

In [26]:
get_content_based_recommendation_actors('The Godfather')[['Series_Title', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes']]

Unnamed: 0,Series_Title,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
3,The Godfather: Part II,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,9.0,1129952
974,The Godfather: Part III,Al Pacino,Diane Keaton,Andy Garcia,Talia Shire,7.6,359809
398,Scent of a Woman,Al Pacino,Chris O'Donnell,James Rebhorn,Gabrielle Anwar,8.0,263918
649,The Insider,Russell Crowe,Al Pacino,Christopher Plummer,Diane Venora,7.8,159886
53,Capharnaüm,Zain Al Rafeea,Yordanos Shiferaw,Boluwatife Treasure Bankole,Kawsar Al Haddad,8.4,62635
533,Manhattan,Woody Allen,Diane Keaton,Mariel Hemingway,Michael Murphy,7.9,131436
74,Apocalypse Now,Martin Sheen,Marlon Brando,Robert Duvall,Frederic Forrest,8.4,606398
305,On the Waterfront,Marlon Brando,Karl Malden,Lee J. Cobb,Rod Steiger,8.1,142107
447,A Streetcar Named Desire,Vivien Leigh,Marlon Brando,Kim Hunter,Karl Malden,8.0,99182
414,Annie Hall,Woody Allen,Diane Keaton,Tony Roberts,Carol Kane,8.0,251823


In [27]:
get_content_based_recommendation_actors('The Dark Knight')[['Series_Title', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating', 'No_of_Votes']]

Unnamed: 0,Series_Title,Star1,Star2,Star3,Star4,IMDB_Rating,No_of_Votes
36,The Prestige,Christian Bale,Hugh Jackman,Scarlett Johansson,Michael Caine,8.5,1190259
155,Batman Begins,Christian Bale,Michael Caine,Ken Watanabe,Liam Neeson,8.2,1308302
773,Brokeback Mountain,Jake Gyllenhaal,Heath Ledger,Michelle Williams,Randy Quaid,7.7,323103
819,The Muppet Christmas Carol,Michael Caine,Kermit the Frog,Dave Goelz,Miss Piggy,7.7,50298
778,The Machinist,Christian Bale,Jennifer Jason Leigh,Aitana Sánchez-Gijón,John Sharian,7.7,358432
217,Ford v Ferrari,Matt Damon,Christian Bale,Jon Bernthal,Caitriona Balfe,8.1,291289
832,Empire of the Sun,Christian Bale,John Malkovich,Miranda Richardson,Nigel Havers,7.7,115677
953,American Psycho,Christian Bale,Justin Theroux,Josh Lucas,Bill Sage,7.6,490062
692,The Man Who Would Be King,Sean Connery,Michael Caine,Christopher Plummer,Saeed Jaffrey,7.8,44917
63,The Dark Knight Rises,Christian Bale,Tom Hardy,Anne Hathaway,Gary Oldman,8.4,1516346
