## Import Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances
from sklearn.neighbors import NearestNeighbors
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

## Data Understanding

In [3]:
movies = pd.read_csv("TMBD Movie Dataset.csv")
movies.head()

Unnamed: 0.1,Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,keywords,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,profit,popularity_level
0,0,135397,tt0369610,32.985763,150000000.0,1513529000.0,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,5562,6.5,2015,137999900.0,1392446000.0,1363529000.0,High
1,1,76341,tt1392190,28.419936,150000000.0,378436400.0,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,6185,7.1,2015,137999900.0,348161300.0,228436400.0,High
2,2,262500,tt2908446,13.112507,110000000.0,295238200.0,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,based on novel|revolution|dystopia|sequel|dyst...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015-03-18,2480,6.3,2015,101200000.0,271619000.0,185238200.0,High
3,3,140607,tt2488496,11.173104,200000000.0,2068178000.0,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,android|spaceship|jedi|space opera|3d,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015-12-15,5292,7.5,2015,183999900.0,1902723000.0,1868178000.0,High
4,4,168259,tt2820852,9.335014,190000000.0,1506249000.0,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,car race|speed|revenge|suspense|car,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015-04-01,2947,7.3,2015,174799900.0,1385749000.0,1316249000.0,High


In [4]:
movies.shape

(1287, 24)

In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1287 entries, 0 to 1286
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1287 non-null   int64  
 1   id                    1287 non-null   int64  
 2   imdb_id               1287 non-null   object 
 3   popularity            1287 non-null   float64
 4   budget                1287 non-null   float64
 5   revenue               1287 non-null   float64
 6   original_title        1287 non-null   object 
 7   cast                  1287 non-null   object 
 8   homepage              1287 non-null   object 
 9   director              1287 non-null   object 
 10  tagline               1287 non-null   object 
 11  keywords              1287 non-null   object 
 12  overview              1287 non-null   object 
 13  runtime               1287 non-null   int64  
 14  genres                1287 non-null   object 
 15  production_companies 

In [6]:
movies.columns

Index(['Unnamed: 0', 'id', 'imdb_id', 'popularity', 'budget', 'revenue',
       'original_title', 'cast', 'homepage', 'director', 'tagline', 'keywords',
       'overview', 'runtime', 'genres', 'production_companies', 'release_date',
       'vote_count', 'vote_average', 'release_year', 'budget_adj',
       'revenue_adj', 'profit', 'popularity_level'],
      dtype='object')

## Movie Cast, Crew, Keywords, Genres Based Recommender

## Data Preprocessing

In [7]:
movies.isnull().sum()

Unnamed: 0              0
id                      0
imdb_id                 0
popularity              0
budget                  0
revenue                 0
original_title          0
cast                    0
homepage                0
director                0
tagline                 0
keywords                0
overview                0
runtime                 0
genres                  0
production_companies    0
release_date            0
vote_count              0
vote_average            0
release_year            0
budget_adj              0
revenue_adj             0
profit                  0
popularity_level        1
dtype: int64

In [8]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,Action|Adventure|Science Fiction|Thriller
1,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,Action|Adventure|Science Fiction|Thriller
2,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,Adventure|Science Fiction|Thriller
3,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,Action|Adventure|Science Fiction|Fantasy
4,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,Action|Crime|Thriller


In [9]:
def format_names(x):
    # Check if the input is a string and contains '|'
    if isinstance(x, str) and '|' in x:
        # Split the string by '|', strip whitespace, and return as a list
        return [name.strip() for name in x.split('|')]
    else:
        # If input is not a string or doesn't contain '|', return an empty list
        return []

# Specify columns to process
features = ['cast', 'keywords', 'genres']

# Apply the format_names function to each specified feature column
for feature in features:
    movies[feature] = movies[feature].apply(format_names)

In [10]:
def format_director(x):
    # Check if the input is a non-empty string
    if isinstance(x, str) and x.strip():  # Ensure the string is not empty after stripping whitespace
        return [x.strip()]  # Return a list with the stripped director name
    else:
        return []  # Return an empty list for invalid or empty input

# Apply the format_director function to the 'director' column
movies['director'] = movies['director'].apply(format_director)

In [11]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",[Colin Trevorrow],"[monster, dna, tyrannosaurus rex, velociraptor...","[Action, Adventure, Science Fiction, Thriller]"
1,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",[George Miller],"[future, chase, post-apocalyptic, dystopia, au...","[Action, Adventure, Science Fiction, Thriller]"
2,"[Shailene Woodley, Theo James, Kate Winslet, A...",[Robert Schwentke],"[based on novel, revolution, dystopia, sequel,...","[Adventure, Science Fiction, Thriller]"
3,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",[J.J. Abrams],"[android, spaceship, jedi, space opera, 3d]","[Action, Adventure, Science Fiction, Fantasy]"
4,"[Vin Diesel, Paul Walker, Jason Statham, Miche...",[James Wan],"[car race, speed, revenge, suspense, car]","[Action, Crime, Thriller]"


In [12]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [13]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies[feature] = movies[feature].apply(clean_data)

In [14]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,"[chrispratt, brycedallashoward, irrfankhan, vi...",[colintrevorrow],"[monster, dna, tyrannosaurusrex, velociraptor,...","[action, adventure, sciencefiction, thriller]"
1,"[tomhardy, charlizetheron, hughkeays-byrne, ni...",[georgemiller],"[future, chase, post-apocalyptic, dystopia, au...","[action, adventure, sciencefiction, thriller]"
2,"[shailenewoodley, theojames, katewinslet, anse...",[robertschwentke],"[basedonnovel, revolution, dystopia, sequel, d...","[adventure, sciencefiction, thriller]"
3,"[harrisonford, markhamill, carriefisher, adamd...",[j.j.abrams],"[android, spaceship, jedi, spaceopera, 3d]","[action, adventure, sciencefiction, fantasy]"
4,"[vindiesel, paulwalker, jasonstatham, michelle...",[jameswan],"[carrace, speed, revenge, suspense, car]","[action, crime, thriller]"


In [15]:
def create_soup(row):
    # Join keywords, cast, and genres into a single string
    soup_parts = []
    
    # Append keywords (if available)
    if isinstance(row['keywords'], list):
        soup_parts.extend(row['keywords'])
    
    # Append cast (if available)
    if isinstance(row['cast'], list):
        soup_parts.extend(row['cast'])
    
    # Append director (if available)
    if isinstance(row['director'], str):
        soup_parts.append(row['director'])
    
    # Append genres (if available)
    if isinstance(row['genres'], list):
        soup_parts.extend(row['genres'])
    
    # Join all soup parts into a single string
    return ' '.join(soup_parts)

# Apply the create_soup function to each row of the DataFrame along axis=1
movies['soup'] = movies.apply(create_soup, axis=1)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

### Constructing TF-IDF Matrix

Term Frequency (TF) is the relative frequency of a word in a document and is given as (term instances/total instances). Inverse Document Frequency (IDF) is the relative count of documents containing the term and is given as log(number of documents/documents with term). The overall importance of each word to the documents in which they appear is equal to TF * IDF

This gives us a matrix where each column represents a word in the overall overview vocabulary and each row represents a movie.This is done to reduce the importance of words that occur frequently in plot overviews and therefore, their significance in computing the final similarity score.

In [19]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['overview'])
print(tfidf_matrix.todense())
tfidf_matrix.todense().shape

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


(1287, 10470)

### Computing Similarity Score

In [20]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [21]:
movies = movies.reset_index()
indices = pd.Series(movies.index, index=movies['original_title'])

## Modelling

In [22]:
# Initialize Nearest Neighbors model
nn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
nn_model.fit(tfidf_matrix)

In [23]:
def get_cosine_similarity_recommendations(title, no_of_recommendations):
    index = movies[movies['original_title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:no_of_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['original_title', 'release_year', 'director']]

def get_knn_recommendations(title, no_of_recommendations):
    title_vector = tfidf_vectorizer.transform([title])
    distances, indices = nn_model.kneighbors(title_vector)
    movie_indices = indices.flatten()[1:no_of_recommendations+1]
    return movies.iloc[movie_indices][['original_title', 'release_year', 'director']]

def get_euclidean_distance_recommendations(title, no_of_recommendations):
    index = movies[movies['original_title'] == title].index[0]
    dist_scores = list(enumerate(euclidean_distances(tfidf_matrix[index], tfidf_matrix)[0]))
    dist_scores = sorted(dist_scores, key=lambda x: x[1])[:no_of_recommendations]
    movie_indices = [i[0] for i in dist_scores]
    return movies.iloc[movie_indices][['original_title', 'release_year', 'director']]

In [24]:
# Choose a movie title
input_title = "The Dark Knight"

cosine_sim_recommendations = get_cosine_similarity_recommendations(input_title, 5)
print("Recommendations using Cosine Similarity:")
print(cosine_sim_recommendations)

# Get recommendations using KNN
knn_recommendations = get_knn_recommendations(input_title, 5)
print("\nRecommendations using KNN:")
print(knn_recommendations)

# Get recommendations using Euclidean Distance
euclidean_dist_recommendations = get_euclidean_distance_recommendations(input_title, 5)
print("\nRecommendations using Euclidean Distance:")
print(euclidean_dist_recommendations)

Recommendations using Cosine Similarity:
                         original_title  release_year  \
722               The Dark Knight Rises          2012   
914                       Batman Begins          2005   
554  Sherlock Holmes: A Game of Shadows          2011   
857                     Despicable Me 2          2013   
80                 Secret in Their Eyes          2015   

                       director  
722          [christophernolan]  
914          [christophernolan]  
554                [guyritchie]  
857  [pierrecoffin|chrisrenaud]  
80                   [billyray]  

Recommendations using KNN:
             original_title  release_year            director
722   The Dark Knight Rises          2012  [christophernolan]
143      As Above, So Below          2014   [johnerickdowdle]
908             The Canyons          2013      [paulschrader]
1139     Death at a Funeral          2007           [frankoz]
105                    Lucy          2014         [lucbesson]

Recommendat

In [26]:
# Relevant items related to "The Dark Knight" (ground truth)
# relevant_items = ["The Dark Knight Rises", "Inception", "Interstellar", "Batman Begins", "The Dark Knight Trilogy"]
relevant_items = ["Avengers: Age of Ultron", "Guardians of the Galaxy", "Iron Man 2", "Captain America: The Winter Soldier", "Thor"]

# Function to calculate Precision, Recall, and F1-score
def evaluate_recommendations(recommended_items, relevant_items, K=5):
    recommended_set = set(recommended_items)
    relevant_set = set(relevant_items)
    true_positives = recommended_set.intersection(relevant_set)
    
    # Calculate Precision, Recall, and F1-score
    precision = len(true_positives) / K if K > 0 else 0.0
    recall = len(true_positives) / len(relevant_set) if len(relevant_set) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    return precision, recall, f1

# Choose a movie title
# input_title = "The Dark Knight"
input_title = "Captain America: The First Avenger"

# Get recommendations using Cosine Similarity
cosine_sim_recommendations = get_cosine_similarity_recommendations(input_title, 5)
cosine_sim_titles = cosine_sim_recommendations['original_title'].tolist()
precision_cosine, recall_cosine, f1_cosine = evaluate_recommendations(cosine_sim_titles, relevant_items, K=5)

# Get recommendations using KNN
knn_recommendations = get_knn_recommendations(input_title, 5)
knn_titles = knn_recommendations['original_title'].tolist()
precision_knn, recall_knn, f1_knn = evaluate_recommendations(knn_titles, relevant_items, K=5)

# Get recommendations using Euclidean Distance
euclidean_dist_recommendations = get_euclidean_distance_recommendations(input_title, 5)
euclidean_dist_titles = euclidean_dist_recommendations['original_title'].tolist()
precision_euclidean, recall_euclidean, f1_euclidean = evaluate_recommendations(euclidean_dist_titles, relevant_items, K=5)

# Display evaluation results with three decimal places
print("Evaluation Results:")
print(f"Cosine Similarity Approach - Precision: {precision_cosine:.3f}")
print(f"Cosine Similarity Approach - Recall: {recall_cosine:.3f}")
print(f"Cosine Similarity Approach - F1-score: {f1_cosine:.3f}")

print(f"\nKNN Approach - Precision: {precision_knn:.3f}")
print(f"KNN Approach - Recall: {recall_knn:.3f}")
print(f"KNN Approach - F1-score: {f1_knn:.3f}")

print(f"\nEuclidean Distance Approach - Precision: {precision_euclidean:.3f}")
print(f"Euclidean Distance Approach - Recall: {recall_euclidean:.3f}")
print(f"Euclidean Distance Approach - F1-score: {f1_euclidean:.3f}")

Evaluation Results:
Cosine Similarity Approach - Precision: 0.200
Cosine Similarity Approach - Recall: 0.200
Cosine Similarity Approach - F1-score: 0.200

KNN Approach - Precision: 0.000
KNN Approach - Recall: 0.000
KNN Approach - F1-score: 0.000

Euclidean Distance Approach - Precision: 0.200
Euclidean Distance Approach - Recall: 0.200
Euclidean Distance Approach - F1-score: 0.200
