In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

# Import Abstract Syntax Tree
# Only evaluates literals and does not execute arbitrary code
# Used to parse out the json string
import ast

# Natural Language Toolkit (NLTK)
import nltk
#PorterStemmer is a stemming algorithm that reduces words to their root or base form
from nltk.stem.porter import PorterStemmer

# tool for converting a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the movies dataset
movies = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
# Select only the features required
movies = movies[['id', 'title', 'overview', 'genres', 'keywords']]

# List the first record
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


In [4]:
# Create a copy of the DataFrame using the copy() method, for data cleansing

movies_copy = movies.copy()

In [5]:
# Drop null values

movies_copy.dropna(inplace=True)

In [6]:
# Show the first record of genres, it is a json string with id and name

movies_copy.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

## DATA PREPROCESSING

### Select out only the name part from json string

In [7]:
def convert(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    return List

In [8]:
# Parse out the json by calling function

movies_copy['genres'] = movies_copy['genres'].apply(convert)
movies_copy['keywords'] = movies_copy['keywords'].apply(convert)

In [9]:
movies_copy.head(3)

Unnamed: 0,id,title,overview,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."


In [10]:
# Create the 'overview' feature as comma separated list

movies_copy['overview'] = movies_copy['overview'].apply(lambda x:x.split())

In [11]:
movies_copy.head(3)

Unnamed: 0,id,title,overview,genres,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."


In [12]:
# Removes any spaces with in the 'genres' and 'keywords' features

movies_copy['genres'] = movies_copy['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_copy['keywords'] = movies_copy['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [13]:
movies_copy.head(3)

Unnamed: 0,id,title,overview,genres,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ..."


In [14]:
# Concatenate and create a new feature called 'tags'

movies_copy['tags'] = movies_copy['overview'] + movies_copy['genres'] + movies_copy['keywords']

In [15]:
movies_copy.head(3)

Unnamed: 0,id,title,overview,genres,keywords,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send..."


In [16]:
# Create a new dataframe to store the features
new_df = movies_copy[['id', 'title', 'tags']].copy()

# Create a string containing all the tags separated by space
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [17]:
# Show top 3 records
new_df.head(3)

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


# BERT

In [18]:
# using the Hugging Face transformers library along with a
# pre-trained BERT model to recommend movies based on their titles, overview, genres, keywords

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import time

# Combine 'title' and 'tags' columns
combined_text = new_df['title'].head(100) + ' ' + new_df['tags'].head(100)
#.head(100)

# User input for the movie name
user_movie_name = input("Enter the name of a movie: ")

# Record the start time
start_time = time.time()

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize movie titles and tags
movie_tokens = tokenizer(combined_text.tolist(), return_tensors='pt', padding=True, truncation=True)
user_tokens = tokenizer(user_movie_name, return_tensors='pt')

# Get BERT embeddings for movies and user input

# Disables gradient calculation for efficiency
with torch.no_grad():
    # Obtains BERT embeddings for the movie titles and tags by taking the
    # mean along the sequence length dimension
    movie_embeddings = model(**movie_tokens).last_hidden_state.mean(dim=1)
    user_embedding = model(**user_tokens).last_hidden_state.mean(dim=1)

# Calculates the cosine similarity between the user input embedding and each movie embedding
similarities = cosine_similarity(user_embedding, movie_embeddings).flatten()

#  Sorts the movies based on their cosine similarity to the user input in descending order
ranked_movies = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)
# Display min and max values
min_similarity = min(ranked_movies, key=lambda x: x[1])
max_similarity = max(ranked_movies, key=lambda x: x[1])

print(f"Minimum Similarity: {min_similarity[1]:.4f}")
print(f"Maximum Similarity: {max_similarity[1]:.4f}")

# Extract the top 10 movies
top_10_indices = [movie_index for movie_index, similarity in ranked_movies[:10]]

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"Execution time: {elapsed_time} seconds")

#  Print recommended movies
print(f"Recommended Movies for '{user_movie_name}':")
for i, movie_index in enumerate(top_10_indices):
    similarity_score = ranked_movies[i][1]
    print(f"     {new_df['title'][movie_index]} (Similarity score = {similarity_score:.4f})")

Enter the name of a movie: Dar Knight


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Minimum Similarity: 0.2116
Maximum Similarity: 0.4650
Execution time: 9.114442110061646 seconds
Recommended Movies for 'Dar Knight':
     Rush Hour 3 (Similarity score = 0.4650)
     Monsters University (Similarity score = 0.4537)
     The Golden Compass (Similarity score = 0.4279)
     Jupiter Ascending (Similarity score = 0.4265)
     Furious 7 (Similarity score = 0.4261)
     The Jungle Book (Similarity score = 0.4249)
     Prince of Persia: The Sands of Time (Similarity score = 0.4245)
     Star Trek Beyond (Similarity score = 0.4208)
     Cars 2 (Similarity score = 0.4201)
     Pirates of the Caribbean: At World's End (Similarity score = 0.4114)


# FEATURE EXTRACTION FOR WEB

In [19]:
# Convert to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [20]:
# Show the complete 'tags'
new_df['tags'][1]

"captain barbossa, long believed to be dead, has come back to life and is headed to the edge of the earth with will turner and elizabeth swann. but nothing is quite as it seems. adventure fantasy action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger"

In [21]:
ps = PorterStemmer()

In [22]:
# function that takes a text input, splits it into individual words,
# applies stemming using the PorterStemmer (ps), and
# then joins the stemmed words back into a single string
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [23]:
new_df['tags'] = new_df['tags'].apply(stem)

Example of how the function works:

stemmed_text = stem("running in the park and playing")
print(stemmed_text)
run in the park and play

In [24]:
# Show the complete 'tags'
new_df['tags'][1]

"captain barbossa, long believ to be dead, ha come back to life and is head to the edg of the earth with will turner and elizabeth swann. but noth is quit as it seems. adventur fantasi action ocean drugabus exoticisland eastindiatradingcompani loveofone'slif traitor shipwreck strongwoman ship allianc calypso afterlif fighter pirat swashbuckl aftercreditssting"

In [25]:
# Using 'CountVectorizer' to convert 'tags' column into a bag-of-words representation
# The max_features parameter limits the number of features (words)
# to the top 5000 by frequency, and stop_words='english' removes
# common English stop words (like "and", "the", etc.)
cv = CountVectorizer(max_features = 5000,stop_words='english')

# Apply the vectorizer to the 'tags' column
# to first convert to bag of words and then convert to an array
vectors = cv.fit_transform(new_df['tags']).toarray()

# Print values
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
# returns the feature names (unique words) that
# correspond to the columns in the bag-of-words matrix
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [27]:
# Compute the pairwise cosine similarity between the rows (documents) of the bag-of-words matrix
from sklearn.metrics.pairwise import cosine_similarity

# Tuple representing the shape of the matrix, where the first element
# is the number of rows and the second element is the number of columns
cosine_similarity(vectors).shape

(4800, 4800)

In [28]:
# Compute cosine similarity
similarity = cosine_similarity(vectors)

similarity

array([[1.        , 0.09128709, 0.09128709, ..., 0.04622502, 0.        ,
        0.        ],
       [0.09128709, 1.        , 0.06666667, ..., 0.02531848, 0.        ,
        0.02817181],
       [0.09128709, 0.06666667, 1.        , ..., 0.02531848, 0.        ,
        0.        ],
       ...,
       [0.04622502, 0.02531848, 0.02531848, ..., 1.        , 0.03962144,
        0.04279605],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08817334],
       [0.        , 0.02817181, 0.        , ..., 0.04279605, 0.08817334,
        1.        ]])

In [29]:
# The enumerate function to associate each element in the similarity[0] iterable
# with its index. Then, it's creating a list of tuples where each tuple contains
# an index and the corresponding similarity value. Finally, it's sorting this list
# of tuples based on the similarity values in descending order and selecting the
# top 5 indices (excluding the first index, which is the similarity of the document to itself).
sorted(list(enumerate(similarity[0])), reverse=True,key=lambda x:x[1])[1:10]

[(1213, 0.281271975231506),
 (539, 0.26940795304016235),
 (3723, 0.2676516895156553),
 (507, 0.2672612419124244),
 (61, 0.2545875386086578),
 (582, 0.25423758993656337),
 (2403, 0.251259453814803),
 (1191, 0.24873416908154544),
 (1201, 0.2457365935914952)]

## Testing

In [30]:
def recommend(movie):
    try:
        movie_index = new_df[new_df['title'] == movie].index[0]
        distance = similarity[movie_index]
        movies_list = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])
        movies_list = movies_list[1:20]

        if not movies_list:
            print("No similar movies found.")
        else:
            print("Recommended Movies:")
            for i in movies_list:
                print(new_df.iloc[i[0]].title)
    except IndexError:
        print("Movie not found in the dataset.")

In [31]:
recommend('The Dark Knight')

Recommended Movies:
The Dark Knight Rises
Batman Begins
Batman Returns
Batman Forever
Batman
Batman & Robin
Batman v Superman: Dawn of Justice
Gangster's Paradise: Jerusalema
Amidst the Devil's Wings
Blood Ties
Batman: The Dark Knight Returns, Part 2
Trash
Easy Money
Law Abiding Citizen
The Usual Suspects
The Perfect Host
Sexy Beast
Harsh Times
Sherlock Holmes: A Game of Shadows


In [32]:
# Save the model
joblib.dump(recommend, "modelRecommend.joblib")

NameError: name 'joblib' is not defined

In [None]:
import pickle

pickle.dump(new_df, open('movies_list.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.load(open('movies_list.pkl', 'rb'))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'new_df' is your DataFrame with the 'tags' and 'title' columns

# Using 'CountVectorizer' to convert 'tags' column into a bag-of-words representation
cv = CountVectorizer(max_features=5000, stop_words='english')

# Apply the vectorizer to the 'tags' column
vectors = cv.fit_transform(new_df['tags']).toarray()

# Compute cosine similarity
similarity = cosine_similarity(vectors)

def recommend(movie):
    try:
        movie_index = new_df[new_df['title'] == movie].index[0]
        distance = similarity[movie_index]
        movies_list = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])
        movies_list = movies_list[1:20]

        if not movies_list:
            print("No similar movies found.")
        else:
            print("Recommended Movies:")
            for i in movies_list:
                title = new_df.iloc[i[0]].title
                score = i[1]
                print(f"{title} (Similarity Score: {score:.4f})")

    except IndexError:
        print("Movie not found in the dataset.")

# Example: Get recommendations for a movie
recommend("The Dark Knight")

Future workings:
1. Not text sensitive, can type with % and should show results
2. Create similar process for the TV shows across multiple subscribers.

# TF-IDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the TMDB 5000 Movie Dataset
movies_df = pd.read_csv('tmdb_5000_movies.csv')

# Example: Concatenate relevant text features (title, genres, and overview)
movies_df['combined_features'] = movies_df['title'] + ' ' + movies_df['genres'] + ' ' + movies_df['overview'].fillna('')

# Record the start time
start_time = time.time()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies_df['combined_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

import time

# Function to get movie recommendations based on cosine similarity
def get_recommendations(movie_title):
    try:
        start_time = time.time()

        movie_index = movies_df.index[movies_df['title'] == movie_title].tolist()[0]
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:20]
        movie_indices = [i[0] for i in sim_scores]
        recommended_movies = movies_df[['title', 'genres']].iloc[movie_indices]
        recommended_movies['score'] = [score[1] for score in sim_scores]

        end_time = time.time()
        print(f"Execution time: {end_time - start_time} seconds")

        return recommended_movies
    except IndexError:
        print("Movie not found in the dataset.")
        return None

# Example: Get recommendations for a movie
movie_title = 'The Dark Knight'
recommendations = get_recommendations(movie_title)

if recommendations is not None:
    print(f"Recommendations for {movie_title}:\n{recommendations[['title', 'score']]}")