
# <h1 align="center"> Movie Recommendations with Document Similarity </h1>


<hr />




![](https://i.imgur.com/c7Go7d3.png)

# Install Dependencies

In [None]:
!pip install textsearch
!pip install contractions
import nltk
nltk.download('punkt')
nltk.download('stopwords')

Collecting textsearch
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch)
  Downloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (13 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch
Successfully installed anyascii-0.3.3 pyahocorasick-2.3.0 textsearch-0.0.24
Collecting contractions
  Downloading contr

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load and View Data

In [None]:
import pandas as pd

# Read the gzip-compressed TMDB movies CSV straight from the workshop's GitHub repository.
# The URL is split across adjacent string literals purely for readability.
df = pd.read_csv(
    'https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/'
    'Unit%2010%20-%20Project%208%20-%20Movie%20Recommendations%20with%20Document%20Similarity/'
    'tmdb_5000_movies.csv.gz',
    compression='gzip',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [None]:
df.head()

## Let's focus on only the tagline and overview fields

In [None]:
# Keep only the columns needed for recommendations; .copy() makes this an independent
# frame so the assignments below never act on a view of the raw data.
df = df[['title', 'tagline', 'overview', 'popularity']].copy()
# Assign the filled column back instead of calling fillna(inplace=True) on `df.tagline`:
# that chained in-place call operates on a temporary and stops working in pandas 3.0,
# after which the dropna() below would discard every movie without a tagline.
df['tagline'] = df['tagline'].fillna('')

# Combined text used as the document for similarity computations.
df['description'] = df['tagline'] + ' ' + df['overview']

# Drop rows that still contain missing values (e.g. a missing overview).
df = df.dropna(axis=0)
# Most popular movies first, with a fresh 0..n-1 index.
df = df.sort_values(by=['popularity'], ascending=False).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        4800 non-null   object 
 1   tagline      4800 non-null   object 
 2   overview     4800 non-null   object 
 3   popularity   4800 non-null   float64
 4   description  4800 non-null   object 
dtypes: float64(1), object(4)
memory usage: 187.6+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.tagline.fillna('', inplace=True)


In [None]:
df.head()

Unnamed: 0,title,tagline,overview,popularity,description
0,Minions,"Before Gru, they had a history of bad bosses","Minions Stuart, Kevin and Bob are recruited by...",875.581305,"Before Gru, they had a history of bad bosses M..."
1,Interstellar,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,724.247784,Mankind was born on Earth. It was never meant ...
2,Deadpool,Witness the beginning of a happy ending,Deadpool tells the origin story of former Spec...,514.569956,Witness the beginning of a happy ending Deadpo...
3,Guardians of the Galaxy,All heroes start somewhere.,"Light years from Earth, 26 years after being a...",481.098624,All heroes start somewhere. Light years from E...
4,Mad Max: Fury Road,What a Lovely Day.,An apocalyptic story set in the furthest reach...,434.278564,What a Lovely Day. An apocalyptic story set in...


# Build a Movie Recommender System

Pipeline
- Text pre-processing
- Feature Engineering
- Document Similarity Computation
- Find top similar movies
- Build a movie recommendation function

## Text pre-processing

In [None]:
import re
import numpy as np
import contractions
nltk.download('punkt_tab')
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """
    Clean a single text document for downstream feature extraction.

    Steps, in order: expand contractions, delete every character that is not
    an ASCII letter, digit or whitespace, lower-case the text, collapse runs
    of spaces into one space, tokenize with NLTK and drop tokens found in the
    module-level `stop_words` list.

    Args:
    - doc: The raw document text.

    Returns:
    - The remaining tokens joined by single spaces.
    """
    expanded = contractions.fix(doc)
    # anything outside [a-zA-Z0-9] and whitespace is deleted, not replaced by a space
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', expanded, flags=re.I|re.A)
    lowered = alnum_only.lower()
    # collapse repeated spaces (other whitespace is left for the tokenizer to handle)
    collapsed = re.sub(' +', ' ', lowered)

    kept_tokens = []
    for token in nltk.word_tokenize(collapsed):
        if token not in stop_words:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Wrap the per-document cleaner so it can be applied element-wise to a sequence of documents.
normalize_corpus = np.vectorize(normalize_document)

# Clean every movie description; the vectorized call returns a NumPy array
# holding one normalized string per input document.
norm_corpus = normalize_corpus(list(df['description']))
len(norm_corpus)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


4800

## Extract TF-IDF Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the normalized corpus and encode
# each document as a TF-IDF vector.
tv = TfidfVectorizer(use_idf=True)
tfidf_matrix = tv.fit_transform(norm_corpus)
# NOTE(review): fit_transform returns a sparse matrix and .toarray() expands it into a
# dense documents x vocabulary array. cosine_similarity also accepts sparse input, so
# this conversion could probably be dropped to save memory -- confirm that no other
# cell needs a dense array first.
tfidf_matrix = tfidf_matrix.toarray()

## Compute Pairwise Document Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
0,1.0,0.0,0.0,0.0,0.006865,0.008287,0.0,0.0,0.0,0.0,...,0.018355,0.0,0.035387,0.0,0.0,0.0,0.0,0.0,0.0,0.011443
1,0.0,1.0,0.0,0.018854,0.009756,0.0,0.0,0.016162,0.0,0.01489,...,0.0,0.0,0.017742,0.0,0.029033,0.0,0.0,0.0,0.0,0.010228
2,0.0,0.0,1.0,0.0,0.018351,0.0,0.0,0.0,0.0,0.021294,...,0.0,0.007159,0.004427,0.0,0.017052,0.0,0.01968,0.0,0.029918,0.010467
3,0.0,0.018854,0.0,1.0,0.0,0.02303,0.0,0.0,0.0,0.034487,...,0.0,0.066821,0.023365,0.0,0.050748,0.029704,0.018135,0.0,0.0,0.0
4,0.006865,0.009756,0.018351,0.0,1.0,0.005562,0.0,0.089357,0.0,0.0,...,0.025018,0.025016,0.039525,0.0,0.025676,0.0,0.0454,0.005151,0.058819,0.015761


## Get List of Movie Titles

In [None]:
movies_list = df['title'].values
movies_list, movies_list.shape

(array(['Minions', 'Interstellar', 'Deadpool', ..., 'Penitentiary',
        'Alien Zone', 'America Is Still the Place'], dtype=object),
 (4800,))

## Find Top Similar Movies for a Sample Movie

Let's take __Minions__, the most popular movie in the dataframe above, and try to find the most similar movies which can be recommended.

#### Find movie ID for 'Minions'

In [None]:
movie_idx = df[df['title'] == 'Minions'].index[0]
movie_idx

np.int64(0)

#### Get movie similarities

With the ID for the minions movie, select the row of the Minions movie from the cosine similarity dataframe --> this row is a vector of similarities between Minions and **all other movies**

In [None]:
movie_similarities = doc_sim_df[movie_idx]
movie_similarities

Unnamed: 0,0
0,1.000000
1,0.000000
2,0.000000
3,0.000000
4,0.006865
...,...
4795,0.000000
4796,0.000000
4797,0.000000
4798,0.000000


#### Get IDS of top 5 similar movies

Tip: use numpy functionalities. Do not include Minions movie itself.

In [None]:
# Remove the query movie by its index label rather than assuming it is ranked first:
# another movie can tie at the top (e.g. an identical description), in which case
# skipping position 0 would drop the wrong entry. A single nlargest() call also
# replaces the five repeated rankings.
similar_movie_idxs = doc_sim_df[movie_idx].drop(movie_idx).nlargest(5).index.tolist()
similar_movie_idxs

[np.int64(33), np.int64(60), np.int64(1212), np.int64(4083), np.int64(737)]

#### Get names of top 5 similar movies

In [None]:
# Look up all titles in one positional selection on the title column.
similar_movies = df['title'].iloc[similar_movie_idxs].tolist()
similar_movies

['Despicable Me 2',
 'Despicable Me',
 'Stuart Little 2',
 'Darling Companion',
 'Teenage Mutant Ninja Turtles: Out of the Shadows']

# Build a movie recommender function to recommend top 5 similar movies for any movie


In [None]:
def movie_recommender(movie_title, movies=None, doc_sims=None, top_n=5):
    """
    Recommend the movies most similar to a given movie.

    Args:
    - movie_title: Title of the movie to find recommendations for.
    - movies: Sequence of movie titles whose positions match the labels of
      `doc_sims`. When None, the notebook-level `movies_list` is used.
    - doc_sims: Square DataFrame of pairwise document similarities, labelled
      0..n-1 on both axes. When None, the notebook-level `doc_sim_df` is used.
    - top_n: Number of recommendations to return (default 5).

    Returns:
    - List of up to `top_n` movie titles, most similar first. The queried
      movie itself is never included.

    Raises:
    - IndexError: if `movie_title` does not occur in `movies`.

    The fallbacks are resolved at call time, so re-assigning `doc_sim_df`
    (as the notebook does when it switches to embedding-based similarities)
    is picked up by calls that omit `doc_sims`.
    """
    if movies is None:
        movies = movies_list
    if doc_sims is None:
        doc_sims = doc_sim_df

    # find movie id: position of the first matching title
    movie_idx = next(
        (idx for idx, title in enumerate(movies) if title == movie_title),
        None,
    )
    if movie_idx is None:
        # IndexError is what the previous `.index[0]` lookup raised for unknown titles
        raise IndexError(f'Movie title not found: {movie_title!r}')

    # get movie similarities from the similarity matrix that was passed in
    movie_similarities = doc_sims[movie_idx]
    # get top similar movie IDs; drop the query movie by label instead of assuming
    # it is ranked first (ties at the top would otherwise remove the wrong entry)
    similar_movie_idxs = movie_similarities.drop(movie_idx).nlargest(top_n).index
    # return the titles of the top movies
    return [movies[i] for i in similar_movie_idxs]

Get popular Movie Recommendations

In [None]:
popular_movies = ['Minions', 'Interstellar', 'Deadpool', 'Jurassic World', 'Pirates of the Caribbean: The Curse of the Black Pearl',
              'Dawn of the Planet of the Apes', 'The Hunger Games: Mockingjay - Part 1', 'Terminator Genisys',
              'Captain America: Civil War', 'The Dark Knight', 'The Martian', 'Batman v Superman: Dawn of Justice',
              'Pulp Fiction', 'The Godfather', 'The Shawshank Redemption', 'The Lord of the Rings: The Fellowship of the Ring',
              'Harry Potter and the Chamber of Secrets', 'Star Wars', 'The Hobbit: The Battle of the Five Armies',
              'Iron Man']

In [None]:
for movie in popular_movies:
    print('Movie:', movie)
    print('Top 5 recommended Movies:', movie_recommender(movie_title=movie, movies=movies_list, doc_sims=doc_sim_df))
    print()

Movie: Minions
Top 5 recommended Movies: ['Despicable Me 2', 'Despicable Me', 'Stuart Little 2', 'Darling Companion', 'Teenage Mutant Ninja Turtles: Out of the Shadows']

Movie: Interstellar
Top 5 recommended Movies: ['Space Pirate Captain Harlock', 'Prometheus', 'Starship Troopers', 'Gattaca', 'Space Cowboys']

Movie: Deadpool
Top 5 recommended Movies: ['Shaft', 'Rabbit Hole', 'X-Men Origins: Wolverine', 'Victor Frankenstein', 'Underworld: Evolution']

Movie: Jurassic World
Top 5 recommended Movies: ['Jurassic Park', 'The Nut Job', 'The Lost World: Jurassic Park', "National Lampoon's Vacation", 'Vacation']

Movie: Pirates of the Caribbean: The Curse of the Black Pearl
Top 5 recommended Movies: ['Pirates of the Caribbean: On Stranger Tides', 'The Pirate', "Pirates of the Caribbean: Dead Man's Chest", 'The Pirates! In an Adventure with Scientists!', 'Space Pirate Captain Harlock']

Movie: Dawn of the Planet of the Apes
Top 5 recommended Movies: ['Battle for the Planet of the Apes', 'Gro

# Movie Recommendation with Embeddings

The FastText model considers each word as a bag of character n-grams. This is also called a subword model in the paper.

We add special boundary symbols `<` and `>` at the beginning and end of words. This enables us to distinguish prefixes and suffixes from other character sequences. We also include the word w itself in the set of its n-grams, to learn a representation for each word (in addition to its character n-grams). Taking the word `where` and n=3 (tri-grams) as an example, it will be represented by the character n-grams: `<wh, whe, her, ere, re>` and the special sequence `<where>` representing the whole word. Note that the sequence `<her>`, corresponding to the word `her`, is different from the tri-gram `her` from the word `where`.

Here we leverage `gensim` to build our embeddings

## Build the FastText embedding model here

Remember: more iterations usually produce better embeddings, but training will take longer depending on your system's CPU.

50 iterations might take 15-20 mins

### GET HELP HERE: LIVE CODING 4 EMBEDDING MODELS

In [None]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import gensim
from gensim.models import FastText

# One whitespace-split token list per normalized movie description.
tokenized_docs = [doc.split() for doc in norm_corpus]
# suggested settings: vector_size=300, window=30, min_count=2 or more, epochs=50 or more (use 10 if training takes too long)
ft_model = FastText(sentences=tokenized_docs, vector_size=300, window=30, min_count=2, epochs=50)

## Generate document level embeddings

Word embedding models give us an embedding for each word. To get a fixed-length document-level embedding for ML/DL tasks, we average the embeddings of all the words in a document.

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    """
    Average the word vectors of one tokenized document.

    Args:
    - words: Iterable of tokens making up the document.
    - model: Object exposing word vectors through `model.wv[token]`.
    - vocabulary: Collection of tokens to take into account; tokens outside
      it are skipped.
    - num_features: Dimensionality of the word vectors.

    Returns:
    - float64 array of length `num_features` holding the mean of the vectors
      of the in-vocabulary tokens, or all zeros when no token qualifies.
    """
    # Look up vectors only for tokens the vocabulary knows about.
    known_vectors = [model.wv[token] for token in words if token in vocabulary]

    mean_vector = np.zeros((num_features,), dtype="float64")
    # Accumulate one vector at a time into the float64 buffer.
    for vector in known_vectors:
        mean_vector += vector

    # Guard against division by zero for documents without any known token.
    if known_vectors:
        mean_vector /= len(known_vectors)

    return mean_vector


def averaged_word_vectorizer(corpus, model):
    """
    Build one averaged word vector per document of a tokenized corpus.

    Args:
    - corpus: Iterable of tokenized documents (each one a list of tokens).
    - model: Word vector model exposing `wv.index_to_key` and `wv.vector_size`.

    Returns:
    - NumPy array built from the per-document average vectors, in corpus order.
    """
    keyed_vectors = model.wv
    # A set gives constant-time membership checks while averaging.
    vocabulary = set(keyed_vectors.index_to_key)
    num_features = keyed_vectors.vector_size

    features = []
    for tokenized_sentence in corpus:
        features.append(
            average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
        )
    return np.array(features)

In [None]:
from gensim.models import word2vec

doc_vecs_ft = averaged_word_vectorizer(tokenized_docs, ft_model)
doc_vecs_ft.shape

(4800, 300)

## Get Movie Recommendations

We will leverage cosine similarity again to generate recommendations

In [None]:
doc_sim = cosine_similarity(doc_vecs_ft)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
0,1.0,-0.084741,0.020334,-0.200349,0.120167,-0.088189,0.281437,0.036801,0.084238,0.411134,...,0.319934,-0.117667,0.416158,0.307144,-0.125354,0.244295,0.236837,0.313998,0.209115,0.300784
1,-0.084741,1.0,0.406414,0.311343,0.329197,0.308626,0.125967,0.51643,0.057209,0.231598,...,0.00163,0.024845,0.182465,-0.096399,0.034125,-0.022995,-0.093175,-0.089651,0.031287,0.253836
2,0.020334,0.406414,1.0,0.300918,0.379465,0.017016,0.128263,0.437986,0.134841,0.116871,...,0.127309,0.180864,-0.022625,-0.141537,0.212874,-0.043094,0.329367,0.086255,0.251729,0.302082
3,-0.200349,0.311343,0.300918,1.0,0.172453,0.439421,-0.051572,0.115954,0.17655,-0.042833,...,0.120551,0.348162,0.038987,-0.165137,0.356629,-0.0103,0.183442,0.154099,0.088095,0.200235
4,0.120167,0.329197,0.379465,0.172453,1.0,-0.163417,0.208654,0.406327,0.053439,-0.180663,...,0.43287,0.26132,0.366544,-0.107299,0.413648,-0.152118,0.559825,0.238364,0.693851,0.537877


In [None]:
for movie in popular_movies:
    print('Movie:', movie)
    print('Top 5 recommended Movies:', movie_recommender(movie_title=movie, movies=movies_list, doc_sims=doc_sim_df))
    print()

Movie: Minions
Top 5 recommended Movies: ['Paul Blart: Mall Cop', 'Furry Vengeance', 'Shadow Conspiracy', 'Despicable Me', 'The Astronaut Farmer']

Movie: Interstellar
Top 5 recommended Movies: ['The Inhabited Island', 'Sea Rex 3D: Journey to a Prehistoric World', 'Alien', 'Antarctic Edge: 70° South', 'The Abyss']

Movie: Deadpool
Top 5 recommended Movies: ['Lucy', "The Caveman's Valentine", 'The Missing Person', 'The Men Who Stare at Goats', 'Iron Man 3']

Movie: Jurassic World
Top 5 recommended Movies: ['Jurassic Park', 'Walking With Dinosaurs', 'The Case of the Grinning Cat', 'One Day', 'Paul']

Movie: Pirates of the Caribbean: The Curse of the Black Pearl
Top 5 recommended Movies: ['Tycoon', 'Jungle Shuffle', 'Django Unchained', 'Troy', 'Khumba']

Movie: Dawn of the Planet of the Apes
Top 5 recommended Movies: ['Battle for the Planet of the Apes', 'The Darkest Hour', 'Soldier', 'Priest', 'Damnation Alley']

Movie: The Hunger Games: Mockingjay - Part 1
Top 5 recommended Movies: ['Ge