Pick a Plot

In [1]:
import pandas as pd

In [4]:
movie_plots_file = 'movie_plots.csv'

# Load the scraped plots and drop every row whose URL occurs more than once
# (keep=False discards all copies of a duplicated URL, not only the later ones).
# The result is assigned instead of using inplace=True, so the frame is never
# mutated after it has been created and the steps read as one chain.
df = (
    pd.read_csv(movie_plots_file, encoding='utf-8')
    .drop_duplicates(subset='url', keep=False)
)
df.head()

Unnamed: 0.1,Unnamed: 0,title,url,plot
0,0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,\nJonathan Rivers is an architect and lives wi...
1,1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"\nKen Carter lives in Richmond, California. He..."
2,2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"\nAfter being killed in Daredevil, Elektra Nat..."
3,3,Racing Stripes,https://en.wikipedia.org/wiki/Racing_Stripes,"\nDuring a thunderstorm, a traveling circus, C..."
4,4,Tom and Jerry: Blast Off to Mars,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,\nTom (voiced by Bill Kopp) chases Jerry as us...


In [None]:
# Plot of the first film whose title matches the pattern 'Interstellar'
# (Series.str.match anchors the regex at the start of the title).
movie_plot = df.loc[df['title'].str.match('Interstellar'), 'plot'].iloc[0]
movie_plot

Extract Keywords

In [None]:
!python -m spacy download en_core_web_md
import spacy
from spacy import displacy

In [None]:
# spaCy pipeline used by get_noun_phrases below; 'en_core_web_md' is the model
# fetched by the `spacy download` command in the previous cell.
nlp = spacy.load('en_core_web_md')

def get_noun_phrases(text):
    """Return the distinct noun phrases of ``text`` in order of first appearance.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each noun
    chunk's text is stripped of surrounding whitespace, and exact duplicates
    (after stripping) are dropped, keeping the first occurrence.

    Arguments:
        text: The raw document text to parse.

    Returns:
        list[str]: Unique noun-phrase strings, in document order.
    """
    doc = nlp(text)

    # dict keys keep insertion order, so this de-duplicates while preserving
    # the order of first appearance without re-scanning a list per chunk.
    return list(dict.fromkeys(chunk.text.strip() for chunk in doc.noun_chunks))

# Show every candidate phrase extracted from the selected plot.
print (get_noun_phrases(movie_plot))

In [9]:
# Keep the candidate phrases for the embedding steps and report how many there are.
np_movie = get_noun_phrases(movie_plot)
print(len(np_movie))

167


In [10]:
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by the keyword-extraction and
# document-similarity steps later in the notebook.
model = SentenceTransformer('all-MiniLM-L12-v2')

def get_doc_and_candidate_embeddings(model, doc, candidates):
    """Encode a document and its candidate phrases with the same model.

    ``doc`` is wrapped in a one-element list before being passed to
    ``model.encode``; ``candidates`` is passed to ``model.encode`` as given.
    The document is encoded first, then the candidates.

    Returns:
        tuple: ``(doc_embedding, candidate_embedding)``.
    """
    return model.encode([doc]), model.encode(candidates)

In [12]:
# Embed the selected plot together with its candidate noun phrases.
doc_embeds, keyphrase_embeds = get_doc_and_candidate_embeddings(
    model=model,
    doc=movie_plot,
    candidates=np_movie,
)

In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Cosine similarity of every candidate phrase to the document, flattened to a
# 1-D vector with one score per phrase.
keyphrase_doc_similarity = cosine_similarity(keyphrase_embeds, doc_embeds).flatten()
keyphrase_doc_similarity.shape

(167,)

In [None]:
# Phrase indices ordered from most to least similar to the document
# (ascending argsort, reversed).
keywords_idx = np.flip(np.argsort(keyphrase_doc_similarity))

print(keyphrase_doc_similarity[keywords_idx])

In [None]:
# Candidate phrases re-ordered by descending similarity to the document.
sorted_keywords = [np_movie[position] for position in keywords_idx]
print(sorted_keywords)

In [23]:
from typing import List, Tuple
import itertools
from operator import itemgetter

In [25]:
# https://maartengr.github.io/KeyBERT/api/mmr.html#keybert._model.KeyBERT.extract_keywords
def mmr(
    doc_embedding: np.ndarray,
    word_embeddings: np.ndarray,
    words: List[str],
    top_n: int = 5,
    diversity: float = 0.8,
) -> List[Tuple[str, float]]:
    """Select keywords by Maximal Marginal Relevance (MMR).

    The candidate most similar to the document is selected first. Every
    further pick maximises

        (1 - diversity) * sim(candidate, document)
            - diversity * max(sim(candidate, already selected keyword))

    so a higher ``diversity`` penalises candidates that resemble keywords
    that were already chosen.

    Arguments:
        doc_embedding: The document embedding. The index arithmetic below
                       assumes a single row, i.e. shape (1, dim).
        word_embeddings: The embeddings of the candidate keywords/keyphrases,
                         one row per entry of ``words``.
        words: The candidate keywords/keyphrases.
        top_n: The number of keywords/keyphrases to return.
        diversity: How diverse the selected keywords/keyphrases are.
                   Values between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.

    Returns:
        List[Tuple[str, float]]: At most ``top_n`` (keyword, similarity to the
        document) pairs, the similarity rounded to 4 decimals, sorted by
        descending similarity. Empty when ``top_n <= 0`` or ``words`` is empty.
    """
    # Nothing requested or nothing to choose from: argmax below would raise on
    # an empty similarity matrix, and top_n <= 0 must not yield a keyword.
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity between candidates and the document, and among candidates
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the candidate closest to the document
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(min(top_n - 1, len(words) - 1)):
        # Relevance of each remaining candidate, and its highest similarity
        # to any keyword selected so far
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(
            word_similarity[candidates_idx][:, keywords_idx], axis=1
        )

        # MMR score per remaining candidate (column vector, one row each)
        mmr_scores = (
            (1 - diversity) * candidate_similarities
            - diversity * target_similarities.reshape(-1, 1)
        )
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidate pool to the selection
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    # Pair each selected keyword with its document similarity, best first
    keywords = [
        (words[idx], round(float(word_doc_similarity[idx, 0]), 4))
        for idx in keywords_idx
    ]
    return sorted(keywords, key=itemgetter(1), reverse=True)

In [28]:
# Five keyphrases chosen with a high diversity weight.
mmr_keywords = mmr(
    doc_embedding=doc_embeds,
    word_embeddings=keyphrase_embeds,
    words=np_movie,
    top_n=5,
    diversity=0.8,
)
print(mmr_keywords)

[('-NASA pilot Cooper', 0.5708), ('Murph scientific data', 0.2713), ('blight', 0.2173), ('Government-run schools', 0.2145), ('massive tidal waves', 0.0725)]


In [29]:
def max_sum_distance(
    doc_embedding: np.ndarray,
    word_embeddings: np.ndarray,
    words: List[str],
    top_n: int,
    nr_candidates: int,
) -> List[Tuple[str, float]]:
    """Select keywords with the Max Sum Distance strategy.

    The ``nr_candidates`` words/phrases most similar to the document are taken
    as the candidate pool. Every ``top_n``-sized combination from that pool is
    scored by the sum of pairwise cosine similarities between its members, and
    the combination with the lowest sum (the least mutually similar one) wins;
    on ties the first combination generated is kept.

    Every combination is enumerated, so the cost grows with the number of
    ``top_n``-subsets of the pool; keep ``nr_candidates`` and ``top_n`` small.

    Arguments:
        doc_embedding: The document embedding; row 0 is used for the reported
                       similarities.
        word_embeddings: The embeddings of the candidate keywords/keyphrases,
                         one row per entry of ``words``.
        words: The candidate keywords/keyphrases.
        top_n: The number of keywords/keyphrases to return.
        nr_candidates: The size of the candidate pool to consider.

    Returns:
        List[Tuple[str, float]]: The selected (keyword, similarity to the
        document) pairs, the similarity rounded to 4 decimals, listed in
        order of non-decreasing document similarity. Empty when ``top_n``
        exceeds ``len(words)``.

    Raises:
        ValueError: If ``nr_candidates`` is smaller than ``top_n``.
    """
    if nr_candidates < top_n:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError(
            "Make sure that the number of candidates exceeds the number "
            "of keywords to return."
        )
    elif top_n > len(words):
        return []

    # Similarity of each word to the document, and between all pairs of words
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_words = cosine_similarity(word_embeddings, word_embeddings)

    # Candidate pool: the nr_candidates words closest to the document
    # (argsort is ascending, so the tail of the ordering holds the best ones)
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    candidates = distances_words[np.ix_(words_idx, words_idx)]

    def _pairwise_similarity(combination):
        # Sum over ordered pairs (i, j), i != j, within the combination
        return sum(
            candidates[i][j] for i in combination for j in combination if i != j
        )

    # min() keeps the first of equally scored combinations and needs no
    # magic upper-bound sentinel to start from.
    candidate = min(
        itertools.combinations(range(len(words_idx)), top_n),
        key=_pairwise_similarity,
    )

    return [
        (words_vals[idx], round(float(distances[0][words_idx[idx]]), 4))
        for idx in candidate
    ]

In [30]:
# Five keyphrases picked from the ten candidates closest to the document.
maxsum_keywords = max_sum_distance(
    doc_embedding=doc_embeds,
    word_embeddings=keyphrase_embeds,
    words=np_movie,
    top_n=5,
    nr_candidates=10,
)
print(maxsum_keywords)

[('Cooper', 0.368), ('the previous NASA explorer', 0.3908), ('NASA volunteers', 0.4211), ('a global famine', 0.4284), ('The famine', 0.463)]


Find Similar Documents

In [32]:
# Every plot in frame order, plus the plot of the first film whose title
# matches the pattern 'Batman Begins' (the query document).
all_plots = df['plot'].tolist()
batmanbegins_plot = df.loc[df['title'].str.match('Batman Begins'), 'plot'].iloc[0]

In [33]:
# Embed the query plot and the whole corpus with the shared model.
(batmanbegins_embedd, all_plots_embeds) = get_doc_and_candidate_embeddings(
    model, batmanbegins_plot, all_plots
)

In [37]:
# Similarity of the query plot to every plot, then positions ordered from most
# to least similar.
plot_similarity = cosine_similarity(batmanbegins_embedd, all_plots_embeds).flatten()
plots_idx = np.argsort(plot_similarity)[::-1]

# all_plots was built from df in row order, so positions map back through
# .iloc. One positional take on the title column replaces building a full row
# Series per film.
sorted_titles = df['title'].iloc[plots_idx].tolist()

print(sorted_titles[:10])

['Batman Begins', 'The Dark Knight', 'The Lego Batman Movie', 'The Batman vs. Dracula', 'Batman: The Killing Joke', 'Birds of Prey', 'Joker', 'Killshot', 'All Eyez on Me', 'Setup']
