In [None]:
# Sample document used as the input text for keyword extraction in this notebook.
# The mixed indentation and trailing spaces inside the literal are part of the
# string value and are left exactly as written.
doc = """
        Despite its widespread lack of familiarity, 
        AI is a technology that is transforming every walk of life. 
        It is a wide-ranging tool that enables people to rethink how we integrate information, 
        analyze data, and use the resulting insights to improve decisionmaking. Our hope through 
        this comprehensive overview is to explain AI to an audience of policymakers, opinion leaders,
        and interested observers, and demonstrate how AI already is altering the world and raising important
        questions for society, the economy, and governance.

In this paper, we discuss novel applications in finance, national security, health care, 
criminal justice, transportation, and smart cities, and address issues such as data access problems,
algorithmic bias, AI ethics and transparency, and legal liability for AI decisions. We contrast the 
regulatory approaches of the U.S. and European Union, and close by making a number of recommendations 
for getting the most out of AI while still protecting important human values.
      """

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Candidate keywords are the unigrams of `doc` with English stop words removed.
n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# convert to a plain list of str to keep `candidates` the same type as before.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    # Older scikit-learn releases that predate get_feature_names_out().
    candidates = count.get_feature_names()

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used to embed both the
# document and every candidate keyword.
EMBEDDING_MODEL_NAME = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embed the whole document (as a one-element batch) ...
doc_embedding = model.encode([doc])
# ... and each candidate keyword.
candidate_embeddings = model.encode(candidates)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep.
top_n = 5

# Despite the name, `distances` holds cosine *similarities* between the
# document and each candidate (one row for the single document).
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions are the most similar
# candidates (listed from least to most similar).
ranked_idx = distances[0].argsort()
keywords = [candidates[position] for position in ranked_idx[-top_n:]]

In [None]:
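# Max Sum Similarity: among the candidates most similar to the document, pick the
# combination of keywords whose summed pairwise similarity is smallest, i.e. the
# set of keywords that are the most distant from (least redundant with) each other.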
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick the ``top_n`` mutually least-similar keywords among the best candidates.

    The ``nr_candidates`` words most similar to the document are shortlisted,
    then every ``top_n``-sized combination of that shortlist is scored by the
    sum of its pairwise cosine similarities; the combination with the lowest
    sum is returned.

    Args:
        doc_embedding: Embedding of the document, passed as the first argument
            to ``cosine_similarity`` (only its first row is used for ranking).
        word_embeddings: Embeddings of the candidate words, one per entry of
            ``words``.
        words: Candidate words, indexable by position.
        top_n: Number of keywords to return. Clamped to the size of the
            shortlist so that at least one combination always exists.
        nr_candidates: Size of the shortlist taken from the words most similar
            to the document.

    Returns:
        list: The selected words, in shortlist order (ascending similarity to
        the document).

    Note:
        The search enumerates every combination of the shortlist, so the cost
        grows combinatorially with ``nr_candidates``.
    """
    # Similarity of each word to the document, and of each word to every other
    # word. These use the function's own arguments rather than notebook globals.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    pairwise_similarity = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so they sit at the end).
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    pairwise_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # Asking for more keywords than the shortlist holds would yield no
    # combinations at all; fall back to returning the whole shortlist.
    top_n = min(top_n, len(words_idx))

    # Find the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            pairwise_similarity[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
# Pass the notebook's candidate embeddings/words: `word_embeddings` and `words`
# are only parameter names inside max_sum_sim and are never defined as
# notebook-level variables, so using them here raises NameError on a fresh kernel.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

In [None]:
#MMR tries to minimize redundancy and maximize the diversity of results in text summarization tasks.
import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance (MMR).

    Starts with the word most similar to the document, then repeatedly adds
    the remaining word that maximizes
    ``(1 - diversity) * sim(word, doc) - diversity * max sim(word, selected)``.

    Args:
        doc_embedding: Embedding of the document, passed as the second argument
            to ``cosine_similarity``. Assumed to hold a single row; the scoring
            below compares against a one-column array.
        word_embeddings: Embeddings of the candidate words, one per entry of
            ``words``.
        words: Candidate words, indexable by position.
        top_n: Maximum number of keywords to return.
        diversity: Weight of the redundancy penalty; 0 ranks purely by
            similarity to the document, larger values favor dissimilar words.

    Returns:
        list: Up to ``top_n`` words in selection order. Empty when ``top_n`` is
        not positive or there are no words; shorter than ``top_n`` when fewer
        words are available.
    """
    # Nothing to select: avoid calling argmax on an empty array.
    if top_n <= 0 or len(words) == 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate past the number of available words, otherwise the
    # candidate pool runs dry and argmax fails on an empty array.
    for _ in range(min(top_n, len(words)) - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (named mmr_scores so it does not shadow this function)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [None]:
# Use the notebook's candidate embeddings/words (`word_embeddings` and `words`
# exist only as parameter names inside mmr), and print what mmr actually
# returns rather than the `keywords` list computed by the plain top-n cell.
keywords_high_diversity = mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.7)
print(keywords_high_diversity)
keywords_low_diversity = mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.2)
print(keywords_low_diversity)