In [3]:
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
from transformers import DistilBertModel, DistilBertConfig
from keybert import KeyBERT 
# https://maartengr.github.io/KeyBERT/guides/embeddings.html


In [10]:
# Sample input document for the keyword-extraction steps below: a passage
# about social cognition. Every later cell reads it through the name `doc`.
doc = """
         Social cognition concerns the various psychological processes that enable individuals to take advantage of being part of a social group. Of major importance to social cognition are the various social signals that enable us to learn about the world. Such signals include facial expressions, such as fear and disgust, which warn us of danger, and eye gaze direction, which indicate where interesting things can be found. Such signals are particularly important in infant development. Social referencing, for example, refers to the phenomenon in which infants refer to their mothers' facial expressions to determine whether or not to approach a novel object. We can learn a great deal simply by observing others. Much of this signalling seems to happen automatically and unconsciously on the part of both the sender and the receiver. We can learn to fear a stimulus by observing the response of another, in the absence of awareness of that stimulus. By contrast, learning by instruction, rather than observation, does seem to depend upon awareness of the stimulus, since such learning does not generalize to situations where the stimulus is presented subliminally. Learning by instruction depends upon a meta-cognitive process through which both the sender and the receiver recognize that signals are intended to be signals. An example would be the ‘ostensive’ signals that indicate that what follows are intentional communications. Infants learn more from signals that they recognize to be instructive. I speculate that it is this ability to recognize and learn from instructions rather than mere observation which permitted that advanced ability to benefit from cultural learning that seems to be unique to the human race.
      """

## 1. extract key-phrases as candidates

In [6]:
# Use CountVectorizer to split the document into candidate keywords and
# key-phrases; the n-gram range sets how many words a candidate may span.

n_gram_range = (1, 3)  # candidates are 1- to 3-word n-grams
stop_words = "english"  # use scikit-learn's built-in English stop-word list

count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it back to a plain list of str to keep `candidates` the same type
# the later cells were written against.
candidates = count.get_feature_names_out().tolist()
print(candidates)

In [None]:
# Alternative candidate extraction: per the keyphrase-vectorizers package,
# key-phrases are selected without having to specify an n-gram range.
# NOTE: running this cell assigns `candidates`, replacing the n-gram
# candidates produced by the CountVectorizer cell above.

# Init default vectorizer.
vectorizer = KeyphraseCountVectorizer()

# Print parameters
# print(vectorizer.get_params())

# Fit to learn the key-phrases of the document.
fitted_vectorizer = vectorizer.fit([doc])

# After learning the keyphrases, they can be returned.
# The result used to be stored under the misspelled name `candicates`, so the
# later cells never saw it. get_feature_names() is also replaced by
# get_feature_names_out(), the accessor that survives scikit-learn >= 1.2;
# .tolist() keeps `candidates` a plain list of str.
candidates = fitted_vectorizer.get_feature_names_out().tolist()


## 2. select embedding models

In [17]:

# Here we use a DistilBERT model from sentence-transformers. It is lightweight.
# Initialise the model.
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embedding: encode the document and every candidate phrase with the same
# model, so that a later cell can compare them with cosine similarity.
# The document is wrapped in a one-element list; `candidates` is passed as is.
doc_embedding = sentence_model.encode([doc])
candidate_embeddings = sentence_model.encode(candidates)

In [18]:
# Score every candidate by its cosine similarity to the document and keep the
# n best-scoring candidates.
n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# The argsort is ascending, so the last n positions of the (single) document
# row are the n highest scores; `keywords` is therefore ordered from the
# lowest to the highest score among those n.
keywords = [candidates[position] for position in np.argsort(distances[0])[-n:]]

## 3. Max Sum Similarity

In [20]:
# To diversify the results, we take the 2 x top_n words/phrases most similar to the document (the function below takes this pool size as nr_candidates). Then we take all top_n-sized combinations from that pool and extract the combination whose members are the least similar to each other by cosine similarity.

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Select ``top_n`` words that match the document yet differ from each other.

    The ``nr_candidates`` words most similar to the document are shortlisted,
    then every ``top_n``-sized combination of that shortlist is scored by the
    sum of its pairwise cosine similarities; the lowest-scoring (most diverse)
    combination is returned.

    Args:
        doc_embedding: Embedding of the document, passed as the first argument
            to ``cosine_similarity`` (only row 0 of the result is used).
        word_embeddings: Embeddings of the candidate words, one row per entry
            of ``words`` and in the same order.
        words: Candidate words/phrases, indexable by integer position.
        top_n: Number of words to return.
        nr_candidates: Size of the shortlist the combinations are drawn from.

    Returns:
        A list of ``top_n`` entries of ``words``; an empty list when ``top_n``
        exceeds the number of available words.

    Raises:
        ValueError: If ``nr_candidates`` is smaller than ``top_n``, because no
            combination of the requested size can be formed.
    """
    if nr_candidates < top_n:
        raise ValueError(
            "nr_candidates must be greater than or equal to top_n "
            f"(got nr_candidates={nr_candidates}, top_n={top_n})"
        )
    if top_n > len(words):
        # Fewer words than requested: there is no combination to choose from.
        return []

    # Use the arguments, not the notebook-level `candidate_embeddings` and
    # `candidates` globals the previous version silently depended on.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist: the argsort is ascending, so the tail holds the best matches.
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    # Restrict the word-to-word matrix to the shortlisted rows and columns.
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Find the combination of words that are the least similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        # Ordered pairs of distinct members, i.e. every off-diagonal entry.
        sim = sum(
            distances_candidates[i][j]
            for i, j in itertools.permutations(combination, 2)
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

## 4. choose code

In [38]:
# Self-contained KeyBERT version: it re-creates the vectorizer and the
# sentence model (same class and model name as in the earlier cells) instead
# of reusing the objects built there.
vectorizer = KeyphraseCountVectorizer()
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
kw_model = KeyBERT(model=sentence_model)
# Last expression of the cell, so its return value is displayed: with top_n=5
# the recorded output below is a list of five (phrase, score) tuples.
kw_model.extract_keywords(doc, vectorizer=vectorizer,  top_n=5)

[('various social signals', 0.4425),
 ('social cognition', 0.4335),
 ('cultural learning', 0.4112),
 ('various psychological processes', 0.4108),
 ('social referencing', 0.3608)]