In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [50]:
doc = """
         
Enjoy immersive sound effects with SONY WH-1000XM5 Bluetooth Headset with Mic. Packed with a host of features, this pair of headphones is designed with Bluetooth 5.2 support for unparalleled performance and significant stability. Thanks to its wireless feature, now you do not need to untangle your headphones every time you want to listen to songs. Furthermore, it comes with Auto Noise Canceling Optimizer which monitors the noise canceling performance by pursuing the wearing conditions of the headphones. The exclusively designed carrying case makes this gadget travel-friendly. Thinking about its outstanding features, the price of the SONY WH-1000XM5 Bluetooth Headset with Mic seems quite fair.

 

Good music and smiling are contagious, these can effectively elevate your mood. Say goodbye to stress with this SONY WH-1000XM5 Bluetooth Headset with Mic. Its exceptional clarity and intense sound effects make these headphones an absolute favorite of all music lovers. You can simply connect this pair of headphones to your phone, computer, tablet, or mp3 player. The packaging includes operating instructions for additional convenience. In a word, the SONY WH-1000XM5 Bluetooth Headset with Mic is a blend of sleek design and cutting-edge technology. So, what are you waiting for? Get the SONY WH-1000XM5 Bluetooth Headset with Mic online, now!
      """

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# n_gram_range = (1, 1)
n_gram_range = (1, 3)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names_out()

In [52]:
candidates

array(['1000xm5', '1000xm5 bluetooth', '1000xm5 bluetooth headset',
       'absolute', 'absolute favorite', 'absolute favorite music',
       'additional', 'additional convenience',
       'additional convenience word', 'auto', 'auto noise',
       'auto noise canceling', 'blend', 'blend sleek',
       'blend sleek design', 'bluetooth', 'bluetooth headset',
       'bluetooth headset mic', 'bluetooth support',
       'bluetooth support unparalleled', 'canceling',
       'canceling optimizer', 'canceling optimizer monitors',
       'canceling performance', 'canceling performance pursuing',
       'carrying', 'carrying case', 'carrying case makes', 'case',
       'case makes', 'case makes gadget', 'clarity', 'clarity intense',
       'clarity intense sound', 'comes', 'comes auto', 'comes auto noise',
       'computer', 'computer tablet', 'computer tablet mp3', 'conditions',
       'conditions headphones', 'conditions headphones exclusively',
       'connect', 'connect pair', 'connect pair

In [53]:

doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 50
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [55]:
keywords

['headset mic exceptional',
 'headphones time want',
 'phone computer tablet',
 'make headphones',
 'need untangle headphones',
 'immersive sound effects',
 'enjoy immersive sound',
 'conditions headphones exclusively',
 'good music',
 'headphones exclusively designed',
 'connect pair headphones',
 'mp3 player packaging',
 'wearing conditions headphones',
 'monitors noise canceling',
 'optimizer monitors noise',
 'computer tablet mp3',
 'headset mic online',
 'favorite music',
 'fair good music',
 'absolute favorite music',
 'convenience word sony',
 'headphones phone computer',
 'music smiling contagious',
 'wireless feature need',
 'headset mic blend',
 'wireless feature',
 'gadget travel friendly',
 'favorite music lovers',
 'effects make headphones',
 'sony wh 1000xm5',
 'good music smiling',
 'tablet mp3 player',
 'make headphones absolute',
 'technology waiting sony',
 'headphones absolute favorite',
 'sound effects sony',
 'bluetooth',
 'thanks wireless',
 'thanks wireless featu

In [56]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [57]:
max_sum_sim(
    doc_embedding=doc_embedding,
  word_embeddings=  candidate_embeddings,
    words=candidates,
    top_n=10,
    nr_candidates= 20)

['good music smiling',
 'tablet mp3 player',
 'make headphones absolute',
 'technology waiting sony',
 'headphones absolute favorite',
 'sound effects sony',
 'bluetooth',
 'stability thanks wireless',
 'wh 1000xm5 bluetooth',
 '1000xm5 bluetooth']

In [58]:
import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [59]:
mmr(
    doc_embedding=doc_embedding,
  word_embeddings=  candidate_embeddings,
    words=candidates,
    top_n=20,
    diversity= .5)

['1000xm5 bluetooth headset',
 'favorite music lovers',
 'monitors noise canceling',
 'stability thanks wireless',
 'technology waiting sony',
 'designed bluetooth support',
 'headphones exclusively designed',
 'tablet mp3 player',
 'bluetooth support unparalleled',
 'headphones designed bluetooth',
 'player packaging includes',
 'headset mic blend',
 'bluetooth headset mic',
 'sound effects sony',
 'convenience word sony',
 'bluetooth support',
 'sony wh 1000xm5',
 'wh 1000xm5 bluetooth',
 'headphones absolute favorite',
 '1000xm5 bluetooth']

In [1]:
import spacy
from thinc.api import set_gpu_allocator, require_gpu

# Use the GPU, with memory allocations directed via PyTorch.
# This prevents out-of-memory errors that would otherwise occur from competing
# memory pools.
set_gpu_allocator("pytorch")
require_gpu(0)

nlp = spacy.load("en_core_web_trf")
for doc in nlp.pipe(["some text", "some other text"]):
    tokvecs = doc._.trf_data.tensors[-1]

ValueError: Cannot use GPU, CuPy is not installed