In [49]:
# Read the whole input document into memory as one UTF-8 decoded string;
# every extractor below works on this same `full_text`.
with open('text.txt', mode='r', encoding='utf-8') as src:
    full_text = src.read()

In [50]:
import yake
# YAKE: keep the 10 best keyphrases, using the extractor's default stopword handling.
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
keywords = kw_extractor.extract_keywords(full_text)

# Each result is a (keyphrase, score) pair.
# NOTE(review): in the captured output the scores increase down the list, so a
# lower score appears to mean a more relevant phrase -- confirm against YAKE docs.
for phrase, score in keywords:
    print("Keyphrase: ", phrase, ": score", score)

Keyphrase:  Proof of Stake : score 0.0037218814086997045
Keyphrase:  Internet Age : score 0.004807572690886256
Keyphrase:  Proof of Work : score 0.005131495982340011
Keyphrase:  Proof : score 0.008464898304442928
Keyphrase:  blockchain : score 0.015710137894152452
Keyphrase:  global trending headlines : score 0.016165480365113997
Keyphrase:  timeline of Internet : score 0.016879318306396683
Keyphrase:  Delegated Proof : score 0.01707220240951316
Keyphrase:  Stake : score 0.020316235727219162
Keyphrase:  validators : score 0.022194780558451727


In [51]:
from multi_rake import Rake
# RAKE with default settings; `apply` returns the scored keyphrases for the text.
rake = Rake()
keywords = rake.apply(full_text)

# Only show the first ten entries to keep the cell output short.
print(keywords[0:10])

[('global trending headlines', 9.0), ('controversial talking point', 9.0), ('projects started rising', 9.0), ('promising effort-reward trade-off', 9.0), ('incremental holdings advocate', 9.0), ('strategic long-term vision', 9.0), ('oraichain’s all-important decision', 8.666666666666666), ('digital cash system', 8.5), ('newly mined tokens', 8.428571428571429), ('owns governance tokens', 8.428571428571429)]


In [52]:
from summa import keywords
# TextRank keywords via summa; `scores=True` makes each entry a (keyword, score) pair.
# Note: `keywords` here is the summa module imported just above, not an earlier result list.
TR_keywords = keywords.keywords(full_text, scores=True)

# Only show the first ten entries to keep the cell output short.
print(TR_keywords[:10])

[('validators', 0.22505974620611613), ('validator', 0.22505974620611613), ('validate', 0.22505974620611613), ('network', 0.19432832205840733), ('networks', 0.19432832205840733), ('blockchain', 0.18265259880646112), ('blockchains', 0.18265259880646112), ('transactions', 0.1785068786788457), ('transaction', 0.1785068786788457), ('mechanisms', 0.16482030487051402)]


In [53]:
from keybert import KeyBERT
# KeyBERT backed by the 'all-mpnet-base-v2' sentence-embedding model.
kw_model = KeyBERT(model='all-mpnet-base-v2')

# Ask for the 10 best candidates built from 1- to 3-word n-grams,
# with English stop words removed and no highlighted-document printout.
keywords = kw_model.extract_keywords(
    full_text,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    highlight=False,
    top_n=10,
)

# The result is a sequence of (phrase, score) pairs; going through a dict keeps
# the phrases in order (dropping any repeated phrase) and discards the scores.
keywords_list = list(dict(keywords))
print(keywords_list)

['blockchain validators', 'validators blockchain networks', 'validators blockchain', 'distributed ledger', 'blockchain validators staked', 'blockchain validators fact', 'participating blockchain validators', 'blockchains', 'block distributed ledger', 'importance validators blockchain']


In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Candidate phrases are single words (unigrams) with English stop words removed.
n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases: fit the vectorizer on the single document so
# its vocabulary is exactly the set of candidate terms found in `full_text`.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([full_text])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old name only on
# versions that predate the new one, and keep `candidates` a plain list of str
# either way so downstream indexing and encoding behave the same.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [44]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the document and every candidate term,
# so that both live in the same vector space.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# One embedding row for the whole document (wrapped in a list so the result is 2-D)...
# NOTE(review): transformer encoders usually truncate long inputs -- confirm that
# `full_text` fits the model's maximum sequence length, otherwise only its
# beginning is represented here.
doc_embedding = model.encode([full_text])

# ...and one embedding row per candidate term, in the same order as `candidates`.
candidate_embeddings = model.encode(candidates)

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep.
top_n = 5

# Row 0 holds the similarity of the document to every candidate term.
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions are the most similar
# candidates; the resulting list therefore ends with the best match.
keywords = [candidates[i] for i in distances[0].argsort()[-top_n:]]

In [46]:
# Display the current value of `keywords` as the cell's output.
keywords

['innovation', 'researchers', 'scientists', 'blog', 'online']

In [47]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Select ``top_n`` mutually dissimilar keywords from a shortlist of candidates.

    The ``nr_candidates`` words whose embeddings are most similar to the document
    are shortlisted. Every ``top_n``-sized combination of the shortlist is then
    scored by the sum of its pairwise cosine similarities, and the combination
    with the smallest sum (the most diverse one) is returned.

    Args:
        doc_embedding: 2-D array-like of document embeddings; only the first
            row is used.
        word_embeddings: 2-D array-like with one embedding row per entry of
            ``words``.
        words: Sequence of candidate words/phrases, aligned with
            ``word_embeddings``.
        top_n: Number of keywords to return. Values larger than the shortlist
            are clamped to the shortlist size.
        nr_candidates: Size of the shortlist taken from the words most similar
            to the document.

    Returns:
        A list of the selected words, ordered by increasing similarity to the
        document.
    """
    # Use the arguments, not the notebook globals: the previous version read
    # `candidate_embeddings` / `candidates` from the enclosing namespace, so the
    # `word_embeddings` and `words` parameters were silently ignored.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates words closest to the document (argsort is
    # ascending, so the tail of the ordering holds the best matches).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    # Restrict the word-to-word similarity matrix to the shortlist.
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Asking for more keywords than were shortlisted would yield no combinations
    # at all (and previously crashed on `None`); return the whole shortlist instead.
    top_n = min(top_n, len(words_idx))

    # Exhaustive search for the combination whose members are least similar to
    # each other. The number of combinations grows quickly with nr_candidates,
    # so keep the shortlist small.
    min_sim = np.inf
    candidate = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            distances_candidates[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]