In [38]:
# Read the entire document into one string; the extraction cells below all work on `full_text`.
with open('text.txt', mode='r', encoding='utf-8') as file:
    full_text = file.read()

In [39]:
import yake

# Ask YAKE for the 10 best keyphrases of the document.
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
keywords = kw_extractor.extract_keywords(full_text)

# Every result unpacks into a (phrase, score) pair; print one per line.
for phrase, score in keywords:
    print("Keyphrase: ", phrase, ": score", score)

Keyphrase:  Oraichain Academy : score 0.00509403510229188
Keyphrase:  Oraichain : score 0.012661539516886469
Keyphrase:  Oraichain Academy journal : score 0.0216785908371288
Keyphrase:  Academy : score 0.024160619431243494
Keyphrase:  managed Oraichain Team : score 0.02622825036167718
Keyphrase:  managed Oraichain : score 0.049338869717212734
Keyphrase:  distribute Oraichain : score 0.049338869717212734
Keyphrase:  Oraichain Team : score 0.049750667762045064
Keyphrase:  Organized and managed : score 0.05352915320746283
Keyphrase:  technology : score 0.05430182605321137


In [40]:
from multi_rake import Rake

# Apply RAKE to the document and show the first ten results.
rake = Rake()
keywords = rake.apply(full_text)
print(keywords[0:10])

[('create good content', 8.5), ('gain valuable knowledge', 8.0), ('main topic categories', 8.0), ('system design decisions', 8.0), ('managed oraichain team', 7.642857142857142), ('distribute oraichain’s research', 7.5), ('communicate oraichain’s technology', 6.571428571428571), ('important crypto events', 6.333333333333334), ('oraichain academy journal', 6.267857142857142), ('oraichain team', 4.642857142857142)]


In [48]:
from summa import keywords

# NOTE(review): this import rebinds the name `keywords` (a result list in the
# earlier cells) to the summa module until another cell assigns it again.
# Extract keywords together with their scores and show the first ten.
TR_keywords = keywords.keywords(full_text, scores=True)
print(TR_keywords[:10])

[('oraichain', 0.4148995819705745), ('technology', 0.20197149537045767), ('technologies', 0.20197149537045767), ('topics', 0.17754749033140257), ('topic', 0.17754749033140257), ('crypto', 0.1666449094774695), ('good', 0.1552571681022481), ('research', 0.1497464347703595), ('researchers', 0.1497464347703595), ('effects', 0.1461832029297981)]


In [42]:
from keybert import KeyBERT

# Extract the 10 best phrases of one to three words with KeyBERT.
kw_model = KeyBERT(model='all-mpnet-base-v2')
keywords = kw_model.extract_keywords(
    full_text,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    highlight=False,
    top_n=10,
)

# Building a dict from the result pairs and listing it keeps only the first
# element of each pair (the phrase), without repeats.
keywords_list = list(dict(keywords))
print(keywords_list)

['contribution oraichain ecosystem', 'oraichain research views', 'oraichain technology articles', 'oraichain technology', 'communicate oraichain technology', 'oracle blockchain ecosystem', 'oraichain ecosystem opinions', 'announcing purposes oraichain', 'valuable knowledge oraichain', 'distribute oraichain research']


In [43]:
from sklearn.feature_extraction.text import CountVectorizer

n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases: fit a vocabulary on the single document and
# use its terms as the keyword candidates.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([full_text])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(count, "get_feature_names_out"):
    # The replacement returns an ndarray; convert it so `candidates` stays a
    # plain list of str, as it was with the old method.
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [44]:
from sentence_transformers import SentenceTransformer

# Encode the candidate terms and the document with the same model so that the
# resulting vectors can be compared with each other.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
candidate_embeddings = model.encode(candidates)
# The document is wrapped in a one-element list, i.e. encoded as a batch of one.
doc_embedding = model.encode([full_text])

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 5

# Similarity between the document embedding and every candidate embedding.
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last `top_n` positions of the first row point
# at the candidates most similar to the document.
ranked = distances.argsort()[0]
keywords = [candidates[pos] for pos in ranked[-top_n:]]

In [46]:
# Display the current value of `keywords` as the cell output.
keywords

['innovation', 'researchers', 'scientists', 'blog', 'online']

In [47]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick ``top_n`` keywords that match the document but differ from each other.

    The ``nr_candidates`` words most similar to the document are shortlisted.
    Every ``top_n``-sized combination of that shortlist is then scored by the
    sum of its pairwise similarities, and the combination with the lowest sum
    is returned.

    Args:
        doc_embedding: 2-D document embedding; only its first row is used for
            ranking the words.
        word_embeddings: 2-D array with one embedding row per entry of ``words``.
        words: Candidate words, indexed by the row positions of
            ``word_embeddings``.
        top_n: Number of keywords to return.
        nr_candidates: Number of most-similar words to shortlist.

    Returns:
        list: The selected words, ordered by ascending similarity to the
        document.

    Raises:
        ValueError: If ``top_n`` is larger than the number of shortlisted words.
    """
    # Work on the arguments rather than on notebook globals, so the function
    # gives correct results for whatever document/candidate set is passed in.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Get the nr_candidates words most similar to the document (argsort is
    # ascending, so they sit at the end of the first row).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]

    # Without this guard no combination exists and the final comprehension
    # would fail with an unhelpful TypeError on None.
    if top_n > len(words_idx):
        raise ValueError(
            "top_n ({}) exceeds the number of shortlisted candidates ({})".format(
                top_n, len(words_idx)
            )
        )

    # Restrict the word-to-word similarity matrix to the shortlist.
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            distances_candidates[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]