In [1]:
import fasttext
import fasttext.util
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle('../data/filtered_nodes.pkl')

In [3]:
title_list = list(df['title'])
abstract_list = list(df['abstract'])
keywords_list = list(df['keywords'])

# fasttext

## To download fasttext pretrained model

In [4]:
# fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')



## embedding function

In [5]:
def word_embedding(model, keywords_list):
    embedding = [ np.array(list(map(model.get_word_vector, keywords))) for keywords in keywords_list ]
    return embedding

def sentence_embedding(model, sentence_list):
    embedding = [ model.get_sentence_vector(' '.join(sentence.split())) for sentence in sentence_list ]
    return embedding

In [6]:
keywords_embedding_list = word_embedding(model, keywords_list)

In [7]:
keywords_embedding_mean_list = [ np.mean(keywords_embedding, axis=0) for keywords_embedding in keywords_embedding_list]

In [8]:
title_embedding_list = sentence_embedding(model, title_list)

# BERT

In [17]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
model_name = 'bert-base-uncased'  # You can choose other BERT variations as well
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [24]:
def get_bert_embedding(text, bert_tokenizer, bert_model, top_n=20):
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)  # Adjust max_length as needed

    with torch.no_grad():
        outputs = bert_model(**inputs)
        embeddings = outputs.last_hidden_state
        
        
    # Rank tokens based on the norm of embeddings
    norms = torch.norm(embeddings, p=2, dim=2)
    ranked_indices = norms.argsort(descending=True).squeeze(0)
    ranked_tokens = [ bert_tokenizer.convert_ids_to_tokens(idx.item()) for idx in ranked_indices ]
    
    # Filtering and post-processing
    keywords = []
    for token in ranked_tokens:
        if token.startswith("##"):
            continue
        keywords.append(token)
        if len(keywords) == top_n:
            break
    
    return keywords

In [27]:
# get_bert_embedding(abstract_list[0], tokenizer, model)
abstract_keywords_list = [ get_bert_embedding(abstract, tokenizer, model) for abstract in abstract_list ]

In [33]:
# print(len(abstract_keywords_list))
# print(abstract_keywords_list[555])
# abstract_embedding_list = word_embedding(abstract_keywords_list)


get_bert_embedding(abstract_list[0], tokenizer, model)

148061
['[unused37]', '[unused79]', '[unused80]', '[unused11]', '[unused55]', '[unused54]', '[unused67]', '[unused56]', '[unused10]', '[unused64]', '[unused52]', '[unused53]', '[unused39]', '[unused66]', '[unused68]', '[unused38]', '[unused46]', '[unused174]', '[unused96]', '[unused65]']


['[unused129]',
 '[unused94]',
 '[unused130]',
 '[unused110]',
 '[unused100]',
 '[unused111]',
 '[unused131]',
 '[unused154]',
 '[unused84]',
 '[unused38]',
 '[unused95]',
 '[unused171]',
 '[unused2]',
 '[unused156]',
 '[unused102]',
 '[unused175]',
 '[unused112]',
 '[unused23]',
 '[unused128]',
 '[MASK]']

## KeyBERT

In [34]:
from keybert import KeyBERT

In [55]:
import numpy as np

keywords_ls = []
means = []
kw_model = KeyBERT()

for idx, abstract in enumerate(abstract_list[:100]):
    keywords = kw_model.extract_keywords(abstract, top_n=20, use_mmr=True)
    keywords_ls.append(keywords)
    
    if idx % 10 == 0:
        print(f"[{idx}] {keywords}")
        
    scores = list(map(float, np.array(keywords)[:, 1]))
    mean_score = np.median(scores)
    means.append(mean_score)
    
np.median(means)

[0] [('optimization', 0.3496), ('fuzzy', 0.3013), ('optimal', 0.2961), ('objective', 0.274), ('algorithm', 0.2718), ('objectives', 0.2717), ('facilities', 0.2028), ('station', 0.1872), ('locations', 0.1841), ('location', 0.181), ('stations', 0.181), ('areas', 0.1798), ('combination', 0.1795), ('decision', 0.1747), ('multi', 0.1639), ('genetic', 0.1609), ('derbyshire', 0.1426), ('risk', 0.1421), ('application', 0.1392), ('determine', 0.1337)]
[10] [('tensors', 0.4823), ('tensor', 0.4633), ('hyperspectral', 0.4504), ('spectral', 0.396), ('classification', 0.379), ('dimensionality', 0.3598), ('locality', 0.3264), ('supervised', 0.3139), ('discriminative', 0.2908), ('features', 0.2783), ('multilinear', 0.2643), ('imagery', 0.2526), ('learning', 0.2205), ('manifold', 0.2187), ('spectrometer', 0.2178), ('feature', 0.212), ('spatial', 0.2065), ('infrared', 0.2036), ('images', 0.1914), ('imaging', 0.1804)]
[20] [('flow', 0.4234), ('motion', 0.421), ('tracking', 0.4194), ('segmenter', 0.3624), 

0.240975

In [81]:
keywords_from_abstract = []
threshold = 0.3
kw_model = KeyBERT()

for abstract in abstract_list:
    keywords = kw_model.extract_keywords(abstract, top_n=20, use_mmr=True)
    filtered_keywords = [ keyword[0] for keyword in keywords if keyword[1] > threshold ]
    keywords_from_abstract.append(filtered_keywords)

[0] ['optimization', 'fuzzy']
[10] ['tensors', 'hyperspectral', 'spectral', 'classification', 'dimensionality', 'locality']
[20] ['flow', 'motion', 'tracking', 'segmenter', 'background']
[30] ['recognition', 'volumetric', 'decomposition']
[40] ['saliency', 'carving']
[50] ['segmentation', 'videos', 'surveillance', 'frames', 'descriptor', 'clip']
[60] ['equalizer', 'interference', 'channels']
[70] ['gene', 'networks', 'digraph']
[80] ['geostatistics', 'gstat', 'gis', 'variogram']
[90] ['neural', 'computational', 'dimensional', 'recurrent', 'iterated', 'networks', 'piecewise', 'universal', 'linear']


In [82]:
import numpy as np
from joblib import Parallel, delayed

def process_abstract(idx, abstract, kw_model, threshold):
    keywords = kw_model.extract_keywords(abstract, top_n=20, use_mmr=True)
    filtered_keywords = [keyword[0] for keyword in keywords if keyword[1] > threshold]
    
    return idx, filtered_keywords

threshold = 0.25
kw_model = KeyBERT()
results = Parallel(n_jobs=-1)(delayed(process_abstract)(idx, abstract, kw_model, threshold) for idx, abstract in enumerate(abstract_list))

keywords_ls = []

for idx, filtered_keywords in results:
    keywords_ls.append(filtered_keywords)
    
    if idx % 10 == 0:
        print(f"[{idx}] {filtered_keywords}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [69]:
np.median(means)
np.quantile(means, 0.9)

0.30981499999999995

In [None]:
rand = 3232
similarity = np.dot(keywords_embedding_mean_list[rand], title_embedding_list[rand]) / (np.linalg.norm(keywords_embedding_mean_list[rand]) * np.linalg.norm(title_embedding_list[rand]))
similarity

0.5314725

In [None]:
similarity = np.dot(abstract_embedding_list[rand], title_embedding_list[rand]) / (np.linalg.norm(abstract_embedding_list[rand]) * np.linalg.norm(title_embedding_list[rand]))
similarity

In [None]:
similarity = np.dot(abstract_embedding_list[rand], keywords_embedding_mean_list[rand]) / (np.linalg.norm(abstract_embedding_list[rand]) * np.linalg.norm(keywords_embedding_mean_list[rand]))
similarity