In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import os, json, re, argparse, math
from pathlib import Path
from collections import Counter
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import spacy.cli
spacy_model = 'en_core_web_sm'
spacy.cli.download(spacy_model)
spacy_exclude = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']
nlp = spacy.load(spacy_model, exclude=spacy_exclude)
timelogger = utils.TimeLogger()
"""
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
year={2022},
pages={243-248},
publisher={SciTePress},
organization={INSTICC},
doi={10.5220/0011546600003335},
isbn={978-989-758-614-9},
issn={2184-3228},
}
"""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m116.2 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 


'\n@conference{schopf_etal_kdir22,\nauthor={Tim Schopf and Simon Klimek and Florian Matthes},\ntitle={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},\nbooktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},\nyear={2022},\npages={243-248},\npublisher={SciTePress},\norganization={INSTICC},\ndoi={10.5220/0011546600003335},\nisbn={978-989-758-614-9},\nissn={2184-3228},\n}\n'

In [2]:
database_name = '02_stg'
table_name = 'stg_filtered_work_chapters_methodology_single'
id_columns = ['work_id']
text_column_name = 'chapter_text'
text_batch_size = 1000

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0][0:100]

'As was noted earlier, we formed our dataset utilizing the existing relations present in SNOMED CT. T'

In [4]:
from top2vec import Top2Vec

# Create a Contextual Top2Vec model
top2vec_model = Top2Vec(documents=texts_only_list,
                        ngram_vocab=True,
                        contextual_top2vec=True)

2025-09-13 20:00:43,190 - top2vec - INFO - Pre-processing documents for training
2025-09-13 20:00:44,283 - top2vec - INFO - Creating vocabulary embedding
Embedding vocabulary: 100%|██████████| 30/30 [00:04<00:00,  6.79it/s]
2025-09-13 20:00:49,394 - top2vec - INFO - Create contextualized document embeddings
Embedding documents: 100%|██████████| 32/32 [00:05<00:00,  6.27it/s]
1000it [00:00, 6228.96it/s]
2025-09-13 20:00:58,720 - top2vec - INFO - Creating lower dimension embedding of documents
2025-09-13 20:01:25,665 - top2vec - INFO - Finding dense areas of documents
2025-09-13 20:01:25,849 - top2vec - INFO - Finding topics
Smoothing document token embeddings: 100%|██████████| 1000/1000 [00:04<00:00, 239.10it/s]
Calculating document topic distributions: 100%|██████████| 1000/1000 [00:00<00:00, 3626.91it/s]


In [5]:
top2vec_model.get_num_topics()

88

In [6]:
top2vec_model.get_topic_sizes()

(array([12278, 10966,  9748,  7952,  7732,  7462,  6623,  6484,  6196,
         5744,  5673,  5604,  5354,  5303,  5177,  5165,  5115,  4913,
         4708,  4650,  4454,  4339,  4127,  3936,  3783,  3267,  3233,
         3104,  3082,  3080,  2949,  2917,  2851,  2814,  2697,  2685,
         2600,  2566,  2561,  2471,  2471,  2410,  2299,  2271,  2253,
         2180,  2175,  2170,  2112,  2106,  2100,  2037,  1986,  1955,
         1921,  1920,  1815,  1793,  1764,  1700,  1684,  1657,  1657,
         1637,  1627,  1539,  1533,  1486,  1480,  1476,  1451,  1399,
         1365,  1357,  1343,  1343,  1148,  1136,  1068,  1062,  1050,
          988,   968,   853,   821,   781,   769,   744]),
 array([79, 53, 75,  0, 76, 85, 82, 62, 47, 51, 44, 29, 80, 74, 63, 84, 57,
        20, 55, 35, 70, 73, 13, 24, 68,  3, 37, 14,  1, 58, 87, 18, 46, 72,
        15, 81, 27, 56, 38, 11, 67, 22, 50,  9, 60,  2, 83, 30,  6,  7, 36,
        77, 71, 39, 43, 64, 23, 17, 41, 86, 65, 78, 31, 52, 34, 48, 28, 42

In [10]:
topic_words, word_scores, topic_nums = top2vec_model.get_topics()
topic_words[79], word_scores[79], topic_nums[79]

(array(['research methodology', 'qualitative research', 'action research',
        'qualitative data', 'grounded theory', 'research questions',
        'software measurement', 'present study', 'digital repository',
        'software engineering', 'software development',
        'provisioning model', 'scientific research', 'pilot study',
        'software system', 'design science', 'proposed methodology',
        'content analysis', 'questionnaire was', 'to analyze',
        'sub processes', 'case study', 'data sources', 'itbm ontology',
        'design modeling', 'web archiving', 'object oriented',
        'are summarized', 'this study', 'expert interviews',
        'evaluation metrics', 'engineering design',
        'security practitioners', 'google scholar', 'data collection',
        'icase tools', 'case studies', 'review process',
        'means clustering', 'to develop', 'knowledge architecture',
        'professional competence', 'information retrieval',
        'social science',

In [11]:
topic_words, word_scores, topic_nums = top2vec_model.get_topics()
topic_words[53], word_scores[53], topic_nums[53]

(array(['sensed images', 'feature extraction', 'optical flow',
        'extracted features', 'region fidelity', 'image enhancement',
        'semi supervised', 'saliency map', 'each pixel', 'medical images',
        'spatial resolution', 'satellite images', 'sparse coding',
        'feature maps', 'feature vectors', 'similarity measure',
        'remote sensing', 'pattern recognition', 'classification tasks',
        'intensity saliency', 'features among', 'brain images',
        'eye tracker', 'instance learning', 'sensory data',
        'conjugate features', 'binary mask', 'visual abstraction',
        'feature map', 'machine svm', 'nearest neighbor',
        'labeled dataset', 'feature selection', 'rgb based',
        'proposed algorithm', 'support vector', 'similarity scores',
        'cover image', 'deep multi', 'supervised classification',
        'clustering method', 'spectrum saliency', 'brain image',
        'convolutional layers', 'product features',
        'classification a

In [14]:
topic_words, word_scores, topic_scores, topic_nums = top2vec_model.search_topics(keywords=['research methodology'], num_topics=10)
topic_words, word_scores, topic_scores, topic_nums

([array(['research methodology', 'qualitative research', 'action research',
         'qualitative data', 'grounded theory', 'research questions',
         'software measurement', 'present study', 'digital repository',
         'software engineering', 'software development',
         'provisioning model', 'scientific research', 'pilot study',
         'software system', 'design science', 'proposed methodology',
         'content analysis', 'questionnaire was', 'to analyze',
         'sub processes', 'case study', 'data sources', 'itbm ontology',
         'design modeling', 'web archiving', 'object oriented',
         'are summarized', 'this study', 'expert interviews',
         'evaluation metrics', 'engineering design',
         'security practitioners', 'google scholar', 'data collection',
         'icase tools', 'case studies', 'review process',
         'means clustering', 'to develop', 'knowledge architecture',
         'professional competence', 'information retrieval',
         '

In [5]:
# sentence_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'
sentence_transformer_model_name = 'sentence-transformers/all-distilroberta-v1'

st_embed_model = SentenceTransformer(sentence_transformer_model_name)
timelogger.log('SentenceTransformer initialized')

 :: SentenceTransformer initialized | since_start: 4.0 hours, 8.0 minutes, 42.28 seconds | since_last: 4.0 hours, 8.0 minutes, 42.28 seconds :: 


' :: SentenceTransformer initialized | since_start: 4.0 hours, 8.0 minutes, 42.28 seconds | since_last: 4.0 hours, 8.0 minutes, 42.28 seconds :: '

In [6]:
timelogger.log('generic methodology phrases encoded START')
generic_methodology_phrases = [
    'research method',
    'research methodology',
    'methodological approach',
    'experimental study',
    'empirical evaluation',
    'case study',
    'simulation study',
    'measurement study',
    'formal proof'
]

generic_methodology_phrase_embeddings = st_embed_model.encode(generic_methodology_phrases)
timelogger.log('generic methodology phrases encoded END')
generic_methodology_phrase_embeddings

 :: generic methodology phrases encoded START | since_start: 4.0 hours, 8.0 minutes, 49.24 seconds | since_last: 6.95 seconds :: 
 :: generic methodology phrases encoded END | since_start: 4.0 hours, 8.0 minutes, 49.47 seconds | since_last: 0.23 seconds :: 


array([[-0.00535334, -0.00680362,  0.00489835, ...,  0.01841467,
        -0.03167962, -0.00509518],
       [-0.00022195,  0.00414018,  0.01022067, ..., -0.01404913,
        -0.03241679, -0.02545269],
       [ 0.01189186,  0.01842937,  0.02846571, ..., -0.03430161,
        -0.02436638, -0.03020131],
       ...,
       [-0.0322062 , -0.01856575,  0.02224941, ..., -0.03424372,
        -0.04298654, -0.09180257],
       [-0.02929969, -0.07373921, -0.00920677, ...,  0.01079472,
         0.03783653, -0.05652968],
       [ 0.03346386,  0.0116179 , -0.01428203, ..., -0.07280523,
        -0.08783865, -0.02877205]], dtype=float32)

In [7]:
timelogger.log('text encoded START')
text_list_embeddings = st_embed_model.encode(texts_only_list)
timelogger.log('text encoded END')
text_list_embeddings[0:2]

 :: text encoded START | since_start: 4.0 hours, 8.0 minutes, 53.39 seconds | since_last: 3.92 seconds :: 
 :: text encoded END | since_start: 4.0 hours, 9.0 minutes, 1.10 seconds | since_last: 7.72 seconds :: 


array([[ 0.04587457,  0.02019155,  0.01326159, ..., -0.10766116,
         0.05190924,  0.00696904],
       [-0.00781274, -0.03953061, -0.00649839, ..., -0.03321218,
        -0.0304147 , -0.00362403]], dtype=float32)

In [8]:
timelogger.log('initiate pos_vectorizer START')
pos_pattern = (
    # Verb-led methodological action (optional subject/aux/adv, main verb, noun phrase core, optional chained PPs)
    # we derive MIMO processing matrices
    # we compare the performance
    # we factor the MU MIMO precoding matrix
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' #+ '|'
    
    # Nominal methodological construct (adjective/noun/proper stacks + optional PP tails)
    # singular value decomposition
    # MIMO processing matrices
    # regularized block diagonal AF algorithm
    # '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' # + '|'
    
    # Metric/result short form
    # bit error rate
    # BER performance
    # SNR gain
    # '(<J.*>*<N.*>+)'
)
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    # pos_pattern=pos_pattern, # '<J.*>*<N.*>+',
    # min_df=1, # cutoff
)
timelogger.log('initiate pos_vectorizer END')

 :: initiate pos_vectorizer START | since_start: 4.0 hours, 9.0 minutes, 8.40 seconds | since_last: 7.30 seconds :: 
 :: initiate pos_vectorizer END | since_start: 4.0 hours, 9.0 minutes, 8.40 seconds | since_last: 0.00 seconds :: 


' :: initiate pos_vectorizer END | since_start: 4.0 hours, 9.0 minutes, 8.40 seconds | since_last: 0.00 seconds :: '

In [9]:
timelogger.log('initiate keybert_model START')
keybert_model = KeyBERT(model=st_embed_model)
timelogger.log('initiate keybert_model END')

 :: initiate keybert_model START | since_start: 4.0 hours, 9.0 minutes, 27.39 seconds | since_last: 18.98 seconds :: 
 :: initiate keybert_model END | since_start: 4.0 hours, 9.0 minutes, 27.39 seconds | since_last: 0.00 seconds :: 


' :: initiate keybert_model END | since_start: 4.0 hours, 9.0 minutes, 27.39 seconds | since_last: 0.00 seconds :: '

In [None]:
print(sentence_transformer_model_name)
timelogger.log('extract 10 keywords START')
initial_keywords_top10 = keybert_model.extract_keywords(  # TODO: consider using this with candidates=[...] for the next round
    docs=texts_only_list[0:100],
    top_n=10,
    vectorizer=pos_vectorizer,
    use_maxsum=True,
    nr_candidates=100
)
initial_keyword_embeddings = st_embed_model.encode([kw[0] for kw in initial_keywords_top10])
timelogger.log('extract 10 keywords END')
initial_keywords_top10

sentence-transformers/all-distilroberta-v1
 :: extract 10 keywords START | since_start: 4.0 hours, 9.0 minutes, 51.72 seconds | since_last: 24.34 seconds :: 


In [106]:
timelogger.log('extract 100 keywords START')
initial_keywords_top100 = keybert_model.extract_keywords(  # TODO: consider using this with candidates=[...] for the next round
    docs=texts_only_list,
    top_n=100,
    vectorizer=pos_vectorizer
)
initial_keyword_embeddings = st_embed_model.encode([kw[0] for kw in initial_keywords_top100])
timelogger.log('extract 100 keywords END')
initial_keywords_top100

 :: extract 100 keywords START | since_start: 1.0 hour, 14.0 minutes, 2.27 seconds | since_last: 0.38 seconds :: 
 :: extract 100 keywords END | since_start: 1.0 hour, 14.0 minutes, 8.74 seconds | since_last: 6.47 seconds :: 


[('assuming mu mimo channel', 0.5829),
 ('design mu mimo', 0.5529),
 ('design mimo', 0.5449),
 ('find optimum mimo processing matrices', 0.5336),
 ('perform mimo', 0.5299),
 ('limitations mimo processing matrices', 0.5292),
 ('resulting mimo processing matrices', 0.5119),
 ('jointly optimizes mimo processing matrices bs', 0.5002),
 ('using mimo channel matrices bs rn', 0.4936),
 ('consider mu mimo dl system', 0.4799),
 ('derive mimo processing matrices', 0.4728),
 ('factor mu mimo', 0.4699),
 ('rn mimo', 0.4653),
 ('design mimo processing matrices bs', 0.4631),
 ('extend mimo', 0.4596),
 ('estimate effective mimo matrix', 0.4501),
 ('using mimo channel matrix bs rn', 0.4466),
 ('function mimo processing matrix', 0.4409),
 ('multiple antennas', 0.4325),
 ('derive mimo processing matrices bs', 0.4223),
 ('derived mimo processing matrices bs', 0.4211),
 ('assuming mimo', 0.42),
 ('kth ut mimo', 0.4147),
 ('fading channels', 0.4047),
 ('minimize mui kth ut co - channel uts', 0.3998),
 ('pr

In [107]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

top_n = 10
pairwise_distances = euclidean_distances(initial_keyword_embeddings, generic_methodology_phrase_embeddings)
closest_distances_per_keyword = [[index, min(ds)] for index, ds in enumerate(pairwise_distances)]
sorted_closest_distances_per_keyword = sorted(closest_distances_per_keyword, key=lambda x: x[1])
keywords = [initial_keywords_top100[index][0] for (index, distance) in sorted_closest_distances_per_keyword]
keywords

['present results simulations',
 'estimates h',
 'proposed algorithm performance system',
 'perform resource allocation',
 'proposed system',
 'estimate h',
 'compare performance rbd af algorithm',
 'received power kth ut',
 'compare bit error rate',
 'describe relaying system',
 'estimate h 2,k f r k',
 'denotes number',
 'investigate different power allocation algorithms',
 'using following optimization',
 'using matrices f r',
 'design mimo',
 'provide information additive noise variances receivers transmitters',
 'describe antenna configuration system',
 'perform mimo',
 'design mu mimo',
 'optimize kth ut performance',
 'denotes kth ut',
 'channel estimation errors',
 'negligible performance loss',
 'provide reliable transmission',
 'estimate effective mimo matrix',
 'serving users',
 'multi - user',
 'receive matrices rn',
 'consider mu mimo dl system',
 'receive matrix rn',
 'minimizes mu interference',
 'kth ut mimo',
 'minimize mu interference',
 'denotes additive noise correl

In [70]:
closest_distances_per_keyword

[[0, 1.4426476], [1, 1.4687278]]

In [63]:
distances

array([[1.4426476, 1.4526964],
       [1.4687278, 1.4915861]], dtype=float32)