In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import os, json, re, argparse, math
from pathlib import Path
from collections import Counter
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import spacy.cli
spacy_model = 'en_core_web_sm'
spacy.cli.download(spacy_model)
spacy_exclude = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']
nlp = spacy.load(spacy_model, exclude=spacy_exclude)
timelogger = utils.TimeLogger()
"""
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
year={2022},
pages={243-248},
publisher={SciTePress},
organization={INSTICC},
doi={10.5220/0011546600003335},
isbn={978-989-758-614-9},
issn={2184-3228},
}
"""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.1 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 


'\n@conference{schopf_etal_kdir22,\nauthor={Tim Schopf and Simon Klimek and Florian Matthes},\ntitle={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},\nbooktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},\nyear={2022},\npages={243-248},\npublisher={SciTePress},\norganization={INSTICC},\ndoi={10.5220/0011546600003335},\nisbn={978-989-758-614-9},\nissn={2184-3228},\n}\n'

In [3]:
database_name = '02_stg'
table_name = 'stg_filtered_work_chapters_methodology_single'
id_columns = ['work_id']
text_column_name = 'chapter_text'
text_batch_size = 1000

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0][0:100]

'As was noted earlier, we formed our dataset utilizing the existing relations present in SNOMED CT. T'

In [4]:
from top2vec import Top2Vec

# Create a Contextual Top2Vec model
top2vec_model = Top2Vec(documents=texts_only_list,
                        ngram_vocab=True,
                        contextual_top2vec=True)

2025-09-21 17:36:00,554 - top2vec - INFO - Pre-processing documents for training
2025-09-21 17:36:01,669 - top2vec - INFO - Creating vocabulary embedding
Embedding vocabulary: 100%|██████████| 30/30 [00:36<00:00,  1.22s/it]
2025-09-21 17:36:38,778 - top2vec - INFO - Create contextualized document embeddings
Embedding documents: 100%|██████████| 32/32 [00:35<00:00,  1.10s/it]
1000it [00:00, 7291.47it/s]
2025-09-21 17:37:17,841 - top2vec - INFO - Creating lower dimension embedding of documents
2025-09-21 17:37:45,394 - top2vec - INFO - Finding dense areas of documents
2025-09-21 17:37:45,573 - top2vec - INFO - Finding topics
Smoothing document token embeddings: 100%|██████████| 1000/1000 [00:04<00:00, 246.96it/s]
Calculating document topic distributions: 100%|██████████| 1000/1000 [00:00<00:00, 3836.20it/s]


In [5]:
top2vec_model.get_num_topics()

88

In [6]:
top2vec_model.get_topic_sizes()

(array([17641, 10604,  9503,  7937,  7685,  7272,  6715,  6636,  6583,
         6236,  5603,  5600,  5125,  5113,  5025,  4456,  4379,  3984,
         3845,  3834,  3791,  3739,  3317,  3287,  3262,  3209,  2918,
         2915,  2875,  2825,  2744,  2734,  2720,  2696,  2667,  2644,
         2639,  2574,  2543,  2513,  2425,  2415,  2383,  2334,  2315,
         2288,  2256,  2241,  2209,  2125,  2056,  2034,  2024,  2022,
         1994,  1979,  1920,  1861,  1855,  1832,  1768,  1753,  1737,
         1670,  1627,  1614,  1607,  1598,  1581,  1514,  1476,  1429,
         1405,  1402,  1372,  1311,  1234,  1212,  1201,  1138,  1074,
         1028,  1020,  1013,  1010,   853,   769,   727]),
 array([52, 84, 87,  0, 83, 74, 76, 53, 49, 39, 77, 63, 70, 65, 67, 34, 26,
        31, 72, 78, 64, 66,  3, 75, 55, 41, 35, 82, 32,  8, 46,  9, 37, 50,
         1, 40, 56,  5,  6, 62, 28, 58, 43, 25, 29, 86, 15, 20, 68, 79, 51,
        11, 36, 48, 73, 69, 44, 22, 47, 85, 71, 24,  2, 19, 30, 80, 81, 14

In [10]:
topic_words, word_scores, topic_nums = top2vec_model.get_topics()
topic_words[79], word_scores[79], topic_nums[79]

(array(['research methodology', 'qualitative research', 'action research',
        'qualitative data', 'grounded theory', 'research questions',
        'software measurement', 'present study', 'digital repository',
        'software engineering', 'software development',
        'provisioning model', 'scientific research', 'pilot study',
        'software system', 'design science', 'proposed methodology',
        'content analysis', 'questionnaire was', 'to analyze',
        'sub processes', 'case study', 'data sources', 'itbm ontology',
        'design modeling', 'web archiving', 'object oriented',
        'are summarized', 'this study', 'expert interviews',
        'evaluation metrics', 'engineering design',
        'security practitioners', 'google scholar', 'data collection',
        'icase tools', 'case studies', 'review process',
        'means clustering', 'to develop', 'knowledge architecture',
        'professional competence', 'information retrieval',
        'social science',

In [11]:
topic_words, word_scores, topic_nums = top2vec_model.get_topics()
topic_words[53], word_scores[53], topic_nums[53]

(array(['sensed images', 'feature extraction', 'optical flow',
        'extracted features', 'region fidelity', 'image enhancement',
        'semi supervised', 'saliency map', 'each pixel', 'medical images',
        'spatial resolution', 'satellite images', 'sparse coding',
        'feature maps', 'feature vectors', 'similarity measure',
        'remote sensing', 'pattern recognition', 'classification tasks',
        'intensity saliency', 'features among', 'brain images',
        'eye tracker', 'instance learning', 'sensory data',
        'conjugate features', 'binary mask', 'visual abstraction',
        'feature map', 'machine svm', 'nearest neighbor',
        'labeled dataset', 'feature selection', 'rgb based',
        'proposed algorithm', 'support vector', 'similarity scores',
        'cover image', 'deep multi', 'supervised classification',
        'clustering method', 'spectrum saliency', 'brain image',
        'convolutional layers', 'product features',
        'classification a

In [14]:
topic_words, word_scores, topic_scores, topic_nums = top2vec_model.search_topics(keywords=['research methodology'], num_topics=10)
topic_words, word_scores, topic_scores, topic_nums

([array(['research methodology', 'qualitative research', 'action research',
         'qualitative data', 'grounded theory', 'research questions',
         'software measurement', 'present study', 'digital repository',
         'software engineering', 'software development',
         'provisioning model', 'scientific research', 'pilot study',
         'software system', 'design science', 'proposed methodology',
         'content analysis', 'questionnaire was', 'to analyze',
         'sub processes', 'case study', 'data sources', 'itbm ontology',
         'design modeling', 'web archiving', 'object oriented',
         'are summarized', 'this study', 'expert interviews',
         'evaluation metrics', 'engineering design',
         'security practitioners', 'google scholar', 'data collection',
         'icase tools', 'case studies', 'review process',
         'means clustering', 'to develop', 'knowledge architecture',
         'professional competence', 'information retrieval',
         '

In [12]:
from top2vec import Top2Vec

# Create a Contextual Top2Vec model
top2vec_model = Top2Vec(documents=texts_only_list,
                        ngram_vocab=True,
                        contextual_top2vec=False)

2025-09-21 18:00:04,186 - top2vec - INFO - Pre-processing documents for training
2025-09-21 18:00:06,119 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-09-21 18:00:07,471 - top2vec - INFO - Creating joint document/word embedding
2025-09-21 18:01:40,918 - top2vec - INFO - Creating lower dimension embedding of documents
2025-09-21 18:01:52,532 - top2vec - INFO - Finding dense areas of documents
2025-09-21 18:01:52,558 - top2vec - INFO - Finding topics


In [15]:
top2vec_model.get_num_topics()

2