In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import os, json, re, argparse, math
from pathlib import Path
from collections import Counter
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import spacy.cli
spacy_model = 'en_core_web_sm'
spacy.cli.download(spacy_model)
spacy_exclude = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']
nlp = spacy.load(spacy_model, exclude=spacy_exclude)
timelogger = utils.TimeLogger()
"""
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
year={2022},
pages={243-248},
publisher={SciTePress},
organization={INSTICC},
doi={10.5220/0011546600003335},
isbn={978-989-758-614-9},
issn={2184-3228},
}
"""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m153.8 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 


'\n@conference{schopf_etal_kdir22,\nauthor={Tim Schopf and Simon Klimek and Florian Matthes},\ntitle={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},\nbooktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},\nyear={2022},\npages={243-248},\npublisher={SciTePress},\norganization={INSTICC},\ndoi={10.5220/0011546600003335},\nisbn={978-989-758-614-9},\nissn={2184-3228},\n}\n'

## Methodology Chapter

In [3]:
database_name = '02_stg'
table_name = 'stg_filtered_work_chapters_methodology_single'
id_columns = ['work_id']
text_column_name = 'chapter_text'

text_batch_size = 3

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    OFFSET 
        100
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0][0:100]

'In age estimation, the most popular performance measurement is the mean absolute error (MAE), which '

In [4]:
# sentence_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'
sentence_transformer_model_name = 'sentence-transformers/all-distilroberta-v1'
st_embed_model = SentenceTransformer(sentence_transformer_model_name)

In [23]:
timelogger.log('text encoded START')
text_list_embeddings = st_embed_model.encode(texts_only_list)
timelogger.log('text encoded END')
text_list_embeddings[0:2]

 :: text encoded START | since_start: 3.0 hours, 50.0 minutes, 13.76 seconds | since_last: 2.0 hours, 52.0 minutes, 18.85 seconds :: 
 :: text encoded END | since_start: 3.0 hours, 50.0 minutes, 14.09 seconds | since_last: 0.33 seconds :: 


array([[-0.0093935 ,  0.00672289, -0.0115801 , ..., -0.00489294,
        -0.00844138,  0.01074106],
       [ 0.01454419, -0.04212667, -0.00838728, ..., -0.04204104,
        -0.00282833, -0.02174634]], dtype=float32)

In [24]:
timelogger.log('initiate pos_vectorizer START')
pos_pattern = (
    # Verb-led methodological action (optional subject/aux/adv, main verb, noun phrase core, optional chained PPs)
    # we derive MIMO processing matrices
    # we compare the performance
    # we factor the MU MIMO precoding matrix
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' #+ '|'
    
    # Nominal methodological construct (adjective/noun/proper stacks + optional PP tails)
    # singular value decomposition
    # MIMO processing matrices
    # regularized block diagonal AF algorithm
    # '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' # + '|'
    
    # Metric/result short form
    # bit error rate
    # BER performance
    # SNR gain
    # '(<J.*>*<N.*>+)'
)
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    # pos_pattern=pos_pattern, # '<J.*>*<N.*>+',
    # min_df=1, # cutoff
)
timelogger.log('initiate pos_vectorizer END')

 :: initiate pos_vectorizer START | since_start: 3.0 hours, 50.0 minutes, 14.91 seconds | since_last: 0.82 seconds :: 
 :: initiate pos_vectorizer END | since_start: 3.0 hours, 50.0 minutes, 14.91 seconds | since_last: 0.00 seconds :: 


' :: initiate pos_vectorizer END | since_start: 3.0 hours, 50.0 minutes, 14.91 seconds | since_last: 0.00 seconds :: '

In [8]:
timelogger.log('initiate keybert_model START')
keybert_model = KeyBERT(model=st_embed_model)
timelogger.log('initiate keybert_model END')

 :: initiate keybert_model START | since_start: 7.33 seconds | since_last: 0.01 seconds :: 
 :: initiate keybert_model END | since_start: 7.33 seconds | since_last: 0.00 seconds :: 


' :: initiate keybert_model END | since_start: 7.33 seconds | since_last: 0.00 seconds :: '

In [9]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
)
timelogger.log('extract keywords | Default | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default | START | since_start: 7.34 seconds | since_last: 0.01 seconds :: 
 :: extract keywords | Default | END | since_start: 8.20 seconds | since_last: 0.86 seconds :: 


[[('age', 0.3123),
  ('ages', 0.3058),
  ('label', 0.2091),
  ('estimation', 0.1949),
  ('estimated', 0.1192)],
 [('gpu', 0.4938),
  ('cpu', 0.3882),
  ('cpu2006', 0.3607),
  ('amd', 0.3017),
  ('benchmarks', 0.2952)],
 [('questionnaire', 0.336),
  ('survey', 0.3206),
  ('descriptive', 0.2658),
  ('structured', 0.265),
  ('quantitative', 0.2584)]]

In [10]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
)
timelogger.log('extract keywords | Default Top20 | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 | START | since_start: 8.21 seconds | since_last: 0.01 seconds :: 
 :: extract keywords | Default Top20 | END | since_start: 9.07 seconds | since_last: 0.87 seconds :: 


[[('age', 0.3123),
  ('ages', 0.3058),
  ('label', 0.2091),
  ('estimation', 0.1949),
  ('estimated', 0.1192),
  ('mean', 0.0881),
  ('predicted', 0.0786),
  ('method', 0.0744),
  ('ldl', 0.071),
  ('images', 0.0679),
  ('algorithm', 0.0675),
  ('lld', 0.066),
  ('measurement', 0.0633),
  ('methods', 0.056),
  ('sdm', 0.0456),
  ('triangle', 0.045),
  ('figure', 0.0443),
  ('degrees', 0.0436),
  ('gaussian', 0.0335),
  ('distribution', 0.0262)],
 [('gpu', 0.4938),
  ('cpu', 0.3882),
  ('cpu2006', 0.3607),
  ('amd', 0.3017),
  ('benchmarks', 0.2952),
  ('benchmark', 0.2919),
  ('radeon', 0.2751),
  ('x86', 0.2554),
  ('cache', 0.2554),
  ('processor', 0.2542),
  ('compute', 0.2307),
  ('cores', 0.2213),
  ('performance', 0.2094),
  ('spec', 0.2006),
  ('gpus', 0.1865),
  ('memory', 0.1749),
  ('throughput', 0.1744),
  ('buffers', 0.1661),
  ('5870', 0.1629),
  ('sdram', 0.1625)],
 [('questionnaire', 0.336),
  ('survey', 0.3206),
  ('descriptive', 0.2658),
  ('structured', 0.265),
  ('qu

In [11]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
)
timelogger.log('extract keywords | Default Top20 MaxSum | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum | START | since_start: 9.08 seconds | since_last: 0.01 seconds :: 
 :: extract keywords | Default Top20 MaxSum | END | since_start: 9.92 seconds | since_last: 0.84 seconds :: 


[[('distribution', 0.0262),
  ('gaussian', 0.0335),
  ('degrees', 0.0436),
  ('figure', 0.0443),
  ('triangle', 0.045),
  ('sdm', 0.0456),
  ('methods', 0.056),
  ('measurement', 0.0633),
  ('lld', 0.066),
  ('algorithm', 0.0675),
  ('images', 0.0679),
  ('ldl', 0.071),
  ('method', 0.0744),
  ('predicted', 0.0786),
  ('mean', 0.0881),
  ('estimated', 0.1192),
  ('estimation', 0.1949),
  ('label', 0.2091),
  ('ages', 0.3058),
  ('age', 0.3123)],
 [('sdram', 0.1625),
  ('5870', 0.1629),
  ('buffers', 0.1661),
  ('throughput', 0.1744),
  ('memory', 0.1749),
  ('gpus', 0.1865),
  ('spec', 0.2006),
  ('performance', 0.2094),
  ('cores', 0.2213),
  ('compute', 0.2307),
  ('processor', 0.2542),
  ('cache', 0.2554),
  ('x86', 0.2554),
  ('radeon', 0.2751),
  ('benchmark', 0.2919),
  ('benchmarks', 0.2952),
  ('amd', 0.3017),
  ('cpu2006', 0.3607),
  ('cpu', 0.3882),
  ('gpu', 0.4938)],
 [('method', 0.1307),
  ('studied', 0.1335),
  ('researches', 0.1395),
  ('conclusions', 0.1424),
  ('compos

In [12]:
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV | START | since_start: 9.93 seconds | since_last: 0.01 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV | END | since_start: 10.77 seconds | since_last: 0.84 seconds :: 


[[('lld', 0.066),
  ('popular performance measurement', 0.0673),
  ('algorithm', 0.0675),
  ('ldl', 0.071),
  ('respective features', 0.0808),
  ('following ldl', 0.0916),
  ('original ldl', 0.1479),
  ('test images', 0.1614),
  ('triangle distribution', 0.197),
  ('authentic age', 0.2033),
  ('lld method', 0.2097),
  ('p(y|x', 0.2288),
  ('ldl methods', 0.232),
  ('chronological age', 0.2663),
  ('mean absolute error', 0.2967),
  ('ages', 0.3058),
  ('age', 0.3123),
  ('multi - label distribution', 0.3193),
  ('age estimation', 0.4403),
  ('age label distribution', 0.5354)],
 [('different performance requirements', 0.4671),
  ('gpu applications', 0.4736),
  ('internal gpu caches', 0.4904),
  ('gpu', 0.4938),
  ('cpu caches', 0.4964),
  ('gpu relative', 0.5036),
  ('gpu speedup', 0.5105),
  ('graphics performance benchmarks', 0.5192),
  ('cpu benchmark', 0.5234),
  ('cpu benchmarks', 0.532),
  ('gpu performance', 0.5433),
  ('gpu traces', 0.5437),
  ('gpu memory accesses', 0.5502),
  (

In [13]:
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    pos_pattern='(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)',
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | START | since_start: 10.78 seconds | since_last: 0.01 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | END | since_start: 11.38 seconds | since_last: 0.60 seconds :: 


[[],
 [('shows mpki cpu benchmark', 0.3544),
  ('model memory', 0.358),
  ('propose integrated cpu', 0.3651),
  ('multiprogrammed workloads', 0.3677),
  ('determines weight gpu relative cpu core', 0.3886),
  ('share cpu caches', 0.39),
  ('measure performance multi - core cpu', 0.4042),
  ('prevent gpu requests', 0.4154),
  ('shows detailed system parameters evaluated cpu', 0.4234),
  ('include memory requests', 0.4246),
  ('measuring overall system performance integrated cpu', 0.4479),
  ('measure performance gpu', 0.4653),
  ('compute gpu speedup ratio', 0.4669),
  ('different performance requirements', 0.4671),
  ('classify cpu benchmarks', 0.4935),
  ('commonly used graphics performance', 0.4967),
  ('shows memory intensity gpu applications', 0.5181),
  ('running gpu benchmark', 0.5502),
  ('model internal gpu caches', 0.563),
  ('perform coalescing gpu memory requests', 0.624)],
 []]

In [14]:
timelogger.log('initiate pos_vectorizer START')
pos_pattern = (
    # Verb-led methodological action (optional subject/aux/adv, main verb, noun phrase core, optional chained PPs)
    # "derive MIMO processing matrices"
    # "compare the performance"
    # "factor the MU MIMO precoding matrix"
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' + '|'
    
    # Nominal methodological construct (adjective/noun/proper stacks + optional PP tails)
    # "singular value decomposition"
    # "MIMO processing matrices"
    # "regularized block diagonal AF algorithm"
    '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' + '|'
    
    # Metric/result short form
    # "bit error rate"
    # "BER performance"
    # "SNR gain"
    '(<J.*>*<N.*>+)'
)

pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    pos_pattern=pos_pattern,
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | END')
initial_keywords

 :: initiate pos_vectorizer START | since_start: 11.39 seconds | since_last: 0.01 seconds :: 
sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | START | since_start: 11.39 seconds | since_last: 0.00 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | END | since_start: 12.13 seconds | since_last: 0.75 seconds :: 


[[('popular performance measurement', 0.0673),
  ('ldl', 0.071),
  ('proposed algorithm', 0.0729),
  ('based experiments', 0.0757),
  ('respective features', 0.0808),
  ('following ldl', 0.0916),
  ('including original ldl', 0.1741),
  ('number test images', 0.1934),
  ('triangle distribution', 0.197),
  ('authentic age', 0.2033),
  ('existing ldl methods', 0.205),
  ('lld method', 0.2097),
  ('computing p(y|x', 0.2476),
  ('given chronological age', 0.2959),
  ('mean absolute error', 0.2967),
  ('multi - label distribution equal description degrees', 0.3081),
  ('predicted ages', 0.3591),
  ('estimated age', 0.3689),
  ('age estimation', 0.4403),
  ('initially generating age label distribution', 0.5691)],
 [('gpu application', 0.453),
  ('measure performance gpu', 0.4653),
  ('compute gpu speedup ratio', 0.4669),
  ('different performance requirements', 0.4671),
  ('classify cpu benchmarks', 0.4935),
  ('gpu', 0.4938),
  ('commonly used graphics performance', 0.4967),
  ('shows memory

In [22]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

timelogger.log('generic methodology phrases encoded START')
generic_methodology_phrases = [
    'research method',
    'research methodology',
    'methodological approach',
    'experimental study',
    'empirical evaluation',
    'case study',
    'simulation study',
    'measurement study',
    'formal proof'
]

generic_methodology_phrase_embeddings = st_embed_model.encode(generic_methodology_phrases)
timelogger.log('generic methodology phrases encoded END')
generic_methodology_phrase_embeddings

for i in range(0, 3):
    single_initial_keywords = [e[0] for e in initial_keywords[i]]
    single_initial_keywords_embeddings = st_embed_model.encode(single_initial_keywords)

    top_n = 10
    # A*B number of distances
    pairwise_distances = euclidean_distances(single_initial_keywords_embeddings, generic_methodology_phrase_embeddings)
    # All the initial keywords, index:distance pairs
    closest_distances_per_keyword = [[index, min(ds)] for index, ds in enumerate(pairwise_distances)]
    # All the initial keywords, index:distance pairs, sorted by distance
    sorted_closest_distances_per_keyword = sorted(closest_distances_per_keyword, key=lambda x: x[1])
    keywords = [(single_initial_keywords[index], float(distance)) for (index, distance) in sorted_closest_distances_per_keyword]
    display(keywords[0:5])

 :: generic methodology phrases encoded START | since_start: 57.0 minutes, 54.86 seconds | since_last: 50.90 seconds :: 
 :: generic methodology phrases encoded END | since_start: 57.0 minutes, 54.91 seconds | since_last: 0.05 seconds :: 


[('based experiments', 0.8362430930137634),
 ('popular performance measurement', 1.016200304031372),
 ('lld method', 1.0561422109603882),
 ('number test images', 1.1788290739059448),
 ('proposed algorithm', 1.187438726425171)]

[('gpu simulation framework', 1.0466086864471436),
 ('measure performance gpu', 1.1677958965301514),
 ('gpu performance', 1.194872498512268),
 ('cpu benchmark', 1.208459734916687),
 ('graphics performance benchmarks', 1.2343451976776123)]

[('research method', 9.592459946361487e-07),
 ('research', 0.7285956740379333),
 ('descriptive research', 0.7696881890296936),
 ('quantitative analysis', 0.8940402269363403),
 ('analysis', 0.9342982769012451)]