In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import os, json, re, argparse, math
from pathlib import Path
from collections import Counter
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import spacy.cli
spacy_model = 'en_core_web_sm'
spacy.cli.download(spacy_model)
spacy_exclude = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']
nlp = spacy.load(spacy_model, exclude=spacy_exclude)
timelogger = utils.TimeLogger()
"""
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
year={2022},
pages={243-248},
publisher={SciTePress},
organization={INSTICC},
doi={10.5220/0011546600003335},
isbn={978-989-758-614-9},
issn={2184-3228},
}
"""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m85.3 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 


'\n@conference{schopf_etal_kdir22,\nauthor={Tim Schopf and Simon Klimek and Florian Matthes},\ntitle={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},\nbooktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},\nyear={2022},\npages={243-248},\npublisher={SciTePress},\norganization={INSTICC},\ndoi={10.5220/0011546600003335},\nisbn={978-989-758-614-9},\nissn={2184-3228},\n}\n'

## Full text

In [3]:
database_name = '03_core'
table_name = 'unified_works'
id_columns = ['id']
text_column_name = 'fulltext'
text_batch_size = 3

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    OFFSET 
        100
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0][0:100]

"\nIntroduction. -In today's normal secure communication, the communicating parties A (Alice) and B (B"

In [9]:
sentence_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'
# sentence_transformer_model_name = 'sentence-transformers/all-distilroberta-v1'
st_embed_model = SentenceTransformer(sentence_transformer_model_name)

In [10]:
_ = """
timelogger.log('generic methodology phrases encoded START')
generic_methodology_phrases = [
    'research method',
    'research methodology',
    'methodological approach',
    'experimental study',
    'empirical evaluation',
    'case study',
    'simulation study',
    'measurement study',
    'formal proof'
]

generic_methodology_phrase_embeddings = st_embed_model.encode(generic_methodology_phrases)
timelogger.log('generic methodology phrases encoded END')
generic_methodology_phrase_embeddings
"""

In [11]:
timelogger.log('text encoded START')
text_list_embeddings = st_embed_model.encode(texts_only_list)
timelogger.log('text encoded END')
text_list_embeddings[0:2]

 :: text encoded START | since_start: 2.0 minutes, 50.85 seconds | since_last: 2.0 minutes, 27.40 seconds :: 
 :: text encoded END | since_start: 2.0 minutes, 51.49 seconds | since_last: 0.63 seconds :: 


array([[ 0.03591107,  0.06595819,  0.01749212, ...,  0.06348804,
        -0.01906541, -0.01895891],
       [-0.03697886,  0.05680409, -0.00652881, ...,  0.01566553,
        -0.03112451, -0.01847183]], dtype=float32)

In [12]:
timelogger.log('initiate keybert_model START')
keybert_model = KeyBERT(model=st_embed_model)
timelogger.log('initiate keybert_model END')

 :: initiate keybert_model START | since_start: 2.0 minutes, 51.66 seconds | since_last: 0.17 seconds :: 
 :: initiate keybert_model END | since_start: 2.0 minutes, 51.66 seconds | since_last: 0.00 seconds :: 


' :: initiate keybert_model END | since_start: 2.0 minutes, 51.66 seconds | since_last: 0.00 seconds :: '

In [13]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
)
timelogger.log('extract keywords | Default | END')
initial_keywords

sentence-transformers/all-mpnet-base-v2
 :: extract keywords | Default | START | since_start: 2.0 minutes, 52.78 seconds | since_last: 1.12 seconds :: 
 :: extract keywords | Default | END | since_start: 2.0 minutes, 58.09 seconds | since_last: 5.32 seconds :: 


[[('quantum', 0.5116),
  ('encryption', 0.4144),
  ('security', 0.3795),
  ('crypto', 0.3569),
  ('eavesdropping', 0.3528)],
 [('facerecognition', 0.5524),
  ('biometrics', 0.3711),
  ('pca', 0.3464),
  ('illumination', 0.3449),
  ('face', 0.3445)],
 [('blogs', 0.5443),
  ('bloggers', 0.5297),
  ('communities', 0.5274),
  ('blog', 0.4706),
  ('blogger', 0.4674)]]

In [21]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
)
timelogger.log('extract keywords | Default Top20 | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 | START | since_start: 8.0 minutes, 48.21 seconds | since_last: 26.80 seconds :: 
 :: extract keywords | Default Top20 | END | since_start: 8.0 minutes, 59.94 seconds | since_last: 11.72 seconds :: 


[[('eavesdropping', 0.2479),
  ('secure', 0.2165),
  ('quantum', 0.1893),
  ('security', 0.1773),
  ('ideal', 0.1472),
  ('encryption', 0.137),
  ('qkd', 0.1341),
  ('hypothetical', 0.1323),
  ('quantity', 0.1289),
  ('unconditionally', 0.1236),
  ('unconditional', 0.1163),
  ('speak', 0.1111),
  ('communicating', 0.1079),
  ('exchange', 0.1076),
  ('authentication', 0.1048),
  ('ultimate', 0.1032),
  ('attacks', 0.1023),
  ('ideality', 0.1017),
  ('key', 0.1006),
  ('absolute', 0.1004)],
 [('facerecognition', 0.5805),
  ('face', 0.3206),
  ('recognition', 0.2747),
  ('biometrics', 0.2384),
  ('tracking', 0.2123),
  ('facial', 0.2052),
  ('lighting', 0.1888),
  ('camera', 0.1749),
  ('appearance', 0.1725),
  ('views', 0.1625),
  ('seen', 0.1582),
  ('identification', 0.1492),
  ('shadows', 0.1485),
  ('lights', 0.1466),
  ('model', 0.1463),
  ('frame', 0.1461),
  ('recognized', 0.1443),
  ('visibility', 0.1436),
  ('blinds', 0.1428),
  ('view', 0.1413)],
 [('blogs', 0.4902),
  ('blogge

In [22]:
print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
)
timelogger.log('extract keywords | Default Top20 MaxSum | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum | START | since_start: 9.0 minutes, 38.76 seconds | since_last: 38.83 seconds :: 
 :: extract keywords | Default Top20 MaxSum | END | since_start: 9.0 minutes, 51.06 seconds | since_last: 12.30 seconds :: 


[[('absolute', 0.1004),
  ('key', 0.1006),
  ('ideality', 0.1017),
  ('attacks', 0.1023),
  ('ultimate', 0.1032),
  ('authentication', 0.1048),
  ('exchange', 0.1076),
  ('communicating', 0.1079),
  ('speak', 0.1111),
  ('unconditional', 0.1163),
  ('unconditionally', 0.1236),
  ('quantity', 0.1289),
  ('hypothetical', 0.1323),
  ('qkd', 0.1341),
  ('encryption', 0.137),
  ('ideal', 0.1472),
  ('security', 0.1773),
  ('quantum', 0.1893),
  ('secure', 0.2165),
  ('eavesdropping', 0.2479)],
 [('view', 0.1413),
  ('blinds', 0.1428),
  ('visibility', 0.1436),
  ('recognized', 0.1443),
  ('frame', 0.1461),
  ('model', 0.1463),
  ('lights', 0.1466),
  ('shadows', 0.1485),
  ('identification', 0.1492),
  ('seen', 0.1582),
  ('views', 0.1625),
  ('appearance', 0.1725),
  ('camera', 0.1749),
  ('lighting', 0.1888),
  ('facial', 0.2052),
  ('tracking', 0.2123),
  ('biometrics', 0.2384),
  ('recognition', 0.2747),
  ('face', 0.3206),
  ('facerecognition', 0.5805)],
 [('newspad', 0.2863),
  ('feed

In [23]:
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV | START | since_start: 12.0 minutes, 19.73 seconds | since_last: 2.0 minutes, 28.67 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV | END | since_start: 12.0 minutes, 35.34 seconds | since_last: 15.61 seconds :: 


[[('secure bit exchange', 0.2704),
  ('key exchange decays', 0.2775),
  ('key exchange', 0.2889),
  ('physical crypto system', 0.2894),
  ('normal secure communication', 0.2931),
  ('slower key exchange speed', 0.2936),
  ('perfect security limit', 0.3113),
  ('unconditional security', 0.3134),
  ('ultimate security', 0.3253),
  ('perfect security', 0.3255),
  ('hypothetical quantum computer', 0.3365),
  ('key exchange system', 0.372),
  ('key exchange scheme', 0.3748),
  ('perfect unconditional security', 0.3872),
  ('quantum communicators', 0.3985),
  ('secure key exchange', 0.4294),
  ('secure key exchange system', 0.4455),
  ('theoretic security', 0.4723),
  ('quantum key distribution', 0.4864),
  ('secure key exchange systems', 0.4903)],
 [('generic face model', 0.3544),
  ('face images', 0.3752),
  ('recognition algorithm', 0.3838),
  ('recognition algorithms', 0.4039),
  ('face video sequence', 0.4123),
  ('face segmentation', 0.4213),
  ('face model', 0.4231),
  ('recognition p

In [24]:
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    pos_pattern='(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)',
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 01 | END')
initial_keywords

sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern | START | since_start: 14.0 minutes, 5.62 seconds | since_last: 1.0 minute, 30.28 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern | END | since_start: 14.0 minutes, 15.59 seconds | since_last: 9.98 seconds :: 


[[('fully crack key', 0.2201),
  ('approach perfect security limit p', 0.2486),
  ('based security', 0.2508),
  ('compromised security', 0.2517),
  ('using hypothetical quantum computer', 0.2775),
  ('thereby imperil whole system data exchange near future', 0.2838),
  ('provide general proof information theoretic', 0.2936),
  ('guarantee security', 0.2944),
  ('provides maximum achievable information noise', 0.3111),
  ('represents perfect security', 0.3292),
  ('provide general proof unconditional security kljn', 0.3472),
  ('implement key exchange scheme', 0.356),
  ('potentially reach perfect security', 0.3567),
  ('recorded key exchange', 0.3587),
  ('presents general proof unconditional security kljn system type', 0.368),
  ('characterizing security key exchange decays', 0.3895),
  ('secure key exchange scheme', 0.4244),
  ('secure key exchange system', 0.4455),
  ('based secure key exchange systems', 0.48),
  ('explore various physical phenomena secure key exchange', 0.4835)],
 [

In [None]:
pos_pattern = (
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' + '|'
    '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' + '|'
    '(<J.*>*<N.*>+)'
)


In [25]:
timelogger.log('initiate pos_vectorizer START')
pos_pattern = (
    # Verb-led methodological action (optional subject/aux/adv, main verb, noun phrase core, optional chained PPs)
    # "derive MIMO processing matrices"
    # "compare the performance"
    # "factor the MU MIMO precoding matrix"
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' + '|'
    
    # Nominal methodological construct (adjective/noun/proper stacks + optional PP tails)
    # "singular value decomposition"
    # "MIMO processing matrices"
    # "regularized block diagonal AF algorithm"
    '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' + '|'
    
    # Metric/result short form
    # "bit error rate"
    # "BER performance"
    # "SNR gain"
    '(<J.*>*<N.*>+)'
)

pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    pos_pattern=pos_pattern,
)

print(sentence_transformer_model_name)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | START')
initial_keywords = keybert_model.extract_keywords(
    docs=texts_only_list,
    top_n=20,
    use_maxsum=True,
    vectorizer=pos_vectorizer,
)
timelogger.log('extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | END')
initial_keywords

 :: initiate pos_vectorizer START | since_start: 17.0 minutes, 6.37 seconds | since_last: 2.0 minutes, 50.78 seconds :: 
sentence-transformers/all-distilroberta-v1
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | START | since_start: 17.0 minutes, 6.38 seconds | since_last: 0.00 seconds :: 
 :: extract keywords | Default Top20 MaxSum KCV pos_pattern 02 | END | since_start: 17.0 minutes, 26.31 seconds | since_last: 19.93 seconds :: 


[[('unconditional security', 0.3134),
  ('ultimate security', 0.3253),
  ('perfect security', 0.3255),
  ('represents perfect security', 0.3292),
  ('provide general proof unconditional security kljn', 0.3472),
  ('implement key exchange scheme', 0.356),
  ('similar type convergence toward perfect security', 0.3564),
  ('potentially reach perfect security', 0.3567),
  ('recorded key exchange', 0.3587),
  ('presents general proof unconditional security kljn system type', 0.368),
  ('perfect unconditional security', 0.3872),
  ('characterizing security key exchange decays', 0.3895),
  ('physical competitor quantum communicators', 0.403),
  ('secure key exchange scheme', 0.4244),
  ('secure key exchange', 0.4294),
  ('secure key exchange system', 0.4455),
  ('trace distance quantum key distribution', 0.4529),
  ('theoretic security', 0.4723),
  ('based secure key exchange systems', 0.48),
  ('explore various physical phenomena secure key exchange', 0.4835)],
 [('face database', 0.4795),
 