In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import os, json, re, argparse, math
from pathlib import Path
from collections import Counter
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import spacy.cli
spacy_model = 'en_core_web_sm'
spacy.cli.download(spacy_model)
spacy_exclude = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']
nlp = spacy.load(spacy_model, exclude=spacy_exclude)
timelogger = utils.TimeLogger()
"""
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
year={2022},
pages={243-248},
publisher={SciTePress},
organization={INSTICC},
doi={10.5220/0011546600003335},
isbn={978-989-758-614-9},
issn={2184-3228},
}
"""

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m181.3 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 


'\n@conference{schopf_etal_kdir22,\nauthor={Tim Schopf and Simon Klimek and Florian Matthes},\ntitle={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},\nbooktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},\nyear={2022},\npages={243-248},\npublisher={SciTePress},\norganization={INSTICC},\ndoi={10.5220/0011546600003335},\nisbn={978-989-758-614-9},\nissn={2184-3228},\n}\n'

In [3]:
database_name = '02_stg'
table_name = 'stg_filtered_work_chapters_methodology_single'
id_columns = ['work_id']
text_column_name = 'chapter_text'
text_batch_size = 1000

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0]

'As was noted earlier, we formed our dataset utilizing the existing relations present in SNOMED CT. There are hundreds of different kinds relations present in SNOMED CT, some of them are more important than others (examples of some of the unimportant relations are "duplicate concept" and "inactive concept"). We report our results on the fourteen important and most frequent relations, each of which had more than ten thousand instances. The "is a(procedure,procedure)" relation had the highest number of 93, 925 instances. For each relation, positive examples for both training and testing were randomly selected without replacement as pairs of concepts for which the relation is known to exist. Then equal number of negative examples were randomly selected without replacement as pairs of concepts of the required types which are not related by that relation. There was no overlap between training and testing datasets. We employed SVM using the LibSVM package3 along with the user-defined kernel 

In [81]:
database_name = '03_core'
table_name = 'unified_works'
id_columns = ['id']
text_column_name = 'fulltext'
text_batch_size = 1

utils.pd_set_options(cols=100)
id_column_names = ', '.join(id_columns)
# TODO: pagination
texts_df = wr.athena.read_sql_query(f"""
    SELECT
        {id_column_names}, {text_column_name} 
    FROM
        "{database_name}".{table_name}
    ORDER BY
        {id_column_names}
    LIMIT
        {text_batch_size}
    """,
    database_name
)
texts_only_list = texts_df[text_column_name].tolist()
texts_only_list[0]

'\nIntroduction\n\nAn important part of future wireless communication systems is multi-user (MU) multiple-input multiple-output (MIMO) processing. It has been shown that the linear increase of the MU MIMO systems\' data rate in the number of transmit antennas can be achieved by serving users simultaneously using the space-division multiple access (SDMA) [1]. In multi-hop-based systems additional, intermediate radio access points, or relay nodes (RNs), are used to reduce distances between individual nodes and simultaneously improve the channel conditions. The relays traditionally have been used to mitigate the effect of path loss for obtaining robust communication. The three-terminal relay channel where a single intermediate node supports a single communication pair was introduced in seminal paper [2]. Different relaying protocols which still serve as a basis for many relaying strategies were proposed later in [3]. The idea of relaying was first applied to wireless fading channels in [4

In [4]:
# sentence_transformer_model_name = 'sentence-transformers/all-mpnet-base-v2'
sentence_transformer_model_name = 'sentence-transformers/all-distilroberta-v1'

st_embed_model = SentenceTransformer(sentence_transformer_model_name)
timelogger.log('SentenceTransformer initialized')

 :: SentenceTransformer initialized | since_start: 8.93 seconds | since_last: 8.93 seconds :: 


' :: SentenceTransformer initialized | since_start: 8.93 seconds | since_last: 8.93 seconds :: '

In [5]:
timelogger.log('generic methodology phrases encoded START')
generic_methodology_phrases = [
    'research method',
    'research methodology',
    'methodological approach',
    'experimental study',
    'empirical evaluation',
    'case study',
    'simulation study',
    'measurement study',
    'formal proof'
]

generic_methodology_phrase_embeddings = st_embed_model.encode(generic_methodology_phrases)
timelogger.log('generic methodology phrases encoded END')
generic_methodology_phrase_embeddings

 :: generic methodology phrases encoded START | since_start: 8.93 seconds | since_last: 0.01 seconds :: 
 :: generic methodology phrases encoded END | since_start: 9.46 seconds | since_last: 0.53 seconds :: 


array([[-0.00535334, -0.00680358,  0.00489835, ...,  0.01841469,
        -0.03167967, -0.0050952 ],
       [-0.00022194,  0.00414018,  0.01022064, ..., -0.01404903,
        -0.03241681, -0.02545266],
       [ 0.01189183,  0.01842934,  0.02846571, ..., -0.03430157,
        -0.02436642, -0.03020135],
       ...,
       [-0.03220616, -0.01856573,  0.02224939, ..., -0.03424376,
        -0.04298651, -0.09180258],
       [-0.02929969, -0.07373922, -0.00920675, ...,  0.0107948 ,
         0.03783655, -0.0565296 ],
       [ 0.03346384,  0.01161797, -0.01428204, ..., -0.07280515,
        -0.08783866, -0.0287721 ]], dtype=float32)

In [6]:
timelogger.log('text encoded START')
text_list_embeddings = st_embed_model.encode(texts_only_list)
timelogger.log('text encoded END')
text_list_embeddings[0:2]

 :: text encoded START | since_start: 9.47 seconds | since_last: 0.01 seconds :: 
 :: text encoded END | since_start: 13.37 seconds | since_last: 3.91 seconds :: 


array([[ 0.0458746 ,  0.02019153,  0.01326156, ..., -0.10766113,
         0.05190925,  0.00696905],
       [-0.00781273, -0.03953062, -0.00649838, ..., -0.03321216,
        -0.03041473, -0.003624  ]], dtype=float32)

In [7]:
timelogger.log('initiate pos_vectorizer START')
pos_pattern = (
    # Verb-led methodological action (optional subject/aux/adv, main verb, noun phrase core, optional chained PPs)
    # we derive MIMO processing matrices
    # we compare the performance
    # we factor the MU MIMO precoding matrix
    '(<AUX>?<RB>?<V.*><PRT>?<DT>?<J.*>*<N.*>+(<IN><DT>?<J.*>*<N.*>+)*)' #+ '|'
    
    # Nominal methodological construct (adjective/noun/proper stacks + optional PP tails)
    # singular value decomposition
    # MIMO processing matrices
    # regularized block diagonal AF algorithm
    # '(<J.*>*<N.*>+(<N.*>+)*(<IN><J.*>*<N.*>+)*)' # + '|'
    
    # Metric/result short form
    # bit error rate
    # BER performance
    # SNR gain
    # '(<J.*>*<N.*>+)'
)
pos_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline=nlp,
    # pos_pattern=pos_pattern, # '<J.*>*<N.*>+',
    # min_df=1, # cutoff
)
timelogger.log('initiate pos_vectorizer END')

 :: initiate pos_vectorizer START | since_start: 11.0 minutes, 28.21 seconds | since_last: 11.0 minutes, 14.84 seconds :: 
 :: initiate pos_vectorizer END | since_start: 11.0 minutes, 28.21 seconds | since_last: 0.00 seconds :: 


' :: initiate pos_vectorizer END | since_start: 11.0 minutes, 28.21 seconds | since_last: 0.00 seconds :: '

In [8]:
timelogger.log('initiate keybert_model START')
keybert_model = KeyBERT(model=st_embed_model)
timelogger.log('initiate keybert_model END')

 :: initiate keybert_model START | since_start: 11.0 minutes, 29.25 seconds | since_last: 1.04 seconds :: 
 :: initiate keybert_model END | since_start: 11.0 minutes, 29.25 seconds | since_last: 0.00 seconds :: 


' :: initiate keybert_model END | since_start: 11.0 minutes, 29.25 seconds | since_last: 0.00 seconds :: '

In [None]:
print(sentence_transformer_model_name)
timelogger.log('extract 10 keywords START')
initial_keywords_top10 = keybert_model.extract_keywords(  # TODO: consider using this with candidates=[...] for the next round
    docs=texts_only_list,
    top_n=10,
    vectorizer=pos_vectorizer,
    use_maxsum=True,
    nr_candidates=100
)
initial_keyword_embeddings = st_embed_model.encode([kw[0] for kw in initial_keywords_top10])
timelogger.log('extract 10 keywords END')
initial_keywords_top10

sentence-transformers/all-distilroberta-v1
 :: extract 100 keywords START | since_start: 12.0 minutes, 3.22 seconds | since_last: 33.97 seconds :: 


In [106]:
timelogger.log('extract 100 keywords START')
initial_keywords_top100 = keybert_model.extract_keywords(  # TODO: consider using this with candidates=[...] for the next round
    docs=texts_only_list,
    top_n=100,
    vectorizer=pos_vectorizer
)
initial_keyword_embeddings = st_embed_model.encode([kw[0] for kw in initial_keywords_top100])
timelogger.log('extract 100 keywords END')
initial_keywords_top100

 :: extract 100 keywords START | since_start: 1.0 hour, 14.0 minutes, 2.27 seconds | since_last: 0.38 seconds :: 
 :: extract 100 keywords END | since_start: 1.0 hour, 14.0 minutes, 8.74 seconds | since_last: 6.47 seconds :: 


[('assuming mu mimo channel', 0.5829),
 ('design mu mimo', 0.5529),
 ('design mimo', 0.5449),
 ('find optimum mimo processing matrices', 0.5336),
 ('perform mimo', 0.5299),
 ('limitations mimo processing matrices', 0.5292),
 ('resulting mimo processing matrices', 0.5119),
 ('jointly optimizes mimo processing matrices bs', 0.5002),
 ('using mimo channel matrices bs rn', 0.4936),
 ('consider mu mimo dl system', 0.4799),
 ('derive mimo processing matrices', 0.4728),
 ('factor mu mimo', 0.4699),
 ('rn mimo', 0.4653),
 ('design mimo processing matrices bs', 0.4631),
 ('extend mimo', 0.4596),
 ('estimate effective mimo matrix', 0.4501),
 ('using mimo channel matrix bs rn', 0.4466),
 ('function mimo processing matrix', 0.4409),
 ('multiple antennas', 0.4325),
 ('derive mimo processing matrices bs', 0.4223),
 ('derived mimo processing matrices bs', 0.4211),
 ('assuming mimo', 0.42),
 ('kth ut mimo', 0.4147),
 ('fading channels', 0.4047),
 ('minimize mui kth ut co - channel uts', 0.3998),
 ('pr

In [107]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

top_n = 10
pairwise_distances = euclidean_distances(initial_keyword_embeddings, generic_methodology_phrase_embeddings)
closest_distances_per_keyword = [[index, min(ds)] for index, ds in enumerate(pairwise_distances)]
sorted_closest_distances_per_keyword = sorted(closest_distances_per_keyword, key=lambda x: x[1])
keywords = [initial_keywords_top100[index][0] for (index, distance) in sorted_closest_distances_per_keyword]
keywords

['present results simulations',
 'estimates h',
 'proposed algorithm performance system',
 'perform resource allocation',
 'proposed system',
 'estimate h',
 'compare performance rbd af algorithm',
 'received power kth ut',
 'compare bit error rate',
 'describe relaying system',
 'estimate h 2,k f r k',
 'denotes number',
 'investigate different power allocation algorithms',
 'using following optimization',
 'using matrices f r',
 'design mimo',
 'provide information additive noise variances receivers transmitters',
 'describe antenna configuration system',
 'perform mimo',
 'design mu mimo',
 'optimize kth ut performance',
 'denotes kth ut',
 'channel estimation errors',
 'negligible performance loss',
 'provide reliable transmission',
 'estimate effective mimo matrix',
 'serving users',
 'multi - user',
 'receive matrices rn',
 'consider mu mimo dl system',
 'receive matrix rn',
 'minimizes mu interference',
 'kth ut mimo',
 'minimize mu interference',
 'denotes additive noise correl

In [70]:
closest_distances_per_keyword

[[0, 1.4426476], [1, 1.4687278]]

In [63]:
distances

array([[1.4426476, 1.4526964],
       [1.4687278, 1.4915861]], dtype=float32)