# MRD2SKB

## Methodology

### Preliminary Setup

In [None]:
# #### Preliminary setup.

# ## Library imports.

# Standard libraries.
import contextlib
import copy
import functools
import hashlib
import json
import logging
import inspect
import operator
import os
import pathlib
import pprint
import shutil
import time

# External libraries.
import numpy as np
import scipy as sp
import pandas as pd  # check faster alternatives?
# https://www.datarevenue.com/en-blog/pandas-vs-dask-vs-vaex-vs-modin-vs-rapids-vs-ray

# ML specific libraries.
import gensim
import spacy
import sklearn.feature_extraction.text as skl_feat_text

# Following are some libraries for fast gpu computations.
# import dask.bag as db
# import dask.array as da
# import dask.dataframe as dd
# import cupyx as cpx
# import cupy as cp
# import numba
# import swifter

# Jupyter notebook libraries.
from IPython.display import display  # enable this when converting to a script.
from IPython import get_ipython
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"


# ## General path configurations.

# Get script file absolute path.
file_abspath = None
if '__file__' in globals():
    # We are in a .py script.
    file_abspath = os.path.abspath(__file__)
else:
    # We are in a .ipynb notebook, and presumably running in vscode, which
    # exposes the notebook path in the user namespace.
    ip = get_ipython()
    if ip is not None:
        file_abspath = ip.user_ns.get('__vsc_ipynb_file__', None)

if not file_abspath:
    # The path could not be determined (e.g. a notebook that is not run from
    # vscode). Fall back to a placeholder file name in the current working
    # directory, so that the path operations below do not fail on None.
    file_abspath = os.path.join(os.getcwd(), "mrd2skb.ipynb")

# Split the path into its components; file_dirname is used as the working
# directory and file_abspath as the prefix of the log file name.
file_basename = os.path.basename(file_abspath)
file_dirname = os.path.dirname(file_abspath)
file_name, file_ext = os.path.splitext(file_basename)


# ## Preconfigurations for module logger.
# Get the root logger, this is sometimes useful.
logger_root = logging.getLogger()
# Create logger with current module.
# The guard skips handler creation when `logger` already exists in the
# namespace, so re-running this code does not attach a second console handler.
if 'logger' not in vars():
    logger = logging.getLogger(__name__)
    log_formatter = logging.Formatter("[%(levelname)s]: %(message)s")
    # Verbose variant with timestamp and call site. It is defined here but not
    # attached to any handler in this block.
    log_formatter_verbose = logging.Formatter(
        "[%(levelname)s]: %(asctime)s; File %(filename)s, line %(lineno)d, in %(funcName)s: %(message)s")
    # Create console log handler.
    log_ch = logging.StreamHandler()
    log_ch.setFormatter(log_formatter)
    logger.addHandler(log_ch)
    # Create file log handler.
    # The log file is named "<file_abspath>.log". The handler is created but
    # not attached to the module logger (the addHandler call is commented out).
    log_fh = logging.FileHandler(f"{file_abspath}.log")
    log_fh.setFormatter(log_formatter)
    # logger.addHandler(log_fh)

# Set log handler levels.
logger.setLevel(logging.DEBUG)
# log_ch.setLevel(logging.INFO)
# log_fh.setLevel(logging.DEBUG)


def log_obj(level, obj):
    """Log an object, using ipython display function.

    The object is looked up by identity among the local variables of the
    caller, so that the log line reads ``<name> = `` before the object itself
    is displayed.

    Args:
        level: Numeric logging level of the message (e.g. ``logging.INFO``).
        obj: The object to display.
    """
    # Do nothing, including the name lookup, if the log level does not permit.
    if level < logger.level:
        return

    # Obtain the name of the object. Fall back to a placeholder when the
    # object is not bound to a local name of the caller (e.g. an expression
    # was passed), instead of failing with an IndexError.
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    obj_name = next(
        (var_name for var_name, var_val in callers_local_vars
         if var_val is obj),
        "<unnamed>")

    logger.log(level, f"{obj_name} = ")
    display(obj)

# ## GPU configurations.

# Check GPU availability for supported libraries.
# logger.debug(f"{torch.cuda.is_available() = }")
# logger.debug(f"{torch.cuda.get_device_name(0) = }")
# logger.debug(f"{tf.config.list_physical_devices('GPU') = }")

# Check BLAS and LAPACK availability for supported libraries.
# logger.debug(f"{np.show_config() = }")


# Use GPU if applicable.
# The value returned by spacy.prefer_gpu() is logged at debug level.
logger.debug(f"{spacy.prefer_gpu() = }")


# ## Timer configurations.


def timer_dec(func):
    """Decorate a function so that every call logs its start, end and runtime.

    Args:
        func: The function to be timed.

    Returns:
        A wrapper that calls ``func`` with the given arguments and returns
        its result unchanged.
    """
    @functools.wraps(func)
    def _timed(*args, **kwargs):
        """Call the wrapped function and log how long it took."""
        label = repr(func.__name__)
        logger.debug(f"{label} Start")
        began = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        logger.debug(f"{label} End")
        logger.info(f"{label} runtime: {elapsed:.6f} secs.")
        return result

    return _timed


@contextlib.contextmanager
def timer_con():
    """Print the runtime of code block in the managed context.

    The end of the timer and the runtime are logged even if the managed block
    raises; the exception then propagates to the caller unchanged.
    """
    logger.debug("Timer Start")
    start_time = time.perf_counter()
    try:
        yield
    finally:
        # Runs on both normal exit and exception, so a failing block still
        # reports how long it ran.
        run_time = time.perf_counter() - start_time
        logger.debug("Timer End")
        logger.info(f"Runtime: {run_time:.6f} secs.")


In [None]:
# #### Project specific configurations.

""" MRD2SKB Project Directory Structure
mrd2skb/
    data/
        input/
            dict_compact_wordnet.csv
        interm/
            preproc_dict_df.pkl
            wsd_dict_df.pkl
        output/
            mrd2skb_mtx_df.pkl
            mrd2skb_list_df.pkl
            mrd2skb_sememes.txt
            mrd2skb_skb.txt
            mrd2skb_dict.npy
            mrd2skb_sememes.npy
            mrd2skb_valid_words.npy
            models/
                ...
        backup/
            mrd2skb_kro_bkp_x0=xx_x1=xx_x2=xx/
                interm/
                    ...
                output/
                    ...
            mrd2skb_top_bkp_x0=xx_x1=xx_x2=xx/
                interm/
                    ...
                output/
                    ...
    src/
        mrd2skb_kro.ipynb
        mrd2skb_top.ipynb
"""

# Work relative to the directory that contains this script/notebook.
os.chdir(file_dirname)

# ## Project specific paths and directories.
# Relative paths of input output directories.
prj_root_dir = "../"
prj_data_dir = f"{prj_root_dir}data/"
input_data_dir = f"{prj_data_dir}input/"
interm_data_dir = f"{prj_data_dir}interm/"
output_data_dir = f"{prj_data_dir}output/"
backup_data_dir = f"{prj_data_dir}backup/"

# Create directories if they do not exist.
# The input directory is not created here.
for _data_dir in (interm_data_dir, output_data_dir, backup_data_dir):
    pathlib.Path(_data_dir).mkdir(parents=True, exist_ok=True)

# ## Project specific variables.
# Column names for terms and definitions from dictionary dataset.
# The sense id column is not read from the dataset; it is added during
# preprocessing.
term_colname = "lemma"
pos_colname = "posname"
senseid_colname = "senseid"
defn_colname = "definition"

# PoS tag codes dictionary. depends on the MRD.
# For WordNet, see ss_type under https://wordnet.princeton.edu/documentation/wndb5wn.
# The following code may be useful for determining these values:
# preproc_dict_df[pos_colname].unique()
# Maps the PoS names found in the dataset to one-letter codes; "default" is
# the fallback code for names that are not listed.
pos_code_dict = {
    "noun": "n",
    "verb": "v",
    "adjective": "a",
    "adjective satellite": "a",  # this is actually "s", but we prefer "a".
    "adverb": "r",
    "default": "x",
}

# The following is useful for generating the output skb files.
# Maps the one-letter codes back to PoS names; the fallback code "x" maps to
# the literal string "NONE".
pos_code_inv_dict = {
    "n": "noun",
    "v": "verb",
    "a": "adjective",
    "r": "adverb",
    "x": "NONE",
}

# UPOS tag codes dictionary. refer to the following:
# https://universaldependencies.org/u/pos/
# https://github.com/explosion/spaCy/blob/abb0ab109d33d2deaa6155a61fad649a25472f9c/spacy/glossary.py#L22
# Maps lowercased UPOS tags to the one-letter codes used above. Pronouns and
# proper nouns are folded into "n"; tags without a counterpart map to "x".
upos_code_dict = {
    "adj": "a",
    "adp": "x",
    "adv": "r",
    "aux": "x",
    "conj": "x",
    "cconj": "x",
    "det": "x",
    "intj": "x",
    "noun": "n",
    "num": "x",
    "part": "x",
    "pron": "n",
    "propn": "n",
    "punct": "x",
    "sconj": "x",
    "sym": "x",
    "verb": "v",
    "x": "x",
    "eol": "x",
    "space": "x",
    "default": "x",
}


### MRD Preprocessing

We read the MRD file as a CSV file.

The CSV file should have a row for every sense of every word.

It should contain the words and definitions in its columns.

Different senses of the same word should be sorted by part-of-speech (POS) tag, and then by frequency of use.

The particular dictionary used in this implementation is based on an SQLite database of WordNet, obtained from:

http://sqlunet.sourceforge.net/

Specifically:

https://sourceforge.net/projects/sqlunet/files/6.0.0/sqlite/XX/sqlite-6.0.0-XX-all.zip

Which contains the file:

sqlite-XX.db

Which was further processed with the following SQL query to obtain a CSV file:

```sql
DROP VIEW IF EXISTS dict_compact;

CREATE VIEW dict_compact AS SELECT
words.*, casedwords.cased, postypes.posname, lexdomains.lexdomainname, synsets.definition
FROM words
LEFT JOIN casedwords USING (wordid)
LEFT JOIN senses USING (wordid)
LEFT JOIN synsets USING (synsetid)
LEFT JOIN lexdomains USING (lexdomainid)
LEFT JOIN postypes USING (pos)
ORDER BY
words.wordid ASC,
postypes.pos ASC,
senses.tagcount DESC,
senses.sensenum ASC;

```

In [None]:
"""The main function."""
# Read the MRD from the CSV file in the input data directory.
logger.info("Reading dataframe from csv...")
orig_dict_df = pd.read_csv(
    input_data_dir + "dict_compact_wordnet.csv", encoding='utf-8')

# Show the raw dataframe for inspection.
log_obj(logging.INFO, orig_dict_df)


In [None]:
# Load NLP module.
nlp = spacy.load('en_core_web_sm')  # for efficiency.
# nlp = spacy.load('en_core_web_trf')  # for accuracy.

# Select necessary NLP pipe components.
# default selection takes around 6 mins.
# custom selection takes around 1.5 mins.
# Only the components needed for lemmatization are kept enabled.
nlp.select_pipes(enable=['tagger', 'attribute_ruler', 'lemmatizer'])
# Log the resulting pipeline for verification.
logger.info(f"{nlp.pipe_names = }")
logger.info(f"{nlp.analyze_pipes() = }")


In [None]:
# # The following is a cool example of adding a custom NLP pipe to spacy.
# # However, it turns out what I want can be accomplished much more simply.

# #### Add a custom NLP pipe for lowercasing.
# # Add the attribute to store the NLP pipe result.
# if not spacy.tokens.Token.has_extension('lower_'):
#     spacy.tokens.Token.set_extension('lower_', default='')

# # Define the actual NLP pipe.
# @spacy.language.Language.component('lowercaser')
# def lowercaser(doc):
#    # Do something to the doc here
#    for token in doc:
#        token._.lower_ = token.lemma_.lower()
#    return doc

# # Add the pipe to end of the pipeline.
# nlp.add_pipe('lowercaser')


In [None]:
# Apply spacy preprocessing.
# Note: Spacy lemmatization keeps capitals when not using tagger. Hence, do not disable it.


def _lemmatize_doc(doc):
    """Return the lowercased lemmas of the alphabetic, non-stopword tokens of a spacy doc."""
    return [token.lemma_.lower()
            for token in doc if (token.is_alpha and not token.is_stop)]


# Remove null term rows. words like "NaN" are troublesome, so they are dropped.
# Rows with a null definition are dropped as well: they carry no information,
# and null values are not valid text input for the NLP pipeline below.
dict_df = orig_dict_df.dropna(subset=[term_colname, defn_colname])
# Use smaller dataset for testing (optional).
# dict_df = dict_df.iloc[:50000, :]  # todo: remove.

# Apply NLP preprocessing to dictionary terms.
# Multi-token terms are joined with underscores into a single string.
preproc_term_docs = ['_'.join(_lemmatize_doc(doc))
                     for doc in nlp.pipe(dict_df[term_colname])]
# Apply NLP preprocessing to dictionary definitions.
# Definitions are kept as lists of tokens.
preproc_defn_docs = [_lemmatize_doc(doc)
                     for doc in nlp.pipe(dict_df[defn_colname])]
# Extract the postag information as well. this is useful for WSD.
preproc_postags = dict_df[pos_colname]

# ## The following variants were added to facilitate PoS tagging for definitions.
# ## Tagging results were not very good, so it was removed later.
# # Apply NLP preprocessing to dictionary definitions.
# preproc_defn_docs = [
#     [(token.lemma_.lower(),
#         upos_code_dict.get(token.pos_.lower(), upos_code_dict["default"]))
#         for token in doc if (token.is_alpha and not token.is_stop)]
#     for doc in nlp.pipe(dict_df[defn_colname])
# ]
# # Extract the postag information as well. this is useful for WSD.
# preproc_postags = dict_df[pos_colname].apply(
#     lambda x: pos_code_dict.get(x, pos_code_dict["default"]))

# Combine terms, postags, and definitions in a table.
# Each row is a list of the form [term, posname, definition tokens].
preproc_dict_table = [list(row) for row
                      in zip(preproc_term_docs, preproc_postags, preproc_defn_docs)]


In [None]:
# Build the dictionary dataframe from a deep copy of the preprocessed table.
preproc_dict_df = pd.DataFrame(
    copy.deepcopy(preproc_dict_table),
    columns=[term_colname, pos_colname, defn_colname],
)
# Apply dataframe preprocessing.
# Keep only the rows whose term and definition are both non-empty, then
# renumber the rows.
preproc_dict_df = (
    preproc_dict_df
    .loc[lambda df: df[term_colname].astype(bool)]
    .loc[lambda df: df[defn_colname].astype(bool)]
    .reset_index(drop=True)
)

# ## the following two steps are now removed, since we will be using WSD now.
# # Keep first n term rows for each term, if it has multiple definitions.
# rows_to_keep_per_term = 3
# preproc_dict_df = preproc_dict_df.groupby(preproc_dict_df[term_colname]).head(rows_to_keep_per_term)
# preproc_dict_df = preproc_dict_df.reset_index()  # not necessary if groupby is done via index column.
# # Combine duplicate term rows.
# preproc_dict_df = preproc_dict_df.groupby(preproc_dict_df[term_colname]).agg({defn_colname: 'sum'})
# preproc_dict_df = preproc_dict_df.reset_index()  # not necessary if groupby is done via index column.

# Number the senses of each term (0, 1, 2, ...) in row order, and place the
# new column right before the definition column. useful for WSD.
preproc_dict_df.insert(
    loc=preproc_dict_df.columns.get_loc(defn_colname),
    column=senseid_colname,
    value=preproc_dict_df.groupby(term_colname).cumcount(),
)

# Show the result and store it as an intermediate file.
log_obj(logging.INFO, preproc_dict_df)
preproc_dict_df.to_pickle(interm_data_dir + "preproc_dict_df.pkl")


### MRD WSD

In [None]:
# Reload the preprocessed dictionary dataframe from the intermediate pickle,
# so that the WSD steps can start from here.
preproc_dict_df = pd.read_pickle(interm_data_dir + "preproc_dict_df.pkl")

# Bare expression: shown as the cell output in a notebook.
preproc_dict_df


In [None]:
# Show all senses of two sample terms.
for _sample_term in ("gain", "signal"):
    display(preproc_dict_df[preproc_dict_df[term_colname] == _sample_term])

# Show the definition tokens of the third sense of each sample term.
for _sample_term in ("gain", "signal"):
    _sample_senses = preproc_dict_df[
        preproc_dict_df[term_colname] == _sample_term]
    display(_sample_senses.iloc[2].definition)


In [None]:
def get_synset_name(row):
    """Return the synset name of a dictionary row, as "<term>.<pos code>.<sense id>".

    The sense id is zero-padded to two digits. PoS names that are not listed
    in ``pos_code_dict`` are mapped to its "default" code instead of raising
    a KeyError.

    Args:
        row: A row holding the term, PoS name and sense id columns.

    Returns:
        The synset name as a string.
    """
    pos_code = pos_code_dict.get(row[pos_colname], pos_code_dict["default"])
    return f"{row[term_colname]}.{pos_code}.{row[senseid_colname]:02d}"


def custom_lesk(row):
    """Disambiguate the definition tokens of a dictionary row, Lesk style.

    For every token of the definition that is itself a term of
    ``preproc_dict_df``, the sense whose definition shares the most tokens
    with this definition is selected; ties are resolved in favor of the sense
    that comes first. Tokens that are not dictionary terms are dropped.

    Args:
        row: A row of the preprocessed dictionary dataframe, as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        A list of synset names of the form "<token>.<pos code>.<sense id>".
    """
    # Report the progress periodically; row.name is the index label of the row.
    if row.name % 1000 == 0:
        print(f"custom_lesk: processing row: {row.name}...")

    context = row[defn_colname]
    # Built once here, rather than once per candidate sense.
    context_set = set(context)
    context_wsd = []

    # print(f"{context = }")

    for token in context:
        synsets = preproc_dict_df[preproc_dict_df[term_colname] == token]

        if synsets.empty:
            continue

        # Size of the overlap between the context and each candidate sense
        # definition. idxmax returns the first of the maximal overlaps.
        overlaps = synsets[defn_colname].map(
            lambda defn: len(context_set & set(defn)))
        sense = synsets.loc[overlaps.idxmax(), :]

        # Unlisted PoS names fall back to the default code.
        pos_code = pos_code_dict.get(
            sense[pos_colname], pos_code_dict["default"])
        context_wsd.append(
            f"{token}.{pos_code}.{sense[senseid_colname]:02d}")

    # print(f"{context_wsd = }")

    return context_wsd


# Row range of the dictionary dataframe to apply WSD to.
# The commented alternatives restrict WSD to a subset of the rows.
# mrd_wsd_slice = slice(74661, 74673)
# mrd_wsd_slice = slice(0, 100)
mrd_wsd_slice = slice(None, None)


In [None]:
# Apply WSD to preprocessed dictionary dataframe.
# Note that this procedure takes time!
wsd_dict_df = preproc_dict_df.copy()

# Replace the definition tokens of the selected rows by their synset names.
wsd_dict_df.loc[mrd_wsd_slice, defn_colname] = (
    wsd_dict_df.loc[mrd_wsd_slice, :].apply(custom_lesk, axis=1))
# Replace the terms of the selected rows by their synset names.
wsd_dict_df.loc[mrd_wsd_slice, term_colname] = (
    wsd_dict_df.loc[mrd_wsd_slice, :].apply(get_synset_name, axis=1))

# PoS and sense id are now part of the synset names.
wsd_dict_df = wsd_dict_df.drop(columns=[pos_colname, senseid_colname])

# log_obj(logging.INFO, wsd_dict_df)
wsd_dict_df.to_pickle(interm_data_dir + "wsd_dict_df.pkl")

wsd_dict_df.loc[mrd_wsd_slice, :]


In [None]:
# A simple sanity check.

# Reload the WSD result from the intermediate pickle.
wsd_dict_df = pd.read_pickle(interm_data_dir + "wsd_dict_df.pkl")

# Show all senses of two sample terms; terms are now synset names, hence the
# prefix match including the dot.
display(wsd_dict_df[wsd_dict_df[term_colname].str.startswith("gain.")])
display(wsd_dict_df[wsd_dict_df[term_colname].str.startswith("signal.")])

# Show the disambiguated definitions of two sample senses, selected by their
# position among the matching rows.
display(wsd_dict_df[wsd_dict_df[term_colname].str.startswith("signal.")]
        .iloc[2].definition)
display(wsd_dict_df[wsd_dict_df[term_colname].str.startswith("field.")]
        .iloc[14].definition)


### MRD2SKB_TOP

In [None]:
# Reload the WSD dictionary dataframe, so that this section can be run
# without repeating the WSD step.
wsd_dict_df = pd.read_pickle(interm_data_dir + "wsd_dict_df.pkl")

log_obj(logging.INFO, wsd_dict_df)

In [None]:
# Titles are the (disambiguated) terms, docs are their definition token lists.
titles, docs = (
    copy.deepcopy(wsd_dict_df[colname].to_list())
    for colname in (term_colname, defn_colname)
)

# Count the tokens of every document into a document-token matrix.
# The docs are already preprocessed and tokenized, so the analyzer simply
# passes each document through.
tcv = skl_feat_text.CountVectorizer(analyzer=lambda tokens: tokens)

# Cast the counts to a smaller integer type to reduce memory usage.
preproc_dict_mtx = tcv.fit_transform(docs).astype("uint16")
defn_words = tcv.get_feature_names_out()

# Rows are labeled by the titles, columns by the definition tokens.
preproc_dict_words = pd.Index(titles)
preproc_defn_words = pd.Index(defn_words)

# Wrap the sparse matrix in a dataframe.
preproc_dict_mtx_df = pd.DataFrame.sparse.from_spmatrix(
    preproc_dict_mtx,
    index=preproc_dict_words,
    columns=preproc_defn_words,
)

# Show the dictionary matrix dataframe.
log_obj(logging.INFO, preproc_dict_mtx_df)


In [None]:
# A simple sanity check.
# For every sense of the test word, list the definition tokens (columns) with
# a nonzero count.
test_word = 'tweet.'

preproc_dict_mtx_df.loc[preproc_dict_mtx_df.index.str.startswith(test_word)].apply(
    lambda x: list(preproc_dict_mtx_df.columns[x.values > 0]), axis=1)


In [None]:
# Create a dictionary representation of the documents.
dictionary = gensim.corpora.Dictionary(docs)

# Filter out words that occur in fewer than 1 document, or in more than 4% of the documents.
# The following filter passes all tokens for mrd2skb. May tune it later.
# NOTE(review): per the gensim documentation, filter_extremes also keeps only
# the `keep_n` most frequent tokens (100000 by default) -- confirm that the
# vocabulary is smaller than that, or pass keep_n=None explicitly.
dictionary.filter_extremes(no_below=1, no_above=0.04)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {len(corpus)}")

In [None]:
# Save corpus data to disk before training.
# Both files go into a "corpus/" subdirectory of the intermediate data
# directory, which is created if it does not exist.
corpus_dir = interm_data_dir + 'corpus/'
pathlib.Path(corpus_dir).mkdir(parents=True, exist_ok=True)
gensim.corpora.MmCorpus.serialize(corpus_dir + 'corpus.gensim', corpus)
dictionary.save(corpus_dir + 'dictionary.gensim')

In [None]:
# Reload the corpus and the dictionary from the files saved before training.
corpus = gensim.corpora.MmCorpus(corpus_dir + 'corpus.gensim')
dictionary = gensim.corpora.Dictionary.load(corpus_dir + 'dictionary.gensim')
# NOTE(review): presumably this lookup populates dictionary.id2token, which
# is passed to the topic model as id2word -- confirm.
temp = dictionary[0]  # This is only to "load" the dictionary.

In [None]:
# Set training hyperparameters.
hyperparams = {
    "top_model_name": gensim.models.LdaModel.__name__,
    "num_topics": 100,  # sememe set size is approximately 20 * num_topics
    "sememe_annotation_thrs": 0,  # unchanged, kept for compatibility reasons.
}

# Set the common arguments for each model.
top_model_args = {
    "corpus": corpus,
    "id2word": dictionary.id2token,
    "num_topics": hyperparams["num_topics"],
    # uncomment the following for LDA, comment out for pLSA.
    "alpha": "auto",
    "eta": "auto",
}

# Add model specific arguments, if applicable.
top_model_module = None
match hyperparams["top_model_name"]:
    case gensim.models.Nmf.__name__:
        top_model_module = gensim.models.Nmf
        pass
    case gensim.models.LsiModel.__name__:
        top_model_module = gensim.models.LsiModel
        pass
    case gensim.models.LdaModel.__name__:
        top_model_module = gensim.models.LdaModel
        
        if top_model_args.get("alpha", "symmetric") == "symmetric" and top_model_args.get("eta", "symmetric") == "symmetric":
            hyperparams['top_model_name'] = "PLsaModel"

        pass
    case gensim.models.EnsembleLda.__name__:
        # top_model_module = gensim.models.EnsembleLda  # UNUSED
        pass
    case gensim.models.HdpModel.__name__:
        # top_model_module = gensim.models.HdpModel  # UNUSED
        top_model_args.pop('num_topics', None)  # HDP does not take num_topics.
        pass
    case _:
        logger.info(f"Unknown topic model module: {hyperparams['top_model_name']}")

# Set directory to save the model.
model_dir = interm_data_dir + f'{hyperparams["top_model_name"]}_{hyperparams["num_topics"]}/'

In [None]:
# Train a topic model.
with timer_con():
    # Route the debug output of the training to the log file, through the
    # root logger.
    logger_root.setLevel(logging.DEBUG)
    logger_root.addHandler(log_fh)
    try:
        model = top_model_module(**top_model_args)
    finally:
        # Detach the file handler even if the training fails, so that later
        # log records do not keep going to the log file.
        logger_root.removeHandler(log_fh)

# Save model to disk.
pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)
model.save(model_dir + 'model.gensim')


In [None]:
# Reload the model.
# Make sure that the file handler is detached from the root logger before
# loading; removing a handler that is not attached has no effect.
logger_root.removeHandler(log_fh)
model = top_model_module.load(model_dir + 'model.gensim')


In [None]:
# Determine the sememe set.
# It is the union of the 20 top words of every topic of the model.
sememe_set = {
    word
    for topic_id in range(model.num_topics)
    for word, _ in model.show_topic(topic_id, topn=20)
}

print(f"{len(sememe_set) = }")

In [None]:
# Construct the SKB matrix.
# It is the dictionary matrix restricted to the columns (definition tokens)
# that are in the sememe set.
mrd2skb_mtx_df = preproc_dict_mtx_df.copy()
mrd2skb_mtx_df = mrd2skb_mtx_df[mrd2skb_mtx_df.columns.intersection(sememe_set)]

# Print and save results.
log_obj(logging.INFO, mrd2skb_mtx_df)
mrd2skb_mtx_df.to_pickle(output_data_dir + "mrd2skb_mtx_df.pkl")

### MRD2SKB Outputs

In [None]:
# For all MRD2SKB methods, mrd2skb_mtx_df should be generated.
# then, the following portions of the code will stay the same.

# Reload the SKB matrix from the output pickle.
mrd2skb_mtx_df = pd.read_pickle(output_data_dir + "mrd2skb_mtx_df.pkl")

log_obj(logging.INFO, mrd2skb_mtx_df)


In [None]:
# Some simple sanity checks.
# For every sense of each test word, print the sememes (columns) that have a
# nonzero count.
for test_word in ('tweet.', 'screenwriter.'):
    test_annotation = mrd2skb_mtx_df.loc[
        mrd2skb_mtx_df.index.str.startswith(test_word)
    ].apply(
        lambda x: list(mrd2skb_mtx_df.columns[x.values > 0]), axis=1)

    print(test_annotation)

In [None]:
# mrd2skb output formatting functions.

def parse_sense_str(sense_str: str) -> dict:
    """Split a sense string of the form "<term>.<pos>.<sense id>" into its fields.

    The string is split at its last two dots, so dots within the term are
    kept. The parts are keyed by the term, PoS and sense id column names, in
    that order; if there are fewer than three parts, only the leading keys
    are present.
    """
    parts = sense_str.rsplit('.', 2)
    field_names = (term_colname, pos_colname, senseid_colname)
    return {field: part for field, part in zip(field_names, parts)}


@timer_dec
def gen_mrd2skb_list_df(mrd2skb_mtx_df, hyperparams):
    """Turn the SKB matrix into one list of sememe names per row.

    A sememe is annotated to a row if its value in the matrix is greater than
    ``hyperparams['sememe_annotation_thrs']``.

    Args:
        mrd2skb_mtx_df: The SKB matrix dataframe, with sememe names as columns.
        hyperparams: The hyperparameter dictionary holding the threshold.

    Returns:
        The row-wise lists of annotated sememe names.
    """
    threshold = hyperparams['sememe_annotation_thrs']
    is_annotated = mrd2skb_mtx_df > threshold
    # The dot product of the boolean matrix with the space-terminated column
    # names yields, for every row, the concatenated names of its annotated
    # columns.
    spaced_names = is_annotated.columns + ' '
    joined_names = is_annotated.dot(spaced_names)
    return joined_names.str.rstrip().str.split()


def generate_txt_outputs(mrd2skb_mtx_df, mrd2skb_list_df):
    """Write the text outputs "mrd2skb_sememes.txt" and "mrd2skb_skb.txt".

    Sense strings are reduced to their terms, so the different senses of a
    term are merged. Both files are written to the output data directory,
    UTF-8 encoded.

    Args:
        mrd2skb_mtx_df: The SKB matrix dataframe; its columns are the sememes.
        mrd2skb_list_df: The sememe lists per sense, indexed by sense strings.
    """
    def _term_of(sense_str):
        """Return the term part of a sense string."""
        return parse_sense_str(sense_str)[term_colname]

    # save "mrd2skb_sememes.txt": the sorted unique sememe terms, one per line.
    sememes = sorted({_term_of(column) for column in mrd2skb_mtx_df.columns})
    with open(output_data_dir + "mrd2skb_sememes.txt", 'w',
              encoding='utf-8') as fp:
        fp.writelines(f"{sememe}\n" for sememe in sememes)

    # merge multiple sense definitions of the same terms.
    mrg_list_df = mrd2skb_list_df.copy()
    mrg_list_df.index = mrg_list_df.index.map(_term_of)
    mrg_list_df = mrg_list_df.map(
        lambda senses: [_term_of(sense) for sense in senses])
    # Summing the lists of a group concatenates them. Duplicates are then
    # removed, while the order of first occurrence is kept.
    mrg_list_df = mrg_list_df.groupby(mrg_list_df.index).sum().apply(
        lambda terms: list(dict.fromkeys(terms)))

    # save "mrd2skb_skb.txt": for every term with at least one sememe, a line
    # with the term, followed by a line with its space separated sememes.
    # Terms and sememes are already reduced to terms at this point, so they
    # are written as they are.
    with open(output_data_dir + "mrd2skb_skb.txt", 'w',
              encoding='utf-8') as fp:
        for term, term_sememes in mrg_list_df.items():
            if term_sememes:
                fp.write(f"{term}\n{' '.join(term_sememes)}\n")


def generate_npy_outputs(mrd2skb_mtx_df, mrd2skb_list_df):
    """Write the numpy outputs of the SKB to the output data directory.

    Three files are written: "mrd2skb_sememes.npy" (sorted unique sememe
    terms), "mrd2skb_valid_words.npy" (sorted unique terms of the matrix
    index) and "mrd2skb_dict.npy" (a dictionary from each term to the list of
    its senses, each sense being a (PoS name, sememe set) pair).

    Args:
        mrd2skb_mtx_df: The SKB matrix dataframe.
        mrd2skb_list_df: The sememe lists per sense, indexed by sense strings.
    """
    def _sorted_unique_terms(sense_strs):
        """Return the sorted unique terms of the given sense strings as an array."""
        unique_terms = {parse_sense_str(sense_str)[term_colname]
                        for sense_str in sense_strs}
        return np.asarray(sorted(unique_terms))

    def _senses_of_term(group):
        """Return the (PoS name, sememe set) pairs of the senses of one term."""
        return [(pos_code_inv_dict[sense.rsplit('.', 2)[1]], set(defn))
                for sense, defn in zip(group.index, group)]

    # save "mrd2skb_sememes.npy".
    np.save(output_data_dir + "mrd2skb_sememes.npy",
            _sorted_unique_terms(mrd2skb_mtx_df.columns))

    # save "mrd2skb_valid_words.npy".
    np.save(output_data_dir + "mrd2skb_valid_words.npy",
            _sorted_unique_terms(mrd2skb_mtx_df.index))

    # save "mrd2skb_dict.npy".
    # Reduce the sememes of every sense to their terms.
    sememe_terms = mrd2skb_list_df.copy().map(
        lambda senses: [parse_sense_str(sense)[term_colname]
                        for sense in senses])

    # Group the senses by the term part of their sense string.
    term_keys = mrd2skb_list_df.index.str.rsplit(
        '.', n=2).map(lambda parts: parts[0])

    grouped_senses = sememe_terms.groupby(term_keys).apply(_senses_of_term)

    mrd2skb_dict = grouped_senses.to_dict()
    np.save(output_data_dir + "mrd2skb_dict.npy", mrd2skb_dict)


In [None]:
# Generate skb list dataframe with binary sememe annotations.
# The annotation threshold is taken from the hyperparameters.
mrd2skb_list_df = gen_mrd2skb_list_df(mrd2skb_mtx_df, hyperparams)

# Print and save results.
log_obj(logging.INFO, mrd2skb_list_df)
mrd2skb_list_df.to_pickle(output_data_dir + "mrd2skb_list_df.pkl")


In [None]:
# Reload the skb list dataframe from the output pickle.
mrd2skb_list_df = pd.read_pickle(output_data_dir + "mrd2skb_list_df.pkl")

log_obj(logging.INFO, mrd2skb_list_df)


In [None]:
# generate and save skb outputs. these files are used in ThuNLP DictSKB code.
# Both functions write their files into the output data directory.
generate_txt_outputs(mrd2skb_mtx_df, mrd2skb_list_df)
generate_npy_outputs(mrd2skb_mtx_df, mrd2skb_list_df)


In [None]:
# Sanity checks for the skb file outputs.
mrd2skb_sememes = np.load(output_data_dir + "mrd2skb_sememes.npy")
mrd2skb_valid_words = np.load(output_data_dir + "mrd2skb_valid_words.npy")
# The dictionary is stored as a pickled object inside the array, hence
# allow_pickle; .item() extracts the dictionary itself.
mrd2skb_dict = np.load(
    output_data_dir + "mrd2skb_dict.npy", allow_pickle=True).item()

# Print the size and type of each loaded output.
print(f"{mrd2skb_sememes.size = }")
# print(f"{mrd2skb_sememes = }")
print(f"{type(mrd2skb_sememes) = }")

print(f"{mrd2skb_valid_words.size = }")
# print(f"{mrd2skb_valid_words = }")
print(f"{type(mrd2skb_valid_words) = }")

# Print the number of terms, and the entry of a sample term.
print(f"{len(mrd2skb_dict) = }")
print(f"{mrd2skb_dict['tweet'] = }")


In [None]:
# Backup functions.

def get_hparam_hash(hyperparams):
    """Return the MD5 hex digest of the JSON serialized hyperparameters.

    The keys are sorted before hashing, so that two dictionaries with the
    same content yield the same hash, regardless of the order in which their
    keys were inserted. The hash only serves as an identifier.

    Args:
        hyperparams: A JSON serializable dictionary of hyperparameters.

    Returns:
        The hash as a string of 32 hexadecimal characters.
    """
    hparams_json = json.dumps(hyperparams, sort_keys=True).encode("utf-8")
    return hashlib.md5(hparams_json).hexdigest()


def backup_data(hyperparams):
    """Back up the intermediate and output data of the current session.

    The backup directory is created under the backup data directory, and is
    named after the hyperparameter values, as
    "mrd2skb_top_bkp_x0=<val0>_x1=<val1>_...". The hyperparameters are stored
    in it as "hparams.json", next to copies of the intermediate and output
    data directories. Files of an existing backup with the same name are
    overwritten.

    Args:
        hyperparams: A JSON serializable dictionary of hyperparameters.
    """
    hparam_str = "_".join(
        f"x{idx}={val}" for idx, val in enumerate(hyperparams.values()))
    cur_backup_dir = backup_data_dir + f"mrd2skb_top_bkp_{hparam_str}/"
    pathlib.Path(cur_backup_dir).mkdir(parents=True, exist_ok=True)

    # Non-ASCII characters are written as they are (ensure_ascii=False), so
    # the encoding is fixed rather than left to the locale.
    with open(cur_backup_dir + 'hparams.json', 'w', encoding='utf-8') as fp:
        json.dump(hyperparams, fp, ensure_ascii=False)

    for src_dir, dst_name in ((interm_data_dir, "interm/"),
                              (output_data_dir, "output/")):
        shutil.copytree(src_dir, cur_backup_dir + dst_name,
                        dirs_exist_ok=True)


In [None]:
# Backup mrd2skb data for the current session.
# The backup directory is named after the current hyperparameter values.
backup_data(hyperparams)


### MRD2SKB Tuning

### Evaluations

To perform the evaluations, download the DictSKB repo from Github: https://github.com/thunlp/DictSKB

Prepare an appropriate conda environment for each evaluation task, as documented in the repo itself.

Then, perform the following file substitutions to obtain the evaluation results for MRD2SKB:

- adversarial_attack/core_sememe_dict.npy <- mrd2skb_dict.npy
- consistency_check/core_sememe_dict.npy <- mrd2skb_dict.npy
- lm_sdlm/sememe_dict.uncased.npy <- mrd2skb_dict.npy

- consistency_check/dict_sememes.npy <- mrd2skb_sememes.npy
- consistency_check/dict_sememes.npy <- mrd2skb_valid_words.npy

- nli/sememe_dict.txt <- mrd2skb_skb.txt
- nli/sememes.txt <- mrd2skb_sememes.txt

In order not to change the code in DictSKB, keep the names of the files the same, just overwrite the contents.

Then, perform the evaluations as described in the DictSKB repo.

## References