In [2]:
import numpy as np
import random as rn

# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/fchollet/keras/issues/2280#issuecomment-306959926

import os
os.environ['PYTHONHASHSEED'] = '01'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(421)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(123451)


import logging

import numpy as np
from keras import layers
from keras.models import Model, load_model

from data.datasets import *
from eval import keras_metrics, metrics
from nlp import tokenizer as tk
from utils import info, preprocessing, postprocessing, plots

In [3]:

# LOGGING CONFIGURATION

logging.basicConfig(
    format='%(asctime)s\t%(levelname)s\t%(message)s',
    level=logging.DEBUG)

info.log_versions()

# END LOGGING CONFIGURATION

# GLOBAL VARIABLES

SAVE_MODEL = False
MODEL_PATH = "models/mergernn.h5"
SHOW_PLOTS = False

# END GLOBAL VARIABLES

# Dataset and hyperparameters for each dataset

DATASET = Hulth

if DATASET == Semeval2017:
    tokenizer = tk.tokenizers.nltk
    DATASET_FOLDER = "data/Semeval2017"
    MAX_DOCUMENT_LENGTH = 400
    MAX_VOCABULARY_SIZE = 20000
    EMBEDDINGS_SIZE = 300
    BATCH_SIZE = 32
    EPOCHS = 10
elif DATASET == Hulth:
    tokenizer = tk.tokenizers.nltk
    DATASET_FOLDER = "data/Hulth2003"
    MAX_DOCUMENT_LENGTH = 800
    MAX_VOCABULARY_SIZE = 20000
    EMBEDDINGS_SIZE = 300
    BATCH_SIZE = 32
    EPOCHS = 41
else:
    raise NotImplementedError("Can't set the hyperparameters: unknown dataset")


# END PARAMETERS

logging.info("Loading dataset...")

data = DATASET(DATASET_FOLDER)

train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()

train_doc, train_answer = tk.tokenize_set(train_doc_str,train_answer_str,tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str,test_answer_str,tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str,val_answer_str,tokenizer)

# Sanity check
# logging.info("Sanity check: %s",metrics.precision(test_answer,test_answer))

logging.info("Dataset loaded. Preprocessing data...")

2021-12-12 04:17:48,940	INFO	Keras version 2.7.0
2021-12-12 04:17:48,942	INFO	Numpy version 1.20.1
2021-12-12 04:17:48,943	INFO	Tensorflow version 2.7.0
2021-12-12 04:17:48,945	INFO	Loading dataset...
2021-12-12 04:17:48,945	DEBUG	Initialized dataset Hulth, 2003 from folder data/Hulth2003
2021-12-12 04:17:49,760	DEBUG	Loaded training set for dataset Hulth, 2003
2021-12-12 04:17:50,141	DEBUG	Loaded test set for dataset Hulth, 2003
2021-12-12 04:17:50,529	DEBUG	Loaded validation set for dataset Hulth, 2003
2021-12-12 04:17:53,126	INFO	Dataset loaded. Preprocessing data...


In [4]:
train_x,train_y,test_x,test_y,val_x,val_y,embedding_matrix = preprocessing.\
    prepare_sequential(train_doc, train_answer, test_doc, test_answer,val_doc,val_answer,
                       max_document_length=MAX_DOCUMENT_LENGTH,
                       max_vocabulary_size=MAX_VOCABULARY_SIZE,
                       embeddings_size=EMBEDDINGS_SIZE)

# weigh training examples: everything that's not class 0 (not kp)
# gets a heavier score
#train_y_weights = np.argmax(train_y,axis=2) # this removes the one-hot representation
#train_y_weights[train_y_weights > 0] = 20
#train_y_weights[train_y_weights < 1] = 1

from sklearn.utils import class_weight
train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
                             np.shape(train_y_weights))


logging.info("Data preprocessing complete.")
logging.info("Maximum possible recall: %s",
             metrics.recall(test_answer,
                               postprocessing.get_words(test_doc,postprocessing.undo_sequential(test_y))))

2021-12-12 04:17:53,339	DEBUG	Fitting dictionary on 2000 documents...
2021-12-12 04:17:53,422	DEBUG	Dictionary fitting completed. Found 18312 unique tokens
2021-12-12 04:17:53,499	DEBUG	Longest training document : 556 tokens
2021-12-12 04:17:53,500	DEBUG	Longest test document :     380 tokens
2021-12-12 04:17:53,500	DEBUG	Longest validation document : 314 tokens
2021-12-12 04:17:53,743	DEBUG	Training set samples size   : (1000, 800)
2021-12-12 04:17:53,744	DEBUG	Training set answers size   : (1000, 800, 3)
2021-12-12 04:17:53,744	DEBUG	Test set samples size       : (500, 800)
2021-12-12 04:17:53,745	DEBUG	Test set answers size       : (500, 800, 3) 
2021-12-12 04:17:53,745	DEBUG	Validation set samples size : (500, 800)
2021-12-12 04:17:53,745	DEBUG	Validation set answers size : (500, 800, 3) 
2021-12-12 04:17:53,746	DEBUG	Loading GloVe pre-trained embeddings from glove.6B.300d.txt
2021-12-12 04:18:12,799	DEBUG	Total embeddings found: 400000.
2021-12-12 04:18:12,800	DEBUG	Building embed

In [5]:
MODEL_PATH = './model/new_kp20k_mergebigru_1_8_epoch_10'

logging.info("Loading existing model from %s...",MODEL_PATH)
model = load_model(MODEL_PATH)
logging.info("Completed loading model from file")


logging.info("Predicting on test set...")
output = model.predict(x=[test_x,test_x], verbose=1)
logging.debug("Shape of output array: %s",np.shape(output))

obtained_tokens = postprocessing.undo_sequential(output)
obtained_words = postprocessing.get_words(test_doc,obtained_tokens)

precision = metrics.precision(test_answer,obtained_words)
recall = metrics.recall(test_answer,obtained_words)
f1 = metrics.f1(precision,recall)

print("###    Obtained Scores    ###")
print("###     (full dataset)    ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall    : %.4f" % recall)
print("### F1        : %.4f" % f1)
print("###                       ###")

keras_precision = keras_metrics.keras_precision(test_y,output)
keras_recall = keras_metrics.keras_recall(test_y,output)
keras_f1 = keras_metrics.keras_f1(test_y,output)

print("###    Obtained Scores    ###")
print("###    (fixed dataset)    ###")
print("###")
print("### Precision : %.4f" % keras_precision)
print("### Recall    : %.4f" % keras_recall)
print("### F1        : %.4f" % keras_f1)
print("###                       ###")

clean_words = postprocessing.get_valid_patterns(obtained_words)

precision = metrics.precision(test_answer,clean_words)
recall = metrics.recall(test_answer,clean_words)
f1 = metrics.f1(precision,recall)

print("###    Obtained Scores    ###")
print("### (full dataset,        ###")
print("###  pos patterns filter) ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall    : %.4f" % recall)
print("### F1        : %.4f" % f1)
print("###                       ###")


obtained_words_top = postprocessing.get_top_words(test_doc, output, 5)

precision_top = metrics.precision(test_answer, obtained_words_top)
recall_top = metrics.recall(test_answer, obtained_words_top)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 5) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 10)

precision_top = metrics.precision(test_answer, obtained_words_top)
recall_top = metrics.recall(test_answer, obtained_words_top)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 10)###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 15)

precision_top = metrics.precision(test_answer, obtained_words_top)
recall_top = metrics.recall(test_answer, obtained_words_top)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 15)###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")


print("###                       ###")
print("###                       ###")
print("###       STEMMING        ###")
print("###                       ###")
print("###                       ###")

STEM_MODE = metrics.stemMode.both

precision = metrics.precision(test_answer, obtained_words,STEM_MODE)
recall = metrics.recall(test_answer, obtained_words,STEM_MODE)
f1 = metrics.f1(precision, recall)

print("###    Obtained Scores    ###")
print("###     (full dataset)    ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall    : %.4f" % recall)
print("### F1        : %.4f" % f1)
print("###                       ###")

keras_precision = keras_metrics.keras_precision(test_y, output)
keras_recall = keras_metrics.keras_recall(test_y, output)
keras_f1 = keras_metrics.keras_f1(test_y, output)

print("###    Obtained Scores    ###")
print("###    (fixed dataset)    ###")
print("###")
print("### Precision : %.4f" % keras_precision)
print("### Recall    : %.4f" % keras_recall)
print("### F1        : %.4f" % keras_f1)
print("###                       ###")

clean_words = postprocessing.get_valid_patterns(obtained_words)

precision = metrics.precision(test_answer, clean_words,STEM_MODE)
recall = metrics.recall(test_answer, clean_words,STEM_MODE)
f1 = metrics.f1(precision, recall)

print("###    Obtained Scores    ###")
print("### (full dataset,        ###")
print("###  pos patterns filter) ###")
print("###")
print("### Precision : %.4f" % precision)
print("### Recall    : %.4f" % recall)
print("### F1        : %.4f" % f1)
print("###                       ###")



obtained_words_top = postprocessing.get_top_words(test_doc, output, 5)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 5) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 10)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 10)###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 15)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 15)###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

2021-12-12 04:18:13,289	INFO	Loading existing model from ./model/new_kp20k_mergebigru_1_8_epoch_10...
2021-12-12 04:18:25,190	INFO	Completed loading model from file
2021-12-12 04:18:25,190	INFO	Predicting on test set...




2021-12-12 04:18:38,938	DEBUG	Shape of output array: (500, 800, 3)


###    Obtained Scores    ###
###     (full dataset)    ###
###
### Precision : 0.0817
### Recall    : 0.0468
### F1        : 0.0595
###                       ###
###    Obtained Scores    ###
###    (fixed dataset)    ###
###
### Precision : 0.0823
### Recall    : 0.0529
### F1        : 0.0644
###                       ###
###    Obtained Scores    ###
### (full dataset,        ###
###  pos patterns filter) ###
###
### Precision : 0.1100
### Recall    : 0.0495
### F1        : 0.0682
###                       ###
###    Obtained Scores    ###
### (full dataset, top 5) ###
###
### Precision : 0.1036
### Recall    : 0.0401
### F1        : 0.0578
###                       ###
###    Obtained Scores    ###
### (full dataset, top 10)###
###
### Precision : 0.0900
### Recall    : 0.0462
### F1        : 0.0611
###                       ###
###    Obtained Scores    ###
### (full dataset, top 15)###
###
### Precision : 0.0857
### Recall    : 0.0466
### F1        : 0.0604
###                   

In [6]:
obtained_words_top = postprocessing.get_top_words(test_doc, output, 3)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 3) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

###    Obtained Scores    ###
### (full dataset, top 3) ###
###
### Precision : 0.1161
### Recall    : 0.0313
### F1        : 0.0494
###                       ###


In [7]:
obtained_words_top

{'2007': [['trauma'], ['trauma', 'research'], ['medicine', 'visible']],
 '2042': [['two-dimensional', 'and'],
  ['instability', '-'],
  ['space', 'plasmas']],
 '308': [['applet'], ['connectivity'], ['individual', 'pedagogical']],
 '215': [['decision', 'maker'], ['ten'], ['illustration']],
 '250': [['much'], ['easier'], ['enterprise']],
 '1996': [['synthetic', 'images', 'based'],
  ['through', 'three', 'metrics'],
  ['pixel', 'metric', 'which']],
 '2162': [['>', '/sup'], ['free', 'product'], ['zfc', 'using', 'pcf']],
 '2127': [['constructive', 'system'], ['identities'], ['illustrative']],
 '290': [['pole', 'placement'], ['gimbal'], ['microactuator']],
 '1956': [['dynamical', 'transition'],
  ['one', 'period', ')'],
  ['period', 'n']],
 '228': [['capillary', 'gel'],
  ['subsequent', 'ultrathin-layer', 'gel'],
  ['ultrathin-layer', 'gel', 'electrophoresis', 'analysis']],
 '370': [['mentioned'], ['national', 'governments'], ['real', 'laws']],
 '2082': [['manner', 'through'],
  ['strategic'

In [9]:
output.shape

(500, 800, 3)

In [11]:
output[0][0]

array([0.9821014 , 0.00326481, 0.01463387], dtype=float32)

In [12]:
obtained_words_top = postprocessing.get_top_words(test_doc, output, 3)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 3) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 5)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 5) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

obtained_words_top = postprocessing.get_top_words(test_doc, output, 10)

precision_top = metrics.precision(test_answer, obtained_words_top,STEM_MODE)
recall_top = metrics.recall(test_answer, obtained_words_top,STEM_MODE)
f1_top = metrics.f1(precision_top, recall_top)

print("###    Obtained Scores    ###")
print("### (full dataset, top 10) ###")
print("###")
print("### Precision : %.4f" % precision_top)
print("### Recall    : %.4f" % recall_top)
print("### F1        : %.4f" % f1_top)
print("###                       ###")

###    Obtained Scores    ###
### (full dataset, top 3) ###
###
### Precision : 0.1161
### Recall    : 0.0313
### F1        : 0.0494
###                       ###
###    Obtained Scores    ###
### (full dataset, top 5) ###
###
### Precision : 0.1106
### Recall    : 0.0427
### F1        : 0.0617
###                       ###
###    Obtained Scores    ###
### (full dataset, top 10) ###
###
### Precision : 0.0967
### Recall    : 0.0495
### F1        : 0.0654
###                       ###
