Improve temporary data management using temp directories #133

Open · wants to merge 21 commits into base: master from bugfix/directory_management

Commits (21)
cf452be
define a temporary directory where the data is temporarily stored
lfoppiano Mar 16, 2022
8305806
fix reformatting
lfoppiano Mar 16, 2022
65697aa
recreate model tmp directory before training if it exists
lfoppiano Mar 16, 2022
709166a
add directory management to classification
lfoppiano Mar 28, 2022
656e53e
add --output option to dump the model in a specified directory as in …
lfoppiano Mar 28, 2022
7011375
revert back the training
lfoppiano Mar 28, 2022
23c5fad
revert back another temporary change
lfoppiano Mar 28, 2022
4d57727
cleanup unused imports
lfoppiano Mar 30, 2022
6c3b169
missing temp_directory
lfoppiano Mar 30, 2022
bfc0ab7
uniform weight & configuration file names using common constants, cop…
lfoppiano Mar 30, 2022
7cc4083
Merge branch 'master' into bugfix/directory_management
lfoppiano Mar 30, 2022
fec4aae
missing import
lfoppiano Mar 30, 2022
2c75c3b
Merge branch 'master' into bugfix/directory_management
lfoppiano May 19, 2022
9471bc0
fix missing import
lfoppiano May 19, 2022
29d5f85
update applications
lfoppiano May 19, 2022
ae1e50e
cosmetics
lfoppiano May 19, 2022
a2db5a1
configuration of tmp_path from the resource-registry
lfoppiano May 25, 2022
cb8ce8f
pass tmp-path to the trainer and set default tmp_path for train_fold
lfoppiano May 25, 2022
5abe6c0
Merge branch 'master' into bugfix/directory_management
lfoppiano May 25, 2022
c1c6de9
Set default path to data/model in parameters
lfoppiano May 30, 2022
4b56bc1
simplification of the call to model.save() in the application scripts
lfoppiano May 30, 2022
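
Taken together, the commits above add an --output option to the application scripts and move intermediate training data into managed temporary directories. The trainer and resource-registry changes themselves are not part of the diffs shown below, so the following is only a rough sketch of the general pattern, with made-up helper names rather than the actual DeLFT API:

import os
import shutil
import tempfile

DEFAULT_OUTPUT_DIR = "data/models"  # hypothetical default, standing in for the new shared constants

def train_with_temp_dir(train_fn, output_directory=None, tmp_path=None):
    # Run train_fn(workdir) inside a temporary working directory and keep only the final model.
    output_directory = output_directory or DEFAULT_OUTPUT_DIR
    with tempfile.TemporaryDirectory(dir=tmp_path) as workdir:
        # checkpoints and other intermediate files are written under workdir
        model_dir = train_fn(workdir)
        os.makedirs(output_directory, exist_ok=True)
        final_path = os.path.join(output_directory, os.path.basename(model_dir))
        # copy only the finished model out of the temporary area; the rest is discarded on exit
        shutil.copytree(model_dir, final_path, dirs_exist_ok=True)
    return final_path
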
21 changes: 12 additions & 9 deletions delft/applications/citationClassifier.py
@@ -5,6 +5,7 @@
import argparse
import time
from delft.textClassification.models import architectures
from delft.utilities.misc import DEFAULT_DATA_MODEL_PATH_TEXT_CLASSIFICATION

list_classes = [
"negative",
@@ -34,7 +35,7 @@ def configure(architecture):
return batch_size, maxlen, patience, early_stop, max_epoch


def train(embeddings_name, fold_count, architecture="gru", transformer=None):
def train(embeddings_name, fold_count, architecture="gru", transformer=None, output_directory=None):
batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

model = Classifier('citations_'+architecture, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count,
@@ -48,11 +49,12 @@ def train(embeddings_name, fold_count, architecture="gru", transformer=None):
model.train(xtr, y)
else:
model.train_nfold(xtr, y)

# saving the model
model.save()
model.save(output_directory)


def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None):
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None, output_directory=None):
batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

model = Classifier('citations_'+architecture, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count,
@@ -69,9 +71,8 @@ def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=
model.train(x_train, y_train)
else:
model.train_nfold(x_train, y_train)

# saving the model
model.save()

model.save(output_directory)

model.eval(x_test, y_test)

@@ -118,6 +119,7 @@ def classify(texts, output_format, architecture="gru", embeddings_name=None, tra
"HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \
"for model names"
)
parser.add_argument("--output", help="Directory where to save a trained model.", default=DEFAULT_DATA_MODEL_PATH_TEXT_CLASSIFICATION)

args = parser.parse_args()

@@ -126,26 +128,27 @@ def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=

embeddings_name = args.embedding
transformer = args.transformer
output = args.output

architecture = args.architecture
if architecture not in architectures:
print('unknown model architecture, must be one of '+str(architectures))

if transformer == None and embeddings_name == None:
if transformer is None and embeddings_name is None:
# default word embeddings
embeddings_name = "glove-840B"

if args.action == 'train':
if args.fold_count < 1:
raise ValueError("fold-count should be equal or more than 1")

train(embeddings_name, args.fold_count, architecture=architecture, transformer=transformer)
train(embeddings_name, args.fold_count, architecture=architecture, transformer=transformer, output_directory=output)

if args.action == 'train_eval':
if args.fold_count < 1:
raise ValueError("fold-count should be equal or more than 1")

y_test = train_and_eval(embeddings_name, args.fold_count, architecture=architecture, transformer=transformer)
y_test = train_and_eval(embeddings_name, args.fold_count, architecture=architecture, transformer=transformer, output_directory=output)

if args.action == 'classify':
someTexts = ['One successful strategy [15] computes the set-similarity involving (multi-word) keyphrases about the mentions and the entities, collected from the KG.',
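
In this script the separate "saving the model" branches collapse into a single model.save(output_directory) call, with the CLI default taken from DEFAULT_DATA_MODEL_PATH_TEXT_CLASSIFICATION and the function-level default left at None. That only works if save() itself falls back to a default location when it receives None; a minimal sketch of such a fallback (the real Classifier class may resolve the path differently, and the constant's value here is a stand-in):

import os

DEFAULT_DATA_MODEL_PATH = "data/models/textClassification"  # stand-in value; the real constant lives in delft/utilities/misc.py

class ClassifierSketch:
    def __init__(self, name):
        self.name = name

    def save(self, output_directory=None):
        # fall back to the shared default when the caller passes None
        target = output_directory if output_directory else DEFAULT_DATA_MODEL_PATH
        model_dir = os.path.join(target, self.name)
        os.makedirs(model_dir, exist_ok=True)
        # ... weights, config and preprocessor would be written here ...
        return model_dir
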
53 changes: 29 additions & 24 deletions delft/applications/dataseerClassifier.py
@@ -1,15 +1,15 @@
import json
from delft.utilities.Embeddings import Embeddings
from delft.utilities.Utilities import split_data_and_labels
from delft.textClassification.reader import load_dataseer_corpus_csv
from delft.textClassification.reader import vectorize as vectorizer
import delft.textClassification
from delft.textClassification import Classifier
import argparse
import time
from delft.textClassification.models import architectures
import numpy as np

from delft.utilities.misc import DEFAULT_DATA_MODEL_PATH_TEXT_CLASSIFICATION

"""
Classifier for deciding if a sentence introduce a dataset or not, and prediction of the
dataset type.
@@ -33,7 +33,7 @@ def configure(architecture):
return batch_size, maxlen, patience, early_stop, max_epoch


def train(embeddings_name, fold_count, architecture="gru", transformer=None, cascaded=False):
def train(embeddings_name, fold_count, architecture="gru", transformer=None, cascaded=False, output_directory=None):
print('loading binary dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-binary.csv")

@@ -50,8 +50,8 @@ def train(embeddings_name, fold_count, architecture="gru", transformer=None, cas
model.train(xtr, y)
else:
model.train_nfold(xtr, y)
# saving the model
model.save()

model.save(output_directory)

print('loading reuse dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-reuse.csv")
@@ -67,8 +67,8 @@ def train(embeddings_name, fold_count, architecture="gru", transformer=None, cas
model.train(xtr, y)
else:
model.train_nfold(xtr, y)
# saving the model
model.save()

model.save(output_directory)

print('loading first-level dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-1.csv")
@@ -85,9 +85,10 @@ def train(embeddings_name, fold_count, architecture="gru", transformer=None, cas
model.train(xtr, y)
else:
model.train_nfold(xtr, y)

# saving the model
model.save()
model.save(output_directory)

'''
print('training second-level dataset subtype corpus...')
xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-multilevel.csv")
@@ -128,23 +129,23 @@ def train(embeddings_name, fold_count, architecture="gru", transformer=None, cas
model.save()
'''

def train_and_eval(embeddings_name=None, fold_count=1, architecture="gru", transformer=None, cascaded=False):
def train_and_eval(embeddings_name=None, fold_count=1, architecture="gru", transformer=None, cascaded=False, output_directory=None):
if cascaded:
return train_eval_cascaded(embeddings_name, fold_count, architecture=architecture, transformer=transformer)
return train_eval_cascaded(embeddings_name, fold_count, architecture=architecture, transformer=transformer, output_directory=output_directory)

# classifier for deciding if we have a dataset or not in a sentence
train_and_eval_binary(embeddings_name, fold_count, architecture=architecture, transformer=transformer)
train_and_eval_binary(embeddings_name, fold_count, architecture=architecture, transformer=transformer, output_directory=output_directory)

# classifier for deciding if the introduced dataset is a reuse of an existing one or is a new dataset
train_and_eval_reuse(embeddings_name, fold_count, architecture=architecture, transformer=transformer)
train_and_eval_reuse(embeddings_name, fold_count, architecture=architecture, transformer=transformer, output_directory=output_directory)

# classifier for first level data type hierarchy
train_and_eval_primary(embeddings_name, fold_count, architecture=architecture, transformer=transformer)
train_and_eval_primary(embeddings_name, fold_count, architecture=architecture, transformer=transformer, output_directory=output_directory)

# classifier for second level data type hierarchy (subtypes)
#train_and_eval_secondary(embeddings_name, fold_count, architecture=architecture, transformer=transformer)

def train_and_eval_binary(embeddings_name, fold_count, architecture="gru", transformer=None):
def train_and_eval_binary(embeddings_name, fold_count, architecture="gru", transformer=None, output_directory=None):
print('loading dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-binary.csv")

@@ -179,9 +180,9 @@ def train_and_eval_binary(embeddings_name, fold_count, architecture="gru", trans
model.eval(x_test, y_test)

# saving the model
model.save()
model.save(output_directory)

def train_and_eval_reuse(embeddings_name, fold_count, architecture="gru", transformer=None):
def train_and_eval_reuse(embeddings_name, fold_count, architecture="gru", transformer=None, output_directory=None):
print('loading dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-reuse.csv")

@@ -216,9 +217,9 @@ def train_and_eval_reuse(embeddings_name, fold_count, architecture="gru", transf
model.eval(x_test, y_test)

# saving the model
model.save()
def train_and_eval_primary(embeddings_name, fold_count, architecture="gru", transformer=None):
model.save(output_directory)

def train_and_eval_primary(embeddings_name, fold_count, architecture="gru", transformer=None, output_directory=None):
print('loading dataset type corpus...')
xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-multilevel.csv")

@@ -252,7 +253,7 @@ def train_and_eval_primary(embeddings_name, fold_count, architecture="gru", tran
model.eval(x_test, y_test)

# saving the model
model.save()
model.save(output_directory)

def train_and_eval_secondary(embeddings_name, fold_count, architecture="gru", transformer=None):
print('training second-level dataset subtype corpus...')
@@ -506,6 +507,7 @@ def build_prior_class_distribution():
"HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \
"for model names"
)
parser.add_argument("--output", help="Directory where to save a trained model.", default=DEFAULT_DATA_MODEL_PATH_TEXT_CLASSIFICATION)

args = parser.parse_args()

@@ -515,25 +517,28 @@ def build_prior_class_distribution():
embeddings_name = args.embedding
cascaded = args.cascaded
transformer = args.transformer
output = args.output

architecture = args.architecture
if architecture not in architectures:
print('unknown model architecture, must be one of '+str(architectures))

if transformer == None and embeddings_name == None:
if transformer is None and embeddings_name is None:
# default word embeddings
embeddings_name = "glove-840B"

if args.action == 'train':
if args.fold_count < 1:
raise ValueError("fold-count should be equal or more than 1")

train(embeddings_name=embeddings_name, fold_count=args.fold_count, architecture=architecture, transformer=transformer, cascaded=cascaded)
train(embeddings_name=embeddings_name, fold_count=args.fold_count, architecture=architecture,
transformer=transformer, cascaded=cascaded, output_directory=output)

if args.action == 'train_eval':
if args.fold_count < 1:
raise ValueError("fold-count should be equal or more than 1")
y_test = train_and_eval(embeddings_name=embeddings_name, fold_count=args.fold_count, architecture=architecture, transformer=transformer, cascaded=cascaded)
y_test = train_and_eval(embeddings_name=embeddings_name, fold_count=args.fold_count, architecture=architecture,
transformer=transformer, cascaded=cascaded, output_directory=output)

if args.action == 'classify':
someTexts = ['Labeling yield and radiochemical purity was analyzed by instant thin layered chromatography (ITLC).',
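
Note that train() and train_and_eval() in this script fit several classifiers (binary, reuse, first-level) and hand the same output_directory to every save() call. Assuming save() nests each model under its own name, as in the sketch after the citationClassifier.py diff, the classifiers can share one --output directory; the names below are hypothetical and only follow the naming pattern used elsewhere in DeLFT:

import os

output = "data/models/textClassification"  # hypothetical --output value
for name in ("dataseer-binary_gru", "dataseer-reuse_gru", "dataseer-first_gru"):
    # each classifier would end up in its own subdirectory of the shared output directory
    print(os.path.join(output, name))
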
10 changes: 4 additions & 6 deletions delft/applications/datasetTagger.py
@@ -8,7 +8,8 @@

from delft.sequenceLabelling import Sequence
from delft.sequenceLabelling.reader import load_data_and_labels_json_offsets
from delft.utilities.misc import parse_number_ranges
from delft.utilities.misc import DEFAULT_DATA_MODEL_PATH_SEQUENCE_LABELLING


def configure(architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None, max_epoch=-1, use_ELMo=False):
"""
@@ -94,10 +95,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
print("training runtime: %s seconds " % runtime)

# saving the model
if output_path:
model.save(output_path)
else:
model.save()
model.save(output_path)


# split data, train a model and evaluate it
@@ -245,7 +243,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
"HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \
"for model names"
)
parser.add_argument("--output", help="Directory where to save a trained model.")
parser.add_argument("--output", help="Directory where to save a trained model.", default=DEFAULT_DATA_MODEL_PATH_SEQUENCE_LABELLING)
parser.add_argument("--input", help="Grobid data file to be used for training (train action), for training and " +
"evaluation (train_eval action) or just for evaluation (eval action).")
parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.")
19 changes: 7 additions & 12 deletions delft/applications/grobidTagger.py
@@ -8,8 +8,7 @@

from delft.sequenceLabelling import Sequence
from delft.sequenceLabelling.reader import load_data_and_labels_crf_file
from delft.sequenceLabelling.reader import load_data_crf_string
from delft.utilities.misc import parse_number_ranges
from delft.utilities.misc import DEFAULT_DATA_MODEL_PATH_SEQUENCE_LABELLING

MODEL_LIST = ['affiliation-address', 'citation', 'date', 'header', 'name-citation', 'name-header', 'software', 'figure', 'table', 'reference-segmenter']

@@ -149,10 +148,7 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
print("training runtime: %s seconds " % (runtime))

# saving the model
if output_path:
model.save(output_path)
else:
model.save()
model.save(output_path)


# split data, train a GROBID model and evaluate it
@@ -209,10 +205,7 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
model.eval(x_eval, y_eval, features=f_eval)

# saving the model (must be called after eval for multiple fold training)
if output_path:
model.save(output_path)
else:
model.save()
model.save(output_path)


# split data, train a GROBID model and evaluate it
@@ -272,6 +265,7 @@ def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', featu

return annotations


class Tasks:
TRAIN = 'train'
TRAIN_EVAL = 'train_eval'
@@ -302,7 +296,8 @@ class Tasks:
parser.add_argument("action", choices=actions)
parser.add_argument("--fold-count", type=int, default=1, help="Number of fold to use when evaluating with n-fold "
"cross validation.")
parser.add_argument("--architecture", help="Type of model architecture to be used, one of "+str(architectures))
parser.add_argument("--architecture", choices=architectures, help="Type of model architecture to be used",
required=True)
parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings")

# group_embeddings = parser.add_mutually_exclusive_group(required=False)
@@ -324,7 +319,7 @@ class Tasks:
"HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \
"for model names"
)
parser.add_argument("--output", help="Directory where to save a trained model.")
parser.add_argument("--output", help="Directory where to save a trained model.", default=DEFAULT_DATA_MODEL_PATH_SEQUENCE_LABELLING)
parser.add_argument("--input", help="Grobid data file to be used for training (train action), for training and " +
"evaluation (train_eval action) or just for evaluation (eval action).")
parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.")
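
The datasetTagger.py and grobidTagger.py hunks above also drop the old "if output_path: model.save(output_path) else: model.save()" branching, relying on save() to handle a None argument as sketched earlier. The commits that configure tmp_path from the resource-registry and pass it to the trainer are not shown in this excerpt; a minimal sketch of that wiring, with the file name and key assumed from the commit messages rather than taken from the actual code, could be:

import json
import tempfile

def resolve_tmp_path(registry_path="delft/resources-registry.json"):
    # read an optional tmp_path entry from the registry, falling back to the system temp dir
    try:
        with open(registry_path, encoding="utf-8") as f:
            registry = json.load(f)
        return registry.get("tmp_path") or tempfile.gettempdir()
    except FileNotFoundError:
        return tempfile.gettempdir()

# the resolved path could then be handed to the trainer, e.g. trainer.train(..., tmp_path=resolve_tmp_path())
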