From 5310973af2138d0fac9ca5460a542bfe4db3fa67 Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 17 Dec 2019 10:07:33 +0530 Subject: [PATCH 1/8] cast tensors to CUDA to work with GPU --- neuralcoref/train/model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/neuralcoref/train/model.py b/neuralcoref/train/model.py index 4cdb25b..5519f02 100644 --- a/neuralcoref/train/model.py +++ b/neuralcoref/train/model.py @@ -11,6 +11,7 @@ import torch.nn as nn import torch.utils.data + class Model(nn.Module): def __init__(self, vocab_size, embedding_dim, H1, H2, H3, D_pair_in, D_single_in, dropout=0.5): super(Model, self).__init__() @@ -70,6 +71,8 @@ def forward(self, inputs, concat_axis=1): else: spans, words, single_features = inputs words = words.type(torch.LongTensor) + if self.cuda: + words = words.cuda() embed_words = self.drop(self.word_embeds(words).view(words.size()[0], -1)) single_input = torch.cat([spans, embed_words, single_features], 1) single_scores = self.single_top(single_input) @@ -77,6 +80,9 @@ def forward(self, inputs, concat_axis=1): batchsize, pairs_num, _ = ana_spans.size() ant_words_long = ant_words.view(batchsize, -1).type(torch.LongTensor) ana_words_long = ana_words.view(batchsize, -1).type(torch.LongTensor) + if self.cuda: + ant_words_long = ant_words_long.cuda() + ana_words_long = ana_words_long.cuda() ant_embed_words = self.drop(self.word_embeds(ant_words_long).view(batchsize, pairs_num, -1)) ana_embed_words = self.drop(self.word_embeds(ana_words_long).view(batchsize, pairs_num, -1)) pair_input = torch.cat([ant_spans, ant_embed_words, ana_spans, ana_embed_words, pair_features], 2) From 5986a6e7a475893da18cf71e9eb47cb4e4d5a30a Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 17 Dec 2019 10:14:22 +0530 Subject: [PATCH 2/8] add move to data subdirectory in training.md --- neuralcoref/train/training.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/neuralcoref/train/training.md b/neuralcoref/train/training.md index 82de7fc..d0e8c6c 100644 --- a/neuralcoref/train/training.md +++ b/neuralcoref/train/training.md @@ -14,12 +14,12 @@ python -m spacy download en ```` ## Get the data -The following assumes you want to train on English, Arabic or Chinese. +The following assumes you want to train on English, Arabic or Chinese. If you want to train on another language, see the section [train on a new language](#train-on-a-new-language) below. First, download the [OntoNotes 5.0 dataset](https://catalog.ldc.upenn.edu/LDC2013T19) from LDC. -Then, download the [CoNLL-2012 skeleton files](http://conll.cemantix.org/2012/data.html) from the CoNLL 2012 shared task site, +Then, download the [CoNLL-2012 skeleton files](http://conll.cemantix.org/2012/data.html) from the CoNLL 2012 shared task site, and combine these skeleton files with the OntoNotes files to get the `*._conll` text files which can be used as inputs for the training. 
This can be done by executing the script [compile_coref_data.sh](/neuralcoref/train/conll_processing_script/compile_coref_data.sh) @@ -43,9 +43,9 @@ or by following these steps: * `cat conll-2012/v4/data/train/data/my_lang/annotations/*/*/*/*.v4_gold_conll >> train.my_lang.v4_gold_conll` * `cat conll-2012/v4/data/development/data/my_lang/annotations/*/*/*/*.v4_gold_conll >> dev.my_lang.v4_gold_conll` * `cat conll-2012/v4/data/test/data/my_lang/annotations/*/*/*/*.v4_gold_conll >> test.my_lang.v4_gold_conll` - + ## Prepare the data -Once you have the set of `*.v4_gold_conll` files, you can prepare the training data by running +Once you have the set of `*.v4_gold_conll` files, move these files into seperate (`train`, `test`, `dev`) subdirectories inside `data` directory. Now, you can prepare the training data by running [conllparser.py](/neuralcoref/train/conllparser.py) on each split of the data set (`train`, `test`, `dev`) as ````bash @@ -61,8 +61,8 @@ Conllparser will: - gather the mention features in a set of numpy arrays to be used as input for the neural net model. ## Train the model -Once the files have been pre-processed -(you should have a set of `*.npy` files in a sub-directory `/numpy` in each of your (`train`|`test`|`dev`) data folder), +Once the files have been pre-processed +(you should have a set of `*.npy` files in a sub-directory `/numpy` in each of your (`train`|`test`|`dev`) data folder), you can start the training process using [learn.py](/neuralcoref/train/learn.py), for example as ````bash python -m neuralcoref.train.learn --train ./data/train/ --eval ./data/dev/ @@ -73,13 +73,13 @@ There many parameters and options for the training. You can list them with the u python -m neuralcoref.train.learn --help ```` -You can follow the training by running [Tensorboard for pyTorch](https://github.com/lanpa/tensorboard-pytorch) +You can follow the training by running [Tensorboard for pyTorch](https://github.com/lanpa/tensorboard-pytorch) (it requires a version of Tensorflow, any version will be fine). Run it with `tensorboard --logdir runs`. ## Some details on the training -The model and the training as thoroughfully described in our -[very detailed blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe). -The training process is similar to the mention-ranking training described in +The model and the training as thoroughfully described in our +[very detailed blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe). 
+The training process is similar to the mention-ranking training described in [Clark and Manning (2016)](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf), namely: - A first step of training uses a standard cross entropy loss on the mention pair labels, - A second step of training uses a cross entropy loss on the top pairs only, and From 3f9b72ff18373c2bdec98b7b4e35133767b162eb Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 17 Dec 2019 13:58:53 +0530 Subject: [PATCH 3/8] make the data directory creation instruction obvious --- neuralcoref/train/training.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neuralcoref/train/training.md b/neuralcoref/train/training.md index d0e8c6c..7e702f7 100644 --- a/neuralcoref/train/training.md +++ b/neuralcoref/train/training.md @@ -45,13 +45,13 @@ or by following these steps: * `cat conll-2012/v4/data/test/data/my_lang/annotations/*/*/*/*.v4_gold_conll >> test.my_lang.v4_gold_conll` ## Prepare the data -Once you have the set of `*.v4_gold_conll` files, move these files into seperate (`train`, `test`, `dev`) subdirectories inside `data` directory. Now, you can prepare the training data by running +Once you have the set of `*.v4_gold_conll` files, move these files into separate (`train`, `test`, `dev`) subdirectories inside a new directory. You can use the already present `data` directory or create another directory anywhere you want. Now, you can prepare the training data by running [conllparser.py](/neuralcoref/train/conllparser.py) on each split of the data set (`train`, `test`, `dev`) as ````bash -python -m neuralcoref.train.conllparser --path ./data/train/ -python -m neuralcoref.train.conllparser --path ./data/test/ -python -m neuralcoref.train.conllparser --path ./data/dev/ +python -m neuralcoref.train.conllparser --path ./$path_to_data_directory/train/ +python -m neuralcoref.train.conllparser --path ./$path_to_data_directory/test/ +python -m neuralcoref.train.conllparser --path ./$path_to_data_directory/dev/ ```` Conllparser will: From 56ef5a14206a42eea66ca25699450852a5c08dc7 Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 24 Dec 2019 17:40:48 +0530 Subject: [PATCH 4/8] option for the user to specify the required spacy model while creating the dataset --- neuralcoref/train/conllparser.py | 47 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/neuralcoref/train/conllparser.py b/neuralcoref/train/conllparser.py index 47f0634..d1ef1ce 100644 --- a/neuralcoref/train/conllparser.py +++ b/neuralcoref/train/conllparser.py @@ -119,13 +119,13 @@ def load_file(full_name, debug=False): load a *._conll file Input: full_name: path to the file Output: list of tuples for each conll doc in the file, where the tuple contains: - (utts_text ([str]): list of the utterances in the document - utts_tokens ([[str]]): list of the tokens (conll words) in the document + (utts_text ([str]): list of the utterances in the document + utts_tokens ([[str]]): list of the tokens (conll words) in the document utts_corefs: list of coref objects (dicts) with the following properties: coref['label']: id of the coreference cluster, coref['start']: start index (index of first token in the utterance), coref['end': end index (index of last token in the utterance). 
- utts_speakers ([str]): list of the speaker associated to each utterances in the document + utts_speakers ([str]): list of the speaker associated to each utterances in the document name (str): name of the document part (str): part of the document ) @@ -377,11 +377,11 @@ def get_feature_array(self, doc_id, feature=None, compressed=True, debug=False): mentions_features: (N, Fs) mentions_labels: (N, 1) mentions_pairs_start_index: (N, 1) index of beggining of pair list in pair_labels - mentions_pairs_length: (N, 1) number of pairs (i.e. nb of antecedents) for each mention + mentions_pairs_length: (N, 1) number of pairs (i.e. nb of antecedents) for each mention pairs_features: (P, Fp) pairs_labels: (P, 1) pairs_ant_idx: (P, 1) => indexes of antecedents mention for each pair (mention index in doc) - """ + """ if not self.mentions: if debug: print("No mention in this doc !") return {} @@ -552,7 +552,7 @@ def list_undetected_mentions(self, data_path, save_file, debug=True): out_file.write(out_str) if debug: print(out_str) - def read_corpus(self, data_path, debug=False): + def read_corpus(self, data_path, model=None, debug=False): print("🌋 Reading files") for dirpath, _, filenames in os.walk(data_path): print("In", dirpath, os.path.abspath(dirpath)) @@ -595,20 +595,24 @@ def read_corpus(self, data_path, debug=False): embedding_extractor=self.embed_extractor, conll=CONLL_GENRES[name[:2]])) print("🌋 Loading spacy model") - model_options = ['en_core_web_lg', 'en_core_web_md', 'en_core_web_sm', 'en'] - model = None - for model_option in model_options: - if not model: - try: - spacy.info(model_option) - model = model_option - print("Loaded model", model_option) - except: - print("Could not detect model", model_option) - if not model: - print("Could not detect any suitable English model") - return + if model is None: + model_options = ['en_core_web_lg', 'en_core_web_md', 'en_core_web_sm', 'en'] + model = None # this declaration is redundant + for model_option in model_options: + if not model: + try: + spacy.info(model_option) + model = model_option + print("Loading model", model_option) + except: + print("Could not detect model", model_option) + if not model: + print("Could not detect any suitable English model") + return + else: + spacy.info(model) + print("Loading model", model) nlp = spacy.load(model) print("🌋 Parsing utterances and filling docs with use_gold_mentions=" + (str(bool(self.gold_mentions)))) doc_iter = (s for s in self.utts_text) @@ -618,7 +622,7 @@ def read_corpus(self, data_path, debug=False): spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple if debug: print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens) doc = spacy_tokens - if debug: + if debug: out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \ " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id) print(out_str.encode('utf-8')) @@ -722,6 +726,7 @@ def _vocabulary_to_file(path, vocabulary): parser.add_argument('--n_jobs', type=int, default=1, help='Number of parallel jobs (default 1)') parser.add_argument('--gold_mentions', type=int, default=0, help='Use gold mentions (1) or not (0, default)') parser.add_argument('--blacklist', type=int, default=0, help='Use blacklist (1) or not (0, default)') + parser.add_argument('--spacy_model', type=str, default=None, help='spacys language model') args = parser.parse_args() if args.key is None: args.key = args.path + "/key.txt" @@ -738,7 +743,7 @@ def _vocabulary_to_file(path, vocabulary): print(file) os.remove(SAVE_DIR + file) start_time = 
time.time() - CORPUS.read_corpus(args.path) + CORPUS.read_corpus(args.path, model=args.spacy_model) print('=> read_corpus time elapsed', time.time() - start_time) if not CORPUS.docs: print("Could not parse any valid docs") From 0f11ad3661770287bc293a56999f7f33a4f7d4da Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 24 Dec 2019 17:41:31 +0530 Subject: [PATCH 5/8] make training work in both GPU and CPU as required --- neuralcoref/train/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralcoref/train/model.py b/neuralcoref/train/model.py index 5519f02..3f99d26 100644 --- a/neuralcoref/train/model.py +++ b/neuralcoref/train/model.py @@ -71,7 +71,7 @@ def forward(self, inputs, concat_axis=1): else: spans, words, single_features = inputs words = words.type(torch.LongTensor) - if self.cuda: + if torch.cuda.is_available(): words = words.cuda() embed_words = self.drop(self.word_embeds(words).view(words.size()[0], -1)) single_input = torch.cat([spans, embed_words, single_features], 1) @@ -80,7 +80,7 @@ def forward(self, inputs, concat_axis=1): batchsize, pairs_num, _ = ana_spans.size() ant_words_long = ant_words.view(batchsize, -1).type(torch.LongTensor) ana_words_long = ana_words.view(batchsize, -1).type(torch.LongTensor) - if self.cuda: + if torch.cuda.is_available(): ant_words_long = ant_words_long.cuda() ana_words_long = ana_words_long.cuda() ant_embed_words = self.drop(self.word_embeds(ant_words_long).view(batchsize, pairs_num, -1)) From 57d9f5ed2f71be9f01923eac71318bca2fd33730 Mon Sep 17 00:00:00 2001 From: abhijit Date: Tue, 24 Dec 2019 17:51:04 +0530 Subject: [PATCH 6/8] reduce the memory footprint of the training script. This is toggled by using --lazy flag --- neuralcoref/train/dataset.py | 45 ++++++++++++++++++++++++++++++------ neuralcoref/train/learn.py | 2 +- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/neuralcoref/train/dataset.py b/neuralcoref/train/dataset.py index db14c5a..8c71792 100644 --- a/neuralcoref/train/dataset.py +++ b/neuralcoref/train/dataset.py @@ -28,6 +28,32 @@ def load_embeddings_from_file(name): voc = [line.strip() for line in f] return embed, voc + +class _DictionaryDataLoader(object): + def __init__(self, dict_object, order): + self.dict_object = dict_object + self.order = order + + def __len__(self): + return len(self.dict_object[self.order[0]]) + + def __getitem__(self, idx): + if isinstance(idx, slice): + data = [] + for i in range(idx.start, idx.stop, idx.step if idx.step is not None else 1): + temp_data = [] + for key in self.order: + temp_data.append(self.dict_object[key][i]) + data.append(temp_data) + + else: + data = [] + for key in self.order: + data.append(self.dict_object[key][idx]) + + return data + + class NCDataset(Dataset): def __init__(self, data_path, params, no_targets=False): print("🏝 Loading Dataset at", data_path) @@ -44,13 +70,18 @@ def __init__(self, data_path, params, no_targets=False): continue numpy_files_found = True print(file_name, end=', ') - datas[file_name.split(u'.')[0]] = np.load(data_path + file_name) + datas[file_name.split(u'.')[0]] = np.load(data_path + file_name, mmap_mode="r" if params.lazy else None) if not numpy_files_found: raise ValueError("Can't find numpy files in {}".format(data_path)) # Gather arrays in two lists of tuples for mention and pairs - self.mentions = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"mentions")))) - self.pairs = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"pairs")))) + if not params.lazy: 
+ self.mentions = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"mentions")))) + self.pairs = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"pairs")))) + else: + self.mentions = _DictionaryDataLoader(datas, order=('mentions_features', 'mentions_labels', 'mentions_pairs_length', 'mentions_pairs_start_index', 'mentions_spans', 'mentions_words')) + self.pairs = _DictionaryDataLoader(datas, order=('pairs_ant_index', 'pairs_features', 'pairs_labels')) + self.mentions_pair_length = datas[FEATURES_NAMES[2]] assert [arr.shape[0] for arr in self.mentions[0]] == [6, 1, 1, 1, 250, 8] # Cf order of FEATURES_NAMES in conllparser.py assert [arr.shape[0] for arr in self.pairs[0]] == [1, 9, 1] # Cf order of FEATURES_NAMES in conllparser.py @@ -148,7 +179,7 @@ def __getitem__(self, mention_idx, debug=False): ant_features[:, 15] = ant_features_raw[:, 2].astype(float) / ant_features_raw[:, 3].astype(float) ant_features[:, 16] = ant_features_raw[:, 4] pairs_features[:, 29:46] = ant_features - # Here we keep the genre + # Here we keep the genre ana_features = np.tile(features, (pairs_length, 1)) pairs_features[:, 46:] = ana_features @@ -213,7 +244,7 @@ def __init__(self, mentions_pairs_length, batchsize=600, shuffle=False, debug=False): """ Create and feed batches of mentions having close number of antecedents The batch are padded and collated by the padder_collate function - + # Arguments: mentions_pairs_length array of shape (N, 1): list/array of the number of pairs for each mention batchsize: Number of pairs of each batch will be capped at this @@ -232,7 +263,7 @@ def __init__(self, mentions_pairs_length, batchsize=600, num = 0 for length, mention_idx in sorted_lengths: if num > batchsize or (num == len(batch) and length != 0): # We keep the no_pairs batches pure - if debug: print("Added batch number", len(self.batches), + if debug: print("Added batch number", len(self.batches), "with", len(batch), "mentions and", num, "pairs") self.batches.append(batch) self.batches_size.append(num) # We don't count the max 7 additional mentions that are repeated @@ -281,7 +312,7 @@ def __len__(self): def padder_collate(batch, debug=False): """ Puts each data field into a tensor with outer dimension batch size - Pad variable length input tensors and add a weight tensor to the target + Pad variable length input tensors and add a weight tensor to the target """ transposed_inputs = tuple(zip(*batch)) if len(transposed_inputs) == 2: diff --git a/neuralcoref/train/learn.py b/neuralcoref/train/learn.py index ea146ab..35d9104 100644 --- a/neuralcoref/train/learn.py +++ b/neuralcoref/train/learn.py @@ -303,8 +303,8 @@ def run_epochs(start_epoch, end_epoch, loss_func, optim_func, save_name, lr, g_s parser.add_argument('--min_lr', type=float, default=2e-8, help='min learning rate') parser.add_argument('--on_eval_decrease', type=str, default='nothing', help='What to do when evaluation decreases ("nothing", "divide_lr", "next_stage", "divide_then_next")') + parser.add_argument('--lazy', action='store_true', help='Use lazy loading while loading the npy files') args = parser.parse_args() - args.costs = {'FN': args.costfn, 'FL': args.costfl, 'WL' : args.costwl } current_time = datetime.now().strftime('%b%d_%H-%M-%S') From dbd98cf2aa85534b5b5aff0f55123c4524bd858b Mon Sep 17 00:00:00 2001 From: abhijit Date: Wed, 25 Dec 2019 11:49:07 +0530 Subject: [PATCH 7/8] minor changes from code review --- neuralcoref/train/conllparser.py | 3 +-- neuralcoref/train/learn.py | 5 +++-- 2 files changed, 
4 insertions(+), 4 deletions(-) diff --git a/neuralcoref/train/conllparser.py b/neuralcoref/train/conllparser.py index d1ef1ce..568f554 100644 --- a/neuralcoref/train/conllparser.py +++ b/neuralcoref/train/conllparser.py @@ -598,7 +598,6 @@ def read_corpus(self, data_path, model=None, debug=False): if model is None: model_options = ['en_core_web_lg', 'en_core_web_md', 'en_core_web_sm', 'en'] - model = None # this declaration is redundant for model_option in model_options: if not model: try: @@ -726,7 +725,7 @@ def _vocabulary_to_file(path, vocabulary): parser.add_argument('--n_jobs', type=int, default=1, help='Number of parallel jobs (default 1)') parser.add_argument('--gold_mentions', type=int, default=0, help='Use gold mentions (1) or not (0, default)') parser.add_argument('--blacklist', type=int, default=0, help='Use blacklist (1) or not (0, default)') - parser.add_argument('--spacy_model', type=str, default=None, help='spacys language model') + parser.add_argument('--spacy_model', type=str, default=None, help='model name') args = parser.parse_args() if args.key is None: args.key = args.path + "/key.txt" diff --git a/neuralcoref/train/learn.py b/neuralcoref/train/learn.py index 35d9104..910be57 100644 --- a/neuralcoref/train/learn.py +++ b/neuralcoref/train/learn.py @@ -303,10 +303,11 @@ def run_epochs(start_epoch, end_epoch, loss_func, optim_func, save_name, lr, g_s parser.add_argument('--min_lr', type=float, default=2e-8, help='min learning rate') parser.add_argument('--on_eval_decrease', type=str, default='nothing', help='What to do when evaluation decreases ("nothing", "divide_lr", "next_stage", "divide_then_next")') - parser.add_argument('--lazy', action='store_true', help='Use lazy loading while loading the npy files') + parser.add_argument('--lazy', type=int, default=1, choices=(0, 1), help='Use lazy loading (1, default) or not (0) while loading the npy files') args = parser.parse_args() args.costs = {'FN': args.costfn, 'FL': args.costfl, 'WL' : args.costwl } - + args.lazy = bool(args.lazy) + print(args.lazy) current_time = datetime.now().strftime('%b%d_%H-%M-%S') args.save_path = os.path.join(PACKAGE_DIRECTORY, 'checkpoints', current_time + '_' + socket.gethostname() + '_') From 6b5a14608461aee8f81228cb81fb3dcc865f72fb Mon Sep 17 00:00:00 2001 From: abhijit Date: Wed, 25 Dec 2019 11:56:03 +0530 Subject: [PATCH 8/8] removed print --- neuralcoref/train/learn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neuralcoref/train/learn.py b/neuralcoref/train/learn.py index 910be57..1754fa6 100644 --- a/neuralcoref/train/learn.py +++ b/neuralcoref/train/learn.py @@ -307,7 +307,6 @@ def run_epochs(start_epoch, end_epoch, loss_func, optim_func, save_name, lr, g_s args = parser.parse_args() args.costs = {'FN': args.costfn, 'FL': args.costfl, 'WL' : args.costwl } args.lazy = bool(args.lazy) - print(args.lazy) current_time = datetime.now().strftime('%b%d_%H-%M-%S') args.save_path = os.path.join(PACKAGE_DIRECTORY, 'checkpoints', current_time + '_' + socket.gethostname() + '_')
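
A note on the GPU handling introduced in patches 1 and 5: the forward pass first casts the word-index tensors to `torch.LongTensor` (which yields a CPU tensor) and then moves them back to the GPU only when `torch.cuda.is_available()` is true, so the same code path still runs on CPU-only machines. The sketch below reproduces just that cast-and-move step with stand-in tensors; the names and shapes are illustrative, not the model's actual attributes.

```python
import torch
import torch.nn as nn

# Stand-in for the `words` batch handled in Model.forward(); the real tensor
# comes from the training DataLoader.
words = torch.randint(0, 1000, (4, 8))

# .type(torch.LongTensor) produces a CPU LongTensor ...
words = words.type(torch.LongTensor)
# ... so, as in patch 5, move it back to the GPU only when one is available.
if torch.cuda.is_available():
    words = words.cuda()

# The embedding layer must sit on the same device as its input indices.
word_embeds = nn.Embedding(1000, 50)
if torch.cuda.is_available():
    word_embeds = word_embeds.cuda()

embed_words = word_embeds(words)  # shape: (4, 8, 50)
print(embed_words.device)
```

Patch 1's original `if self.cuda:` guard is always truthy because `nn.Module.cuda` is a bound method, which is presumably why patch 5 replaces it with the explicit `torch.cuda.is_available()` check.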
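For the `--spacy_model` option added in patch 4 (and trimmed in patch 7): when no model name is passed, `read_corpus` falls back to probing a fixed list of English models with `spacy.info`, which raises for models that are not installed. A compact sketch of that selection logic, with `pick_spacy_model` being a helper name introduced here purely for illustration:

```python
import spacy

def pick_spacy_model(requested=None,
                     fallbacks=("en_core_web_lg", "en_core_web_md", "en_core_web_sm", "en")):
    """Return `requested` if given, otherwise the first installed fallback model."""
    for name in ([requested] if requested else list(fallbacks)):
        try:
            spacy.info(name)   # raises when the model is not installed
            return name
        except Exception:
            print("Could not detect model", name)
    return None

model_name = pick_spacy_model()           # or pick_spacy_model("en_core_web_md")
nlp = spacy.load(model_name) if model_name else None
```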
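The memory saving from patch 6 comes from NumPy's memory mapping: `np.load(path, mmap_mode="r")` returns a `numpy.memmap` backed by the file on disk, so only the rows that are actually indexed are read into RAM, and `_DictionaryDataLoader` then assembles each mention or pair tuple on access instead of materialising `list(zip(...))` for the whole corpus up front. A minimal sketch of the mechanism, using a throwaway file rather than the real feature arrays:

```python
import os
import tempfile
import numpy as np

# Stand-in for one of the per-split feature files, e.g. mentions_spans.npy.
path = os.path.join(tempfile.mkdtemp(), "mentions_spans.npy")
np.save(path, np.random.rand(10000, 250).astype(np.float32))

eager = np.load(path)                # loads the full array into memory
lazy = np.load(path, mmap_mode="r")  # memory-mapped: nothing is read yet

row = lazy[42]       # touching an index reads only that slice of the file
batch = lazy[10:20]  # slicing works the same way, as used by _DictionaryDataLoader
print(type(lazy).__name__, row.shape, batch.shape)
```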
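Taken together, the series changes how the preprocessing and training commands are invoked. A usage sketch, assuming the `*.v4_gold_conll` files have been moved into `train/`, `test/` and `dev/` subdirectories of a `data` directory as the updated training.md describes (the paths and the chosen spaCy model are illustrative):

````bash
# Parse each split, optionally pinning the spaCy model with the new --spacy_model flag
python -m neuralcoref.train.conllparser --path ./data/train/ --spacy_model en_core_web_lg
python -m neuralcoref.train.conllparser --path ./data/test/ --spacy_model en_core_web_lg
python -m neuralcoref.train.conllparser --path ./data/dev/ --spacy_model en_core_web_lg

# After patch 7, --lazy takes 0 or 1; the default of 1 memory-maps the .npy files
python -m neuralcoref.train.learn --train ./data/train/ --eval ./data/dev/ --lazy 1
````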