Update training docs and code (#211)
* Update the code to work with PyTorch 1.3.0
* Fix import paths after moving a large part of the code from the root neuralcoref package to the training subdirectory (neuralcoref/train)
* Fix the reference to self._extract_mentions in conllparser.py
* Try all English spaCy models before giving up
* Expose blacklist and use_gold_mentions as script parameters
* training.md: add step-by-step details on processing CoNLL & OntoNotes data
svlandeg committed Oct 22, 2019
1 parent 926f8c4 commit 70cab1d
Showing 14 changed files with 161 additions and 117 deletions.
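Taken together, the changes imply a data-preparation flow roughly like the following (paths, the chosen spaCy model, and the exact invocation are illustrative, not taken from the commit):

1. Assemble the CoNLL-formatted coreference data from an OntoNotes release with ./compile_coref_data.sh /path/to/ontonotes/data (per the updated usage message in the script).
2. Install at least one English spaCy model, e.g. python -m spacy download en_core_web_sm; the parser now falls back through en_core_web_lg, en_core_web_md, en_core_web_sm and en.
3. Build the training arrays with the new optional flags, e.g. python conllparser.py --function all --path /path/to/conll/data/ --n_jobs 4 --gold_mentions 0 --blacklist 0 (or python -m neuralcoref.train.conllparser, depending on how the package is installed).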
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ neuralcoref/*.html
Profile.prof
.vscode
.sass-cache
.idea/*

# Python
/.Python
4 changes: 2 additions & 2 deletions neuralcoref/__init__.py
@@ -11,8 +11,8 @@
import warnings
warnings.filterwarnings("ignore", message="spacy.strings.StringStore size changed")

from .neuralcoref import NeuralCoref
from .file_utils import NEURALCOREF_MODEL_URL, NEURALCOREF_MODEL_PATH, NEURALCOREF_CACHE, cached_path
from neuralcoref.neuralcoref import NeuralCoref
from neuralcoref.file_utils import NEURALCOREF_MODEL_URL, NEURALCOREF_MODEL_PATH, NEURALCOREF_CACHE, cached_path

__all__ = ['NeuralCoref', 'add_to_pipe']
__version__ = "4.0.0"
1 change: 0 additions & 1 deletion neuralcoref/file_utils.py
@@ -12,7 +12,6 @@
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open

import boto3
7 changes: 3 additions & 4 deletions neuralcoref/train/algorithm.py
@@ -6,14 +6,13 @@
from __future__ import unicode_literals
from __future__ import print_function

import sys
import os
import spacy
import numpy as np

from neuralcoref.utils import PACKAGE_DIRECTORY, SIZE_PAIR_IN, SIZE_SINGLE_IN
from neuralcoref.compat import unicode_
from neuralcoref.document import Document, MENTION_TYPE, NO_COREF_LIST
from neuralcoref.train.utils import PACKAGE_DIRECTORY, SIZE_SINGLE_IN
from neuralcoref.train.compat import unicode_
from neuralcoref.train.document import Document, MENTION_TYPE, NO_COREF_LIST

#######################
##### UTILITIES #######
compile_coref_data.sh
@@ -12,7 +12,7 @@
ONTONOTES_PATH=$1

if [ ! -n "$ONTONOTES_PATH" ] ; then
echo "USAGE: ./scripts/compile_coref_data.sh /path/to/ontonotes/data"
echo "USAGE: ./compile_coref_data.sh /path/to/ontonotes/data"
exit 1
fi

103 changes: 61 additions & 42 deletions neuralcoref/train/conllparser.py
@@ -6,24 +6,22 @@
from __future__ import print_function

import re
import sys
import codecs
import argparse
import time
import os
import io
import pickle

import spacy
from spacy.tokens import Doc

import numpy as np

from tqdm import tqdm

from .compat import unicode_
from .document import Mention, Document, Speaker, EmbeddingExtractor, MISSING_WORD
from .utils import parallel_process
from neuralcoref.train.compat import unicode_
from neuralcoref.train.document import Mention, Document, Speaker, EmbeddingExtractor, MISSING_WORD, \
extract_mentions_spans
from neuralcoref.train.utils import parallel_process

PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
REMOVED_CHAR = ["/", "%", "*"]
@@ -290,7 +288,7 @@ def get_conll_spacy_lookup(self, conll_tokens, spacy_tokens, debug=False):
lookup.append(c_lookup)
return lookup

def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_mentions=False, debug=False):
def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_mentions, debug=False):
conll_lookup = self.get_conll_spacy_lookup(tokens, parsed)
self.conll_tokens.append(tokens)
self.conll_lookup.append(conll_lookup)
@@ -308,14 +306,16 @@ def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_menti
self.speakers[speaker_id] = Speaker(speaker_id, speaker_name)
if use_gold_mentions:
for coref in corefs:
# print("coref['label']", coref['label'])
# print("coref text",parsed[coref['start']:coref['end']+1])
# print("coref['label']", coref['label'])
# print("coref text",parsed[coref['start']:coref['end']+1])
mention = Mention(parsed[coref['start']:coref['end']+1], len(self.mentions), len(self.utterances),
self.n_sents, speaker=self.speakers[speaker_id], gold_label=coref['label'])
self.mentions.append(mention)
# print("mention: ", mention, "label", mention.gold_label)
# print("mention: ", mention, "label", mention.gold_label)
else:
self._extract_mentions(parsed, len(self.utterances), self.n_sents, self.speakers[speaker_id])
mentions_spans = extract_mentions_spans(doc=parsed, blacklist=self.blacklist)
self._process_mentions(mentions_spans, len(self.utterances), self.n_sents, self.speakers[speaker_id])

# Assign a gold label to mentions which have one
if debug: print("Check corefs", corefs)
for i, coref in enumerate(corefs):
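For reference, a sketch of what one utterance's inputs to add_conll_utterance look like, with field names taken from the code above and values invented for illustration:

# Hypothetical per-utterance inputs (values made up); 'start'/'end' are token
# offsets into the parsed utterance and 'label' identifies a coreference chain.
tokens = ["John", "met", "Mary", ".", "He", "smiled", "."]
corefs = [
    {"start": 0, "end": 0, "label": 1},   # "John" -> chain 1
    {"start": 4, "end": 4, "label": 1},   # "He"   -> chain 1
    {"start": 2, "end": 2, "label": 2},   # "Mary" -> chain 2
]
speaker_id = "speaker#1"
# With use_gold_mentions these spans become Mention objects directly; otherwise
# extract_mentions_spans proposes candidate spans and _process_mentions records them.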
@@ -369,7 +369,7 @@ def get_pair_mentions_features_conll(self, m1, m2, compressed=True):
]
return feat_l

def get_feature_array(self, doc_id, feature=None, compressed=True, debug=True):
def get_feature_array(self, doc_id, feature=None, compressed=True, debug=False):
"""
Prepare feature array:
mentions_spans: (N, S)
@@ -383,7 +383,7 @@ def get_feature_array(self, doc_id, feature=None, compressed=True, debug=True):
pairs_ant_idx: (P, 1) => indexes of antecedents mention for each pair (mention index in doc)
"""
if not self.mentions:
print("No mention in this doc !")
if debug: print("No mention in this doc !")
return {}
if debug: print("🛎 features matrices")
mentions_spans = []
@@ -447,7 +447,7 @@ def get_feature_array(self, doc_id, feature=None, compressed=True, debug=True):
###################
### ConllCorpus #####
class ConllCorpus(object):
def __init__(self, n_jobs=4, embed_path=PACKAGE_DIRECTORY+"/weights/", use_gold_mentions=False):
def __init__(self, n_jobs=4, embed_path=PACKAGE_DIRECTORY+"/weights/", gold_mentions=False, blacklist=False):
self.n_jobs = n_jobs
self.features = {}
self.utts_text = []
@@ -461,7 +461,8 @@ def __init__(self, n_jobs=4, embed_path=PACKAGE_DIRECTORY+"/weights/", use_gold_
self.embed_extractor = EmbeddingExtractor(embed_path)
self.trainable_embed = []
self.trainable_voc = []
self.use_gold_mentions = use_gold_mentions
self.gold_mentions = gold_mentions
self.blacklist = blacklist

def check_words_in_embeddings_voc(self, embedding, tuned=True, debug=False):
print("🌋 Checking if words are in embedding voc")
@@ -569,8 +570,8 @@ def read_corpus(self, data_path, debug=False):
doc_list = parallel_process(cleaned_file_list, load_file)
for docs in doc_list:#executor.map(self.load_file, cleaned_file_list):
for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
print("Imported", name)
if debug:
print("Imported", name)
print("utts_text", utts_text)
print("utt_tokens", utt_tokens)
print("utts_corefs", utts_corefs)
@@ -590,19 +591,26 @@ def read_corpus(self, data_path, debug=False):
print("🌋 Building docs")
for name, part in self.docs_names:
self.docs.append(ConllDoc(name=name, part=part, nlp=None,
blacklist=False, consider_speakers=True,
blacklist=self.blacklist, consider_speakers=True,
embedding_extractor=self.embed_extractor,
conll=CONLL_GENRES[name[:2]]))
print("🌋 Loading spacy model")
try:
spacy.info('en_core_web_sm')
model = 'en_core_web_sm'
except IOError:
print("No spacy 2 model detected, using spacy1 'en' model")
spacy.info('en')
model = 'en'
model_options = ['en_core_web_lg', 'en_core_web_md', 'en_core_web_sm', 'en']
model = None
for model_option in model_options:
if not model:
try:
spacy.info(model_option)
model = model_option
print("Loaded model", model_option)
except:
print("Could not detect model", model_option)
if not model:
print("Could not detect any suitable English model")
return

nlp = spacy.load(model)
print("🌋 Parsing utterances and filling docs")
print("🌋 Parsing utterances and filling docs with use_gold_mentions=" + (str(bool(self.gold_mentions))))
doc_iter = (s for s in self.utts_text)
for utt_tuple in tqdm(zip(nlp.pipe(doc_iter),
self.utts_tokens, self.utts_corefs,
@@ -615,13 +623,13 @@ def read_corpus(self, data_path, debug=False):
" speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
print(out_str.encode('utf-8'))
self.docs[doc_id].add_conll_utterance(doc, conll_tokens, corefs, speaker,
use_gold_mentions=self.use_gold_mentions)
use_gold_mentions=self.gold_mentions)

def build_and_gather_multiple_arrays(self, save_path):
print("🌋 Extracting mentions features")
print("🌋 Extracting mentions features with {} job(s)".format(self.n_jobs))
parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)

print("🌋 Building and gathering arrays")
print("🌋 Building and gathering array with {} job(s)".format(self.n_jobs))
arr =[{'doc': doc,
'i': i} for i, doc in enumerate(self.docs)]
arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
@@ -652,20 +660,26 @@ def build_and_gather_multiple_arrays(self, save_path):
n_mentions_list.append(n)

for feature in FEATURES_NAMES[:9]:
print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
feature_data = gathering_dict[feature]
if not feature_data:
print("No data for", feature)
continue
print("Building numpy array for", feature, "length", len(feature_data))
if feature != "mentions_spans":
array = np.array(gathering_dict[feature])
array = np.array(feature_data)
if array.ndim == 1:
array = np.expand_dims(array, axis=1)
else:
array = np.stack(gathering_dict[feature])
array = np.stack(feature_data)
# check_numpy_array(feature, array, n_mentions_list)
print("Saving numpy", feature, "size", array.shape)
np.save(save_path + feature, array)
for feature in FEATURES_NAMES[9:]:
print("Saving pickle", feature, "size", len(gathering_dict[feature]))
with open(save_path + feature + '.bin', "wb") as fp:
pickle.dump(gathering_dict[feature], fp)
feature_data = gathering_dict[feature]
if feature_data:
print("Saving pickle", feature, "size", len(feature_data))
with open(save_path + feature + '.bin', "wb") as fp:
pickle.dump(feature_data, fp)

def save_vocabulary(self, save_path, debug=False):
def _vocabulary_to_file(path, vocabulary):
@@ -706,10 +720,12 @@ def _vocabulary_to_file(path, vocabulary):
parser.add_argument('--path', type=str, default=DIR_PATH + '/data/', help='Path to the dataset')
parser.add_argument('--key', type=str, help='Path to an optional key file for scoring')
parser.add_argument('--n_jobs', type=int, default=1, help='Number of parallel jobs (default 1)')
parser.add_argument('--gold_mentions', type=int, default=0, help='Use gold mentions (1) or not (0, default)')
parser.add_argument('--blacklist', type=int, default=0, help='Use blacklist (1) or not (0, default)')
args = parser.parse_args()
if args.key is None:
args.key = args.path + "/key.txt"
CORPUS = ConllCorpus(n_jobs=args.n_jobs)
CORPUS = ConllCorpus(n_jobs=args.n_jobs, gold_mentions=args.gold_mentions, blacklist=args.blacklist)
if args.function == 'parse' or args.function == 'all':
SAVE_DIR = args.path + "/numpy/"
if not os.path.exists(SAVE_DIR):
@@ -724,13 +740,16 @@ def _vocabulary_to_file(path, vocabulary):
start_time = time.time()
CORPUS.read_corpus(args.path)
print('=> read_corpus time elapsed', time.time() - start_time)
start_time2 = time.time()
CORPUS.build_and_gather_multiple_arrays(SAVE_DIR)
print('=> build_and_gather_multiple_arrays time elapsed', time.time() - start_time2)
start_time2 = time.time()
CORPUS.save_vocabulary(SAVE_DIR)
print('=> save_vocabulary time elapsed', time.time() - start_time2)
print('=> total time elapsed', time.time() - start_time)
if not CORPUS.docs:
print("Could not parse any valid docs")
else:
start_time2 = time.time()
CORPUS.build_and_gather_multiple_arrays(SAVE_DIR)
print('=> build_and_gather_multiple_arrays time elapsed', time.time() - start_time2)
start_time2 = time.time()
CORPUS.save_vocabulary(SAVE_DIR)
print('=> save_vocabulary time elapsed', time.time() - start_time2)
print('=> total time elapsed', time.time() - start_time)
if args.function == 'key' or args.function == 'all':
CORPUS.build_key_file(args.path, args.key)
if args.function == 'find_undetected':
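The new flags also flow through the ConllCorpus constructor, so the same preprocessing can be driven programmatically. A minimal sketch, using only the constructor and methods visible in this diff (paths are placeholders, and the static word embeddings are assumed to be present under the default embed_path):

from neuralcoref.train.conllparser import ConllCorpus

# gold_mentions/blacklist mirror the new --gold_mentions/--blacklist CLI flags
corpus = ConllCorpus(n_jobs=4, gold_mentions=False, blacklist=False)
corpus.read_corpus("./data/")                        # parse the CoNLL files with spaCy
if corpus.docs:                                      # mirrors the new guard in the script
    corpus.build_and_gather_multiple_arrays("./data/numpy/")
    corpus.save_vocabulary("./data/numpy/")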
24 changes: 12 additions & 12 deletions neuralcoref/train/dataset.py
@@ -14,10 +14,10 @@
from torch.utils.data.sampler import Sampler
from torch.utils.data import Dataset

from neuralcoref.utils import (encode_distance, BATCH_SIZE_PATH, SIZE_FP,
from neuralcoref.train.utils import (encode_distance, BATCH_SIZE_PATH, SIZE_FP,
SIZE_FP_COMPRESSED, SIZE_FS, SIZE_FS_COMPRESSED,
SIZE_GENRE, SIZE_PAIR_IN, SIZE_SINGLE_IN)
from neuralcoref.conllparser import FEATURES_NAMES
from neuralcoref.train.conllparser import FEATURES_NAMES

def load_embeddings_from_file(name):
print("loading", name+"_embeddings.npy")
@@ -95,8 +95,8 @@ def __getitem__(self, mention_idx, debug=False):
"""
features_raw, label, pairs_length, pairs_start_index, spans, words = self.mentions[mention_idx]
pairs_start_index = np.asscalar(pairs_start_index)
pairs_length = np.asscalar(pairs_length)
pairs_start_index = pairs_start_index.item()
pairs_length = pairs_length.item()

# Build features array (float) from raw features (int)
assert features_raw.shape[0] == SIZE_FS_COMPRESSED
@@ -141,7 +141,7 @@ def __getitem__(self, mention_idx, debug=False):
pairs_features[:, 17:28] = encode_distance(pairs_features_raw[:, 7])
pairs_features[:, 28] = pairs_features_raw[:, 8]
# prepare antecent features
ant_features_raw = np.concatenate([self.mentions[np.asscalar(idx)][0][np.newaxis, :] for idx in pairs_ant_index])
ant_features_raw = np.concatenate([self.mentions[idx.item()][0][np.newaxis, :] for idx in pairs_ant_index])
ant_features = np.zeros((pairs_length, SIZE_FS-SIZE_GENRE))
ant_features[:, ant_features_raw[:, 0]] = 1
ant_features[:, 4:15] = encode_distance(ant_features_raw[:, 1])
@@ -152,8 +152,8 @@ def __getitem__(self, mention_idx, debug=False):
ana_features = np.tile(features, (pairs_length, 1))
pairs_features[:, 46:] = ana_features

ant_spans = np.concatenate([self.mentions[np.asscalar(idx)][4][np.newaxis, :] for idx in pairs_ant_index])
ant_words = np.concatenate([self.mentions[np.asscalar(idx)][5][np.newaxis, :] for idx in pairs_ant_index])
ant_spans = np.concatenate([self.mentions[idx.item()][4][np.newaxis, :] for idx in pairs_ant_index])
ant_words = np.concatenate([self.mentions[idx.item()][5][np.newaxis, :] for idx in pairs_ant_index])
ana_spans = np.tile(spans, (pairs_length, 1))
ana_words = np.tile(words, (pairs_length, 1))
ant_spans = torch.from_numpy(ant_spans).float()
@@ -328,15 +328,15 @@ def padder_collate(batch, debug=False):
# Remark this mask is the inverse of the weights in the above target (used for evaluation masking)
t_base = transposed_inputs[3]
out_targets = torch.stack(
[torch.cat([t.new(len(t)-1).zero_().byte(),
t.new(max_pairs + 1 - len(t)).fill_(1).byte(),
t.new(1).zero_().byte()]) if len(t) != max_pairs + 1 \
else t.new(max_pairs + 1).zero_().byte() for t in t_base], 0)
[torch.cat([t.new(len(t)-1).zero_().bool(),
t.new(max_pairs + 1 - len(t)).fill_(1).bool(),
t.new(1).zero_().bool()]) if len(t) != max_pairs + 1 \
else t.new(max_pairs + 1).zero_().bool() for t in t_base], 0)
else:
out_inputs = [torch.stack(t_inp, 0) for t_inp in transposed_inputs]
if transposed_targets is not None:
out_targets = [torch.stack(t_targ, 0) for t_targ in transposed_targets]
out_targets.append(out_targets[1].new(len(out_targets[1]), 1).fill_(1))
else:
out_targets = out_inputs[0].new(len(out_inputs[0]), 1).zero_().byte()
out_targets = out_inputs[0].new(len(out_inputs[0]), 1).zero_().bool()
return (out_inputs, out_targets)
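Two small API migrations account for most of the dataset.py changes: np.asscalar (deprecated in NumPy 1.16) is replaced by .item(), and uint8 tensor masks (deprecated for indexing in recent PyTorch versions) are replaced by bool masks. A minimal sketch of both, outside the dataset code:

import numpy as np
import torch

idx = np.int64(3)
start = idx.item()                            # was: np.asscalar(idx)

scores = torch.arange(5, dtype=torch.float)
mask = torch.tensor([1, 0, 1, 0, 1]).bool()   # was: .byte()
print(scores[mask])                           # boolean indexing without the uint8 warning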
7 changes: 4 additions & 3 deletions neuralcoref/train/document.py
@@ -7,9 +7,10 @@
import re
import io
from six import string_types, integer_types
from spacy.tokens import Span, Token

from neuralcoref.compat import unicode_
from neuralcoref.utils import encode_distance, parallel_process
from neuralcoref.train.compat import unicode_
from neuralcoref.train.utils import encode_distance, parallel_process

try:
from itertools import izip_longest as zip_longest
@@ -39,7 +40,7 @@
## MENTION EXTRACTION ###
#########################

def extract_mentions_spans(doc, blacklist=True, debug=False):
def extract_mentions_spans(doc, blacklist, debug=False):
'''
Extract potential mentions from a spacy parsed Doc
'''
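Because extract_mentions_spans no longer has a default for blacklist, callers must pass it explicitly. A minimal standalone sketch (the model name and sentence are placeholders; the function is expected to return spaCy spans for candidate mentions, as in its use from conllparser.py above):

import spacy
from neuralcoref.train.document import extract_mentions_spans

nlp = spacy.load("en_core_web_sm")     # any installed English model
doc = nlp(u"My sister has a dog. She loves him.")
spans = extract_mentions_spans(doc, blacklist=False)   # blacklist is now a required argument
print([span.text for span in spans])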
(Diffs for the remaining 6 changed files did not load on this page.)
