In [1]:
import os
import importlib
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json

import prompts
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment selection ---------------------------------------------------
MODEL = 'llama'     # key into model_mapping below
DATA = 'acl_arc'    # one of: acl_arc, finecite, multicite
SCHEMA = 'XML'      # one of: XML, JSON
DATA_SIZE = None    # e.g. 1000


# --- Input / output locations -----------------------------------------------
# A truthy DATA_SIZE adds one extra directory level to both paths.
if DATA_SIZE:
    INPUT = f'./data/{DATA}/{DATA_SIZE}/{SCHEMA}/'
    OUTPUT = f'./output/{DATA}/{MODEL}/{DATA_SIZE}/{SCHEMA}/'
else:
    INPUT = f'./data/{DATA}/{SCHEMA}/'
    OUTPUT = f'./output/{DATA}/{MODEL}/{SCHEMA}/'

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Short model name -> Hugging Face hub identifier
model_mapping = {
    "mistral": 'mistralai/Mistral-7B-Instruct-v0.3',
    "scitulu": 'allenai/scitulu-7b',
    'llama': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
}
model_id = model_mapping[MODEL]

# Handed to the data collator as its max_length.
max_seq_length = 1024

# --- Tokenizer --------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Padding reuses the end-of-sequence token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
# Re-import the local `evaluate` module (evaluate.py) so that edits made to it
# after the first import are picked up by the running kernel.
import importlib
importlib.reload(evaluate)

# #get prompt len
# for DATA in ['acl_arc', 'finecite', 'multicite']:
#     for SCHEMA in ['XML', 'JSON']:
#         prompt = prompts.PromptForAutoCCA(tokenizer, DATA, SCHEMA)
#         print(len(prompt.create_sample('','',return_prompt_only=True)))
            

#get len of tokenized train_data
# max_len = 0
# for ids in train_ds['input_ids']:
#     if len(ids) > max_len: max_len = len(ids)  
# print(max_len)

<module 'evaluate' from '/home/explorer/Automatic_CCE/evaluate.py'>

In [4]:
# Quantization settings used when loading the base model (4-bit, "nf4" type,
# bfloat16 for both compute and storage, double quantization enabled).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Base causal language model, loaded with the quantization settings above.
LMmodel = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Size the token embeddings to the tokenizer's length.
LMmodel.resize_token_embeddings(len(tokenizer))

# LoRA adapter configuration: rank 4, alpha 32, dropout 0.1, applied to the
# listed projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["v_proj", "q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj"],
)

# Wrap the base model with the adapters and report how many parameters train.
LMmodel = get_peft_model(LMmodel, peft_config)
LMmodel.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.62s/it]


trainable params: 10,485,760 || all params: 8,040,747,008 || trainable%: 0.1304


In [5]:
# Read the raw train / test records from the JSON files under INPUT.
with open(f'{INPUT}train.json') as file:
    train_data = json.load(file)

with open(f'{INPUT}test.json') as file:
    test_data = json.load(file)

# Wrap the loaded record lists as Datasets. Only the first half of the test
# records is used here.
train_ds = Dataset.from_list(train_data)
test_ds = Dataset.from_list(test_data[:len(test_data) // 2])

# Prompt builder that turns an (input, output) pair into a model sample.
prompt = prompts.PromptForAutoCCA(tokenizer, MODEL, DATA, SCHEMA)

# Training samples: the original columns are replaced by whatever
# create_sample returns.
train_ds = train_ds.map(
    lambda row: prompt.create_sample(row['input'], row['output']),
    remove_columns=train_ds.column_names,
    batched=False,
)

# Samples for the Trainer's periodic evaluation, built the same way as the
# training samples.
# NOTE(review): dev_ds and eval_ds below are both derived from the same
# test_ds rows — confirm this overlap is intended.
dev_ds = test_ds.map(
    lambda row: prompt.create_sample(row['input'], row['output']),
    remove_columns=test_ds.column_names,
    batched=False,
)

# Samples for generation-based evaluation: built with for_generation=True and
# without removing the original columns.
eval_ds = test_ds.map(
    lambda row: prompt.create_sample(row['input'], row['output'], for_generation=True),
    batched=False,
)


Map: 100%|██████████| 7382/7382 [00:08<00:00, 857.38 examples/s]
Map: 100%|██████████| 867/867 [00:01<00:00, 856.56 examples/s]
Map: 100%|██████████| 867/867 [00:00<00:00, 1087.42 examples/s]


In [6]:
# max_len = 0
# for ids in train_ds['input_ids']:
#     if len(ids) > max_len: max_len = len(ids)
# max_len

In [7]:
# Define Data Collator
class CustomDataCollator:
    """Pad tokenized samples into one batch and add response-only ``labels``.

    ``labels`` starts as a copy of the padded ``input_ids``. Every position
    that belongs to the prompt (everything up to and including the last
    occurrence of the model's response marker) and every padding position is
    then set to -100, leaving only the response tokens as they were.

    Args:
        tokenizer: Tokenizer providing ``pad``, ``decode`` and ``__call__``.
        model_id: Key of ``RESPONSE_MARKERS`` selecting the response marker.
        padding: Forwarded to ``tokenizer.pad``.
        max_length: Forwarded to ``tokenizer.pad``.

    Raises:
        ValueError: If ``model_id`` has no entry in ``RESPONSE_MARKERS``
            (at construction), or if a sample does not contain the marker
            (when the collator is called).
    """

    # Text that closes the prompt part of a sample, per supported model key.
    RESPONSE_MARKERS = {
        'mistral': "[/INST]",
        'llama': "<|end_header_id|>",
    }

    def __init__(self, tokenizer, model_id, padding, max_length):
        self.tokenizer = tokenizer
        self.model_id = model_id
        self.padding = padding
        self.max_length = max_length
        # Fail here rather than in the middle of training when the model key
        # has no known response marker.
        self._response_marker()

    def _response_marker(self):
        """Return the response marker for ``self.model_id`` or raise ValueError."""
        try:
            return self.RESPONSE_MARKERS[self.model_id]
        except KeyError:
            raise ValueError(
                f"No response marker defined for model_id {self.model_id!r}; "
                f"supported: {sorted(self.RESPONSE_MARKERS)}"
            ) from None

    def __call__(self, features):
        """Collate ``features`` into a padded batch with a ``labels`` entry.

        Args:
            features: List of tokenized samples accepted by ``tokenizer.pad``.

        Returns:
            The padded batch with an added ``labels`` tensor of the same shape
            as ``input_ids``.
        """
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt',
        )
        labels = batch["input_ids"].clone()
        marker = self._response_marker()

        # Compute loss mask for output token only
        for i in range(labels.shape[0]):

            # Decode the whole sample except its first token — presumably a
            # BOS token that the tokenizer call below adds back (assumption;
            # verify against the sample-building code).
            text_content = self.tokenizer.decode(batch['input_ids'][i][1:])

            # Locate the end of the prompt. A missing marker used to yield a
            # meaningless boundary; report it instead.
            marker_start = text_content.rfind(marker)
            if marker_start == -1:
                raise ValueError(
                    f"Response marker {marker!r} not found in sample {i} of the batch"
                )
            prompt_text = text_content[:marker_start + len(marker)]

            # Re-tokenize the prompt text to count its tokens
            prompt_text_tokenized = self.tokenizer(
                prompt_text,
                return_overflowing_tokens=False,
                return_length=False,
            )
            prompt_text_len = len(prompt_text_tokenized['input_ids'])

            # Exclude the prompt from the loss. A slice cannot index past the
            # end of the row even if the re-tokenized count is too large.
            labels[i, :prompt_text_len] = -100

        # Exclude padding from the loss as well. The attention mask is used
        # instead of the pad token id so that real tokens sharing that id
        # (e.g. an end-of-sequence token reused for padding) stay in the loss.
        if "attention_mask" in batch:
            labels[batch["attention_mask"] == 0] = -100

        batch["labels"] = labels
        return batch

# Instantiate the collator that is handed to the Trainer; batches are padded
# with the "longest" strategy.
data_collator = CustomDataCollator(
    max_length=max_seq_length,
    padding="longest",
    model_id=MODEL,
    tokenizer=tokenizer,
)

In [8]:
# from sklearn.metrics import (accuracy_score, 
#                              recall_score, 
#                              precision_score, 
#                              f1_score)

# def compute_metrics(p):    
#     pred, labels = p
#     pred = np.argmax(pred, axis=1)
#     accuracy = accuracy_score(y_true=labels, y_pred=pred)
#     recall = recall_score(y_true=labels, y_pred=pred)
#     precision = precision_score(y_true=labels, y_pred=pred)
#     f1 = f1_score(y_true=labels, y_pred=pred)    
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [9]:
# Hyperparameters (also stored with the evaluation results later on)
params = dict(lr=1e-5)

# Training configuration
training_arguments = TrainingArguments(
    output_dir=OUTPUT,
    # optimisation
    learning_rate=params['lr'],
    weight_decay=0.01,
    warmup_ratio=0.1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # length of the run, expressed in optimizer steps
    # (alternative that was tried: num_train_epochs = 9)
    max_steps=3 * len(train_data),
    # evaluate and checkpoint on the same 500-step schedule
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # logging / label handling
    logging_steps=100,
    label_names=['labels'],
)

# Trainer with early stopping (patience of 2 evaluations)
trainer = Trainer(
    model=LMmodel,
    args=training_arguments,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Open questions: par? 2500 steps with 1e-04? SFTTrainer? arr as answer

In [10]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

{'loss': 0.1823, 'grad_norm': 2.4405770301818848, 'learning_rate': 4.5146726862302486e-07, 'epoch': 0.0135464643727987}
{'loss': 0.17, 'grad_norm': 3.369896411895752, 'learning_rate': 9.029345372460497e-07, 'epoch': 0.0270929287455974}
{'loss': 0.1161, 'grad_norm': 3.758136510848999, 'learning_rate': 1.3544018058690746e-06, 'epoch': 0.0406393931183961}
{'loss': 0.0645, 'grad_norm': 3.7456984519958496, 'learning_rate': 1.8058690744920994e-06, 'epoch': 0.0541858574911948}
{'loss': 0.0473, 'grad_norm': 3.3516476154327393, 'learning_rate': 2.2573363431151243e-06, 'epoch': 0.0677323218639935}
{'eval_loss': 0.04754631593823433, 'eval_runtime': 252.7716, 'eval_samples_per_second': 3.43, 'eval_steps_per_second': 3.43, 'epoch': 0.0677323218639935}
{'loss': 0.038, 'grad_norm': 1.7266865968704224, 'learning_rate': 2.7088036117381493e-06, 'epoch': 0.0812787862367922}
{'loss': 0.0352, 'grad_norm': 3.158332586288452, 'learning_rate': 3.1602708803611743e-06, 'epoch': 0.09482525060959089}
{'loss': 0.0

KeyboardInterrupt: 

In [11]:
# Re-import the local `evaluate` module so that recent edits to it are used.
import importlib
importlib.reload(evaluate)

# Run the project's evaluator on the generation samples with the trainer's
# current model.
evaluator = evaluate.Evaluator(trainer.model, MODEL, tokenizer, eval_ds, DATA, SCHEMA, DEVICE)
results = evaluator.evaluate() #test_data=eval_ds['output'])

# Attach the run's hyperparameters and a free-text notes field.
results['params'] = params
results['notes'] = ''

# Append this run as one line of the JSON Lines log. The trailing newline keeps
# every run on its own line; without it successive runs are concatenated on a
# single line and the file can no longer be parsed line by line.
with open(OUTPUT + 'eval_results.jsonl', 'a') as fp:
    fp.write(json.dumps(results) + '\n')

  1%|          | 1/150 [00:11<28:58, 11.67s/it]

This paper considers the reading comprehension task in which some discrete-reasoning abilities are needed to correctly answer questions. <CONT> Specifically, we focus on a new reading comprehension dataset called DROP #TARGET_REF<USE/>, which requires Discrete Reasoning Over the content of Paragraphs to obtain the final answer. </CONT> Unlike previous benchmarks such as CNN/DM (#REF) and SQuAD (#REF) that have been well solved #REF), DROP is substantially more challenging in three ways. First, the answers to 1 https://github.com/huminghao16/MTMSN the questions involve a wide range of types such as numbers, dates, or text strings. Therefore, various kinds of prediction strategies are required to successfully find the answers.


  1%|▏         | 2/150 [00:24<29:53, 12.12s/it]

Based on the representations, we employ a multi-type answer predictor that is able to produce four answer types: (1) span from the text; (2) arithmetic expression; (3) count number; (4) negation on numbers ( §3.2). <CONT> Following #TARGET_REF<USE/>, we first predict the answer type of a given passage-question pair, and then adopt individual prediction strategies. </CONT> To support multispan extraction ( §3.3), the model explicitly predicts the number of answer spans. It then outputs non-overlapped spans until the specific amount is reached. Moreover, we do not directly use the arithmetic expression that possesses the maximum probability, but instead re-rank several expression candidates that are decoded by beam search to further confirm the prediction ( §3.4).


  2%|▏         | 3/150 [00:38<31:56, 13.04s/it]

Rather than restricting the answer to always be a span of text, the discrete-reasoning reading comprehension task involves different answer types (e.g., number, date, span of text). <CONT> Following #TARGET_REF<USE/>, we design a multi-type answer predictor to selectively produce different kinds of answers such as span, count number, and arithmetic expression. </CONT> To further increase answer coverage, we propose adding a new answer type to support logical negation. Moreover, unlike prior work that separately predicts passage spans and question spans, our approach directly extracts spans from the input sequence. Answer type prediction Inspired by the Augmented QANet model (#REF), we use the contextualized token representations from the last four blocks (H L−3,..., H L ) as the inputs to our answer predictor, which are denoted as M 0, M 1, M 2, M 3, respectively.


  3%|▎         | 4/150 [00:51<32:16, 13.26s/it]

Moreover, unlike prior work that separately predicts passage spans and question spans, our approach directly extracts spans from the input sequence. <CONT> Answer type prediction Inspired by the Augmented QANet model #TARGET_REF<USE/>, we use the contextualized token representations from the last four blocks (H L−3,..., H L ) as the inputs to our answer predictor, which are denoted as M 0, M 1, M 2, M 3, respectively. </CONT> To predict the answer type, we first split the representation M 2 into a question representation Q 2 and a passage representation P 2 according to the index of intermediate [SEP] token. Then the model computes two vectors h Q 2 and h P 2 that summarize the question and passage information respectively: where h P 2 is computed in a similar way over P 2.


  3%|▎         | 5/150 [01:01<29:14, 12.10s/it]

Since DROP does not indicate the answer type but only provides the answer string, we therefore adopt the weakly supervised annotation scheme, as suggested in #REF; #TARGET_REF<USE/>. We find all possible annotations that point to the gold answer, including matching spans, arithmetic expressions, correct count numbers, negation operations, and the number of spans. We use simple rules to search over all mentioned numbers to find potential negations. That is, if 100 minus a number is equal to the answer, then a negation occurs on this number. Besides, we only search the addition/subtraction of three numbers at most due to the exponential search space.


  4%|▍         | 6/150 [01:14<29:05, 12.12s/it]

<CONT> Dataset We consider the reading comprehension benchmark that requires Discrete Reasoning Over Paragraphs (DROP) #TARGET_REF<USE/> prehensive understanding of the context as well as the ability of numerical reasoning are required. </CONT> Model settings We build our model upon two publicly available uncased versions of BERT: BERT BASE and BERT LARGE 2, and refer readers to #REF for details on model sizes. We use Adam optimizer with a learning rate of 3e-5 and warmup over the first 5% steps to train. The maximum number of epochs is set to 10 for base models and 5 for large models, while the batch size is 12 or 24 respectively. A dropout probability of 0.1 is used unless stated otherwise.


  5%|▍         | 7/150 [01:25<28:08, 11.81s/it]

Despite their success, these datasets only require shallow pattern matching and simple logical reasoning, thus being well solved #REF). <CONT> Recently, #TARGET_REF<BACKGROUND/> released a new benchmark named DROP that demands discrete reasoning as well as deeper paragraph understanding to find the answers. </CONT> #REF introduced a dataset consisting of different types of mathematics problems to focuses on mathematical computation. We choose to work on DROP to test both the numerical reasoning and linguistic comprehension abilities. Neural reading models Previous neural reading models, such as BiDAF (#REF), R-Net (#REF), QANet (#REF), Reinforced Mreader (#REF), are usually designed to extract a continuous span of text as the answer.


  5%|▌         | 8/150 [01:38<29:06, 12.30s/it]

where ⊗ denotes the outer product between the vector g and each token representation in M. <CONT> Arithmetic expression In order to model the process of performing addition or subtraction among multiple numbers mentioned in the passage, we assign a three-way categorical variable (plus, minus, or zero) for each number to indicate its sign, similar to #TARGET_REF<SIMILARITY/><USE/>. </CONT> As a result, an arithmetic expression that has a number as the final answer can be obtained and easily evaluated. Specifically, for each number mentioned in the passage, we gather its corresponding representation from the concatenation of M 2 and M 3, eventually yielding U = (u 1,..., u N ) ∈ R N ×2 * D where N numbers exist. Then the probabilities of the i-th number being assigned a plus, minus or zero is computed as:


  6%|▌         | 9/150 [01:45<25:17, 10.77s/it]

<CONT> Baselines Following the implementation of Augmented QANet (NAQANet) #TARGET_REF<EXTENDS/>, we introduce a similar baseline called Augmented BERT (NABERT). </CONT> The main difference is that we replace the encoder of QANet (#REF) with the pre-trained Transformer blocks (#REF). Moreover, it also supports the prediction of various answer types such as span, arithmetic expression, and count number.


  7%|▋         | 10/150 [01:55<24:18, 10.42s/it]

Polysynthetic languages pose a challenge for morphological analysis due to the rootmorpheme complexity and to the word class "squish". In addition, many of these polysynthetic languages are low-resource. <CONT> We propose unsupervised approaches for morphological segmentation of low-resource polysynthetic languages based on Adaptor Grammars (AG) #TARGET_REF<USE/>. </CONT> We experiment with four languages from the Uto-Aztecan family. Our AG-based approaches outperform other unsupervised approaches and show promise when compared to supervised methods, outperforming them on two of the four languages.


  7%|▋         | 11/150 [02:17<32:18, 13.95s/it]

Adaptor Grammars (AGs) are nonparametric Bayesian models that generalize probabilistic context free grammars (PCFG), and have proven to be successful for unsupervised morphological segmentation, where a PCFG is a morphological grammar that specifies word structure (#REF; #REF; #REF #REF. Our main goal is to examine the success of Adaptor Grammars for unsupervised morphological segmentation when applied to polysynthetic languages, where the morphology is synthetically complex (not simply agglutinative), and where resources are minimal. We use the datasets introduced by #REF in an unsupervised fashion (unsegmented words). <CONT> We design several AG learning setups: 1) use the best-on-average AG setup from #TARGET_REF<USE/> ; 2) optimize for language using just the small training vocabulary (unsegmented) and dev vocabulary (segmented) from #REF ; 3) approximate the effect of having some linguistic knowledge; 4) learn from all languages at once and 5) add additional unsupervised data for 

  8%|▊         | 12/150 [02:32<32:31, 14.14s/it]

An Adaptor Grammar is typically composed of a PCFG and an adaptor that adapts the probabilities of individual subtrees. For morphological segmentation, a PCFG is a morphological grammar that specifies word structure, where AGs learn latent tree structures given a list of words. <CONT> In this paper, we experiment with the grammars and the learning setups proposed by #TARGET_REF<USE/>, which we outline briefly below. </CONT> Grammars. We use the nine grammars from Eskander et al. (2016 Eskander et al. (, 2018 that were designed based on three dimensions: 1) how the grammar models word structure (e.g., prefix-stem-suffix vs. morphemes), 2) the level of abstraction in nonterminals (e.g., compounds, morphemes and submorphemes) and 3) how the output boundaries are specified (see Table 2 for a sample grammars).


  9%|▊         | 13/150 [02:47<32:59, 14.45s/it]

In this paper, we experiment with the grammars and the learning setups proposed by #REF, which we outline briefly below. Grammars. <CONT> We use the nine grammars from #TARGET_REF<USE/> Eskander et al. (, 2018 that were designed based on three dimensions: 1) how the grammar models word structure (e.g., prefix-stem-suffix vs. morphemes), 2) the level of abstraction in nonterminals (e.g., compounds, morphemes and submorphemes) and 3) how the output boundaries are specified (see Table 2 for a sample grammars). </CONT> For example, the PrStSu+SM grammar models the Table 2 : Sample grammar setups used by Eskander et al. (2018 Eskander et al. (, 2016. Compound = Upper level representation of the word as a sequence of compounds; Morph = affix/morpheme representation as a sequence of morphemes.


  9%|▉         | 14/150 [03:05<35:08, 15.51s/it]

We use the nine grammars from Eskander et al. (2016 Eskander et al. (, 2018 that were designed based on three dimensions: 1) how the grammar models word structure (e.g., prefix-stem-suffix vs. morphemes), 2) the level of abstraction in nonterminals (e.g., compounds, morphemes and submorphemes) and 3) how the output boundaries are specified (see Table 2 for a sample grammars). <CONT> For example, the PrStSu+SM grammar models the Table 2 : Sample grammar setups used by Eskander et al. (2018 #TARGET_REF<USE/>. </CONT> Compound = Upper level representation of the word as a sequence of compounds; Morph = affix/morpheme representation as a sequence of morphemes. SubMorph (SM) = Lower level representation of characters as a sequence of sub-morphemes. "+" denotes one or more. word as a complex prefix, a stem and a complex suffix, where the complex prefix and suffix are composed of zero or more morphemes, and a morpheme is a sequence of sub-morphemes.


 10%|█         | 15/150 [03:16<31:54, 14.18s/it]

Learning Settings. The input to the learner is a grammar and a vocabulary of unsegmented words. <CONT> We consider the three learning settings in #TARGET_REF<USE/> : Standard, Scholarseeded Knowledge and Cascaded. </CONT> The Standard setting is language-independent and fully unsupervised, while in the Scholar-seeded-Knowledge setting, some linguistic knowledge (in the form of affixes taken from grammar books) is seeded into the grammar trees before learning takes place. The Cascaded setting simulates the effect of seeding scholar knowledge in a language-independent manner by first running an AG of high precision to derive a set of affixes, and then seeding those affixes into the grammars.


 11%|█         | 16/150 [03:24<27:20, 12.24s/it]

We experimented with several setups using AGs for unsupervised segmentation. Language-Independent Morphological Segmenter. <CONT> LIMS is the best-on-average AG setup obtained by #TARGET_REF<USE/> when trained on six languages (English, German, Finnish, Estonian, Turkish and Zulu), which is the Cascaded PrStSu+SM configuration. </CONT> We use this AG setup for each of the four languages. We refer to this system as AG LIM S.


 11%|█▏        | 17/150 [03:33<25:12, 11.37s/it]

Best AG Configuration per Language. <CONT> In this experimental setup, we consider all nine grammars from #TARGET_REF<USE/> using both the Standard and the Cascaded approaches and choosing the one that is best for each polysynthetic language by training on the training set and evaluating on the development set. </CONT> We denote this system as AG BestL. Using Seeded Knowledge. To approximate the effect of Scholar-seeded-Knowledge in #REF, we used the training set to derive affixes and use them as scholar-seeded knowledge added to the grammars (before the learning happens).


 12%|█▏        | 18/150 [03:40<22:26, 10.20s/it]

Unsupervised approaches based on Adaptor Grammars show promise for morphological segmentation of low-resource polysynthetic languages. <CONT> We worked with the AG grammars developed by #TARGET_REF<USE/> Eskander et al. (, 2018 for languages that are not polysynthetic. </CONT> We showed that even when using these approaches and very little data, we can obtain encouraging results, and that using additional unsupervised data is a promising path.


 13%|█▎        | 19/150 [03:49<21:11,  9.70s/it]

We denote this system as AG BestL. Using Seeded Knowledge. <CONT> To approximate the effect of Scholar-seeded-Knowledge in #TARGET_REF<USE/>, we used the training set to derive affixes and use them as scholar-seeded knowledge added to the grammars (before the learning happens). </CONT> However, since affixes and stems are not distinguished in the training annotations from #REF, we only consider the first and last morphemes that appear at least five times. We call this setup AG Scholar BestL.


 13%|█▎        | 20/150 [04:09<27:58, 12.91s/it]

We experiment with four UtoAztecan languages: Mexicanero (MX), Nahuatl (NH), Wixarika (WX) and Yorem Nokki (YN) (#REF). <CONT> Adaptor Grammars (AGs) are nonparametric Bayesian models that generalize probabilistic context free grammars (PCFG), and have proven to be successful for unsupervised morphological segmentation, where a PCFG is a morphological grammar that specifies word structure (#REF; #REF; #TARGET_REF<BACKGROUND/> #REF. Our main goal is to examine the success of Adaptor Grammars for unsupervised morphological segmentation when applied to polysynthetic languages, where the morphology is synthetically complex (not simply agglutinative), and where resources are minimal. </CONT> We use the datasets introduced by #REF in an unsupervised fashion (unsegmented words). We design several AG learning setups: 1) use the best-on-average AG setup from #REF ; 2) optimize for language using just the small training vocabulary (unsegmented) and dev vocabulary (segmented) from #REF ; 3) appro

 14%|█▍        | 21/150 [04:19<25:21, 11.79s/it]

Evaluating different AG setups. Table 3 shows the performance of our AG setups on the four languages. The best AG setup learned for each of the four polysynthetic languages (AG BestL ) is the PrStSu+SM grammar using the Cascaded learning setup. <CONT> This is an interesting finding as the Cascaded PrSTSu+SM setup is in fact AG LIM S -the best-on-average AG setup obtained by #TARGET_REF<DIFFERENCE/> Table 4 : Best AG results compared to supervised approaches from #REF. </CONT> Bold indicates best scores.


 15%|█▍        | 22/150 [04:38<29:46, 13.95s/it]

However, there has been growing concern about potential biases in learning systems [1], [6] which can be difficult to analyse or query for explanations of their predictions, leading to an increasing number of studies investigating the way blackbox systems represent knowledge and make decisions [7], [9], [11], [19], [20]. Indeed, principled methods are now required that allow us to measure, understand and remove biases in our data in order for these systems to be truly accepted as a prominent part of our lives. In the domain of text, many modern approaches often begin by embedding the input text data into an embedding space that is used as the first layer in a subsequent deep network [4], [14]. <CONT> These word embeddings have been shown to contain the same biases #TARGET_REF<BACKGROUND/>, due to the source data from which they are trained. </CONT> In effect, biases from the source data, such as in the differences in representation for men and women, that have been found in many differ

 15%|█▌        | 23/150 [04:54<31:21, 14.82s/it]

In this paper, we make three contributions towards addressing these concerns. <CONT> First we propose a new version of the Word Embedding Association Tests (WEATs) studied in #TARGET_REF<EXTENDS/>, designed to demonstrate and quantify bias in word embeddings, which puts them on a firm foundation by using the Linguistic Inquiry and Word Count (LIWC) lexica [17] to systematically detect and measure embedding biases. </CONT> With this improved experimental setting, we find that European-American names are viewed more positively than African-American names, male names are more associated with work while female names are more associated with family, and that the academic disciplines of science and maths are more associated with male terms than the arts, which are more associated with female terms. Using this new methodology, we then find that there is a gender bias in the way different occupations are represented by the embedding. Furthermore, we use the latest official employment statistic

 16%|█▌        | 24/150 [05:06<28:50, 13.73s/it]

In this paper, we conduct three experiments on semantic word embeddings. <CONT> We first propose a new version of the Word Embedding Association Tests studied in #TARGET_REF<EXTENDS/> by using the LIWC lexica to systematically detect and measure the biases within the embedding, keeping the tests comparable with the same set of target words. </CONT> We further extend this work using additional sets of target words, and compare sentiment across male and female names. Furthermore, we investigate gender bias in words that represent different occupations, comparing these associations with UK national employment statistics. In the last experiment, we use orthogonal projections [2] to debias our word embeddings, and measure the reduction in the biases demonstrated in the previous two experiments.


 17%|█▋        | 25/150 [05:17<27:04, 13.00s/it]

Using the list of occupations from the previous section, we compared their association with each of the genders with the ratio of the actual number of men and women working in those roles, as recorded in the official statistics [15], where 1 indicates only men work in this role, and 0 only women. We found that there is a strong, significant correlation (ρ = 0.57, p-value < 10 −6 ) between the word embedding association between gender and occupation and the number of people of each gender in the United Kingdom working in those roles. <CONT> This supports a similar finding for U.S. employment statistics using an independent set of occupations found in #TARGET_REF<SIMILARITY/>. </CONT>


 17%|█▋        | 26/150 [05:35<29:48, 14.42s/it]

<CONT> In this paper, we have introduced the LIWC-WEAT, a set of objective tests extending the association tests in #TARGET_REF<EXTENDS/> by using the LIWC lexica to measure bias within word embeddings. </CONT> We found bias in both the associations of gender and race, as first described in [3], while additionally finding that male names have a slightly higher positive association than female names. Biases found in the embedding were also shown to reflect biases in the real world and the media, where we found a correlation between the number of men and women in an occupation and its association with each set of male and female names. Finally, using a projection algorithm [2], we were able to reduce the gender bias shown in the embeddings, resulting in a decrease in the difference between associations for all tests based upon gender. Further work in this direction will include removing bias in n-gram embeddings, embeddings that include multiple languages and new procedures for both gene

 18%|█▊        | 27/150 [05:52<31:30, 15.37s/it]

In this paper, we have introduced the LIWC-WEAT, a set of objective tests extending the association tests in [3] by using the LIWC lexica to measure bias within word embeddings. <CONT> We found bias in both the associations of gender and race, as first described in #TARGET_REF<USE/>, while additionally finding that male names have a slightly higher positive association than female names. </CONT> Biases found in the embedding were also shown to reflect biases in the real world and the media, where we found a correlation between the number of men and women in an occupation and its association with each set of male and female names. Finally, using a projection algorithm [2], we were able to reduce the gender bias shown in the embeddings, resulting in a decrease in the difference between associations for all tests based upon gender. Further work in this direction will include removing bias in n-gram embeddings, embeddings that include multiple languages and new procedures for both generati

 19%|█▊        | 28/150 [06:04<29:04, 14.30s/it]

<CONT> Taking the list of target European-American and African-American names used in #TARGET_REF<USE/>, we tested each of them for their associated with the positive and negative emotion concepts found in [17] by using the methodology described by Eq. 3 in Sec. II-B, replacing the short list of words used to originally represent pleasant and unpleasant attribute sets. </CONT> Our test found that while both European-American names and African-American names are more associated with positive emotions than negative emotions, the test showed that European-American names are more associated with positive emotions than their African-American counterparts, as shown in Fig. 1a. This finding supports the association test in [3], where they also found that European-American names were more pleasant than African-American names.


 19%|█▉        | 29/150 [06:21<30:36, 15.18s/it]

The results of our test again support the findings of [3], with Maths and Science terms being more closely associated with males, while Arts terms are more closely associated with females, as shown in Fig. 1b. <CONT> 3) Association of Gender with Career and Family : Taking the list of target gendered names used in #TARGET_REF<USE/>, we tested each of them for their associated with the career and family concepts using the categories of 'work' and 'family' found in LIWC [17]. </CONT> As shown in Fig. 1c, we found that the set of male names was more associated with the concept of work, while the female names were more associated with family, mirroring the results found in [3]. Extending this test, we generated a much larger set of male and female target names from an online list of baby names 1. Repeating the same test on this larger set of names, we found that male and female names were much less separated than suggested by previous results, with only minor differences between the two, a

 20%|██        | 30/150 [06:33<28:32, 14.27s/it]

Taking the list of target European-American and African-American names used in [3], we tested each of them for their associated with the positive and negative emotion concepts found in [17] by using the methodology described by Eq. 3 in Sec. II-B, replacing the short list of words used to originally represent pleasant and unpleasant attribute sets. Our test found that while both European-American names and African-American names are more associated with positive emotions than negative emotions, the test showed that European-American names are more associated with positive emotions than their African-American counterparts, as shown in Fig. 1a. <CONT> This finding supports the association test in #TARGET_REF<SIMILARITY/><USE/>, where they also found that European-American names were more pleasant than African-American names. </CONT>


 21%|██        | 31/150 [06:51<30:17, 15.27s/it]

A further test was conducted to find the association between words related to different subject disciplines (e.g. arts, maths, science) with each of the genders using the 'he' and'she' categories from LIWC [17]. <CONT> The results of our test again support the findings of #TARGET_REF<SIMILARITY/>, with Maths and Science terms being more closely associated with males, while Arts terms are more closely associated with females, as shown in Fig. 1b. </CONT> 3) Association of Gender with Career and Family : Taking the list of target gendered names used in [3], we tested each of them for their associated with the career and family concepts using the categories of 'work' and 'family' found in LIWC [17]. As shown in Fig. 1c, we found that the set of male names was more associated with the concept of work, while the female names were more associated with family, mirroring the results found in [3]. Extending this test, we generated a much larger set of male and female target names from an online

 21%|██▏       | 32/150 [07:07<30:45, 15.64s/it]

In Experiment 1, we previously found that the disciplines of science and maths were more associated with male terms in the embedding, while the arts were closer to female terms. The association of each of these subject disciplines with gender after orthogonal projection was found to be more balanced, with closer to equal association for both male and female terms, shown in Fig. 3a. <CONT> Male and Females names tested in #TARGET_REF<USE/> showed a clear distinction in their association with work and family respectively, with our replication of the test in Sec. III-B3 finding the same results. </CONT> Performing the same tests again after applying the gender projection to both name lists, we wished to quantify the change in associations. We calculated the change in the distance between the centroids of each set of names before and after applying the orthogonal gender projection, finding that the association with work for males and family for females reduced, closing the gap between male

 22%|██▏       | 33/150 [07:20<28:46, 14.76s/it]

Moreover, some linguists suggested that more than one translation should be collected to capture deeper layers of meaning (#REF). This work is a contribution to the Computational Language Documentation (CLD) research field, that aims to replace part of the manual steps performed by linguists during language documentation initiatives by automatic approaches. <CONT> Here we investigate the unsupervised word discovery and segmentation task, using the bilingual-rooted approach from #TARGET_REF<USE/>. </CONT> There, words in the well-resourced language are aligned to unsegmented phonemes in the endangered language in order to identify group of phonemes, and to cluster them into word-like units. We experiment with the Mboshi-French parallel corpus, translating the French text into four other well-resourced languages in order to investigate language impact in this CLD approach.


 23%|██▎       | 34/150 [07:32<27:01, 13.98s/it]

1 The languages added to the dataset are: English, German, Portuguese and Spanish. Table 1 shows some statistics for the produced Multilingual Mboshi parallel corpus. <CONT> 2 Bilingual Unsupervised Word Segmentation/Discovery Approach: We use the bilingual neuralbased Unsupervised Word Segmentation (UWS) approach from #TARGET_REF<USE/> to discover words in Mboshi. </CONT> In this approach, Neural Machine Translation (NMT) models are trained between language pairs, using as source language the translation (word-level) and as target, the language to document (unsegmented phonemic sequence). Due to the attention mechanism present in these networks (#REF), posterior to training, it is possible to retrieve soft-alignment probability matrices between source and target sequences.


 23%|██▎       | 35/150 [07:42<24:31, 12.80s/it]

The product of this approach is a set of (discovered-units, translation words) pairs. <CONT> Multilingual Leveraging: In this work we apply two simple methods for including multilingual information into the bilingual models from #TARGET_REF<USE/>. </CONT> The first one, Multilingual Voting, consists of merging the information learned by models trained with different language pairs by performing a voting over the final discovered boundaries. The voting is performed by applying an agreement threshold T over the output boundaries. This threshold balances between accepting all boundaries from all the bilingual models (zero agreement) and accepting only input boundaries discovered by all these models (total agreement).


 24%|██▍       | 36/150 [07:55<24:01, 12.64s/it]

<CONT> Recent work on deep learning syntactic parsing models has achieved notably good results, e.g., #REF with 92.4 F 1 on Penn Treebank constituency parsing and #TARGET_REF<BACKGROUND/> with 92.8 F 1. </CONT> In this paper we borrow from the approaches of both of these works and present a neural-net parse reranker that achieves very good results, 93.8 F 1, with a comparatively simple architecture. In the remainder of this section we outline the major difference between this and previous workviewing parsing as a language modeling problem. Section 2 looks more closely at three of the most relevant previous papers. We then describe our exact model (Section 3), followed by the experimental setup and results (Sections 4 and 5).


 25%|██▍       | 37/150 [08:04<22:10, 11.77s/it]

A generative parsing model parses a sentence (x) into its phrasal structure (y) according to where Y(x) lists all possible structures of x. <CONT> If we think of a tree (x, y) as a sequence (z) #TARGET_REF<USE/> as illustrated in Figure 1, we can define a probability distribution over (x, y) as follows: which is equivalent to Equation (1). </CONT> We have reduced parsing to language modeling and can use language modeling techniques of estimating P (z t |z 1, · · ·, z t−1 ) for parsing.


 25%|██▌       | 38/150 [08:10<18:25,  9.87s/it]

We look here at three neural net (NN) models closest to our research along various dimensions. <CONT> The first (#REF) gives the basic language modeling architecture that we have adopted, while the other two #TARGET_REF<BACKGROUND/>; #REF) are parsing models that have the current best results in NN parsing. </CONT>


 26%|██▌       | 39/150 [08:26<21:37, 11.68s/it]

<CONT> We use the Wall Street Journal (WSJ) of the Penn Treebank (#REF) for training (2-21), development (24) and testing (23) and millions of auto-parsed "silver" trees (#REF; #REF; #TARGET_REF<USE/> for tritraining. </CONT> To obtain silver trees, we parse the entire section of the New York Times (NYT) of the fifth Gigaword (#REF ) with a product of eight Berkeley parsers (#REF) 2 and ZPar (#REF) and select 24 million trees on which both parsers agree (#REF). We do not resample trees to match the sentence length distribution of the NYT to that of the WSJ (Vinyals et 1 The code and trained models used for experiments are available at github.com/cdg720/emnlp2016. 2 We use the reimplementation by #REF. (#REF) performed better when trained on all of 24 million trees than when trained on resampled two million trees.


 27%|██▋       | 40/150 [08:37<20:58, 11.44s/it]

<CONT> We compare LSTM-LM (GS) to two very strong semi-supervised NN parsers: an ensemble of five MTPs trained on 11 million trees of the highconfidence corpus 4 (HC) #TARGET_REF<USE/> ; and an ensemble of six one-to-many sequence models trained on the HC and 4.5 millions of EnglishGerman translation sentence pairs (#REF). </CONT> We also compare LSTM-LM (GS) to best performing non-NN parsers in the literature. Parsers' parsing performance along with their training data is reported in Table 3. LSTM-LM (GS) outperforms all the other parsers with 93.1 F 1.


 27%|██▋       | 41/150 [08:46<19:26, 10.70s/it]

We also wish to develop a complete parsing model using the LSTM-LM framework. Table 3 : Evaluation of models trained on the WSJ and additional resources. <CONT> Note that the numbers of #TARGET_REF<BACKGROUND/> and #REF are not directly comparable as their models are evaluated on OntoNotesstyle trees instead of PTB-style trees. </CONT> E(LSTM-LMs (GS)) is an ensemble of eight LSTM-LMs (GS). X/Y in Silver column indicates the number of silver trees used to train Charniak parser and LSTM-LM.


 28%|██▊       | 42/150 [08:58<20:09, 11.20s/it]

Transforming tree representations for the purpose of parsing is not a new idea. <CONT> It has been done for constituency parsing for example by #REF but also for dependency parsing for example by #TARGET_REF<BACKGROUND/>. </CONT> #REF modified the representation of several constructions in several languages and obtained a consistent improvement in parsing accuracy. In this paper, we will investigate the case of the verb group construction and attempt to reproduce the study by #REF on UD treebanks to find out whether or not the alternative representation is useful for parsing with UD. have shown that modifying coordination constructions and verb groups from their representation in the Prague Dependency Treebank (henceforth PDT) to a representation described in Melčuk (1988) (Mel'čuk style, henceforth MS) improves dependency parsing for Czech.


 29%|██▊       | 43/150 [09:08<19:33, 10.97s/it]

For that reason, de #REF suggest that those representations could be modified for the purpose of parsing, thus creating a parsing representation. Transforming tree representations for the purpose of parsing is not a new idea. It has been done for constituency parsing for example by #REF but also for dependency parsing for example by #REF. <CONT> #TARGET_REF<BACKGROUND/> modified the representation of several constructions in several languages and obtained a consistent improvement in parsing accuracy. </CONT> In this paper, we will investigate the case of the verb group construction and attempt to reproduce the study by #REF on UD treebanks to find out whether or not the alternative representation is useful for parsing with UD.


 29%|██▉       | 44/150 [09:17<17:53, 10.13s/it]

4. Transform the parsed data back to the original representation (for comparison with the original gold standard). <CONT> #TARGET_REF<BACKGROUND/> have shown that these same modifications as well as the modification of nonprojective structures helps parsing in four languages. </CONT> #REF conducted a study over the alternative representations of 6 constructions across 5 parsing models for English and found that some of them are easier to parse than others. Their results were consistent across parsing models. The motivations behind those two types of studies are different.


 30%|███       | 45/150 [09:26<17:26,  9.97s/it]

In the PDT, main verbs are the head of auxiliary dependencies, as in Figure 1. <CONT> #TARGET_REF<BACKGROUND/> show that making the auxiliary the head of the dependency as in Figure 2 is useful for parsing Czech and Slovenian. </CONT> #REF verb groups are easier to parse when the auxiliary is the head (as in PDT) than when the verb is the head (as in MS). Since UD adopts the PDT style representation of verb groups, it would be interesting to find out whether or not transforming them to MS could also improve parsing. This is what will be attempted in this study.


 31%|███       | 46/150 [09:39<18:35, 10.72s/it]

<CONT> In this paper, we have attempted to reproduce a study by #TARGET_REF<USE/><DIFFERENCE/> that has shown that making auxiliaries heads in verb groups improves parsing but failed to show that those results port to parsing with Universal Dependencies. Contrary to expectations, the study has given evidence that main verbs should stay heads of auxiliary dependency relations for parsing with UD. </CONT> The benefits of error analyses for such a study have been highlighted because they allow us to shed more light on the different ways in which the transformations affect the parsing output. Experiments suggest that gains obtained from verb group transformations in previous studies have been obtained mainly because those transformations help disambiguating between main verbs and auxiliaries. It is however still an open question why the VG transformation hurts parsing accuracy in the case of UD.


 31%|███▏      | 47/150 [09:50<18:59, 11.06s/it]

It has been done for constituency parsing for example by #REF but also for dependency parsing for example by #REF. #REF modified the representation of several constructions in several languages and obtained a consistent improvement in parsing accuracy. <CONT> In this paper, we will investigate the case of the verb group construction and attempt to reproduce the study by #TARGET_REF<USE/> on UD treebanks to find out whether or not the alternative representation is useful for parsing with UD. </CONT> have shown that modifying coordination constructions and verb groups from their representation in the Prague Dependency Treebank (henceforth PDT) to a representation described in Melčuk (1988) (Mel'čuk style, henceforth MS) improves dependency parsing for Czech. The procedure they follow is as follows:


 32%|███▏      | 48/150 [10:04<19:58, 11.75s/it]

<CONT> We will follow the methodology from #TARGET_REF<USE/>, that is, to transform, parse and then detransform the data so as to compare the original and the transformed model on the original gold standard. </CONT> The method from #REF which consists in comparing the baseline and the transformed data on their respective gold standard is less relevant here because UD is believed to be a useful representation and that the aim will be to improve parsing within that representation. However, as was argued in that study, their method can give an indication of the learnability of a construction and can potentially be used to understand the results obtained by the parse-transform-detransform method. For this reason, this method will also be attempted. In addition, the original parsed data will also be transformed into the MS gold standard for comparison with the MS parsed data on the MS gold standard.


 33%|███▎      | 49/150 [10:14<19:08, 11.37s/it]

Dutch was discarded because the back transformation accuracy was low (90%). This is due to inconsistencies in the annotation: verb groups are annotated as a chain of dependency relations. This leaves us with a total of 25 out of the 37 treebanks. <CONT> For comparability with the study in #TARGET_REF<USE/>, and because we used a slightly modified version of their algorithm, we also tested the approach on the versions of the Czech and Slovenian treebanks that they worked on, respectively version 1.0 of the PDT (#REF ) and the 2006 version of SDT (#REF). </CONT> overview of the data used for the experiments.


 33%|███▎      | 50/150 [10:27<19:51, 11.92s/it]

Recently, several models and variants have been proposed with increased research efforts towards multilingual machine translation (#REF; #REF; #REF; #REF; #REF). The main motivation of multilingual models is the effect of transfer learning that enables machine translation systems to benefit from relationships between languages and training signals that come from different datasets (#REF; #REF; #REF). Another aspect that draws interest in translation models is the effective computation of sentence representations using the translation task as an auxiliary semantic signal (#REF; #REF; #REF; #REF ). <CONT> An important feature that enables an immediate use of the MT-based representations in other downstream tasks is the creation of fixed-sized sentence embeddings #TARGET_REF<BACKGROUND/>. </CONT> However, the effects of the size of sentence embeddings and the relation between translation performance and meaning representation quality are not entirely clear.


 34%|███▍      | 51/150 [10:40<20:00, 12.13s/it]

An important feature that enables an immediate use of the MT-based representations in other downstream tasks is the creation of fixed-sized sentence embeddings (Cífka and #REF). However, the effects of the size of sentence embeddings and the relation between translation performance and meaning representation quality are not entirely clear. <CONT> Recent studies based on NMT either focus entirely on the use of MT-based sentence embeddings in other tasks (#REF), on translation quality (#REF), on speed comparison (#REF), or only exploring a bilingual scenario #TARGET_REF<BACKGROUND/>. </CONT> In this paper, we are interested in exploring a cross-lingual intermediate shared layer (called attention bridge) in an attentive encoder-decoder MT model. This shared layer serves as a fixedsize sentence representation that can be straightforwardly applied to downstream tasks.


 35%|███▍      | 52/150 [10:51<18:58, 11.62s/it]

The overall architecture is illustrated in Figure 1 (see also Vázquez et al., 2019). <CONT> Due to the attentive connection between encoders and decoders we call this layer attention bridge, and its architecture is an adaptation from the model proposed by #TARGET_REF<USE/>. </CONT> Finally, each decoder follows a common attention mechanism in NMT, with the only exception that the context vector is computed on the attention bridge, and the initialization is performed by a mean pooling over it. Hence, the decoder receives the information only through the shared attention bridge. The fixed-sized representation coming out of the shared layer can immediately be applied to downstream tasks.


 35%|███▌      | 53/150 [10:58<16:47, 10.39s/it]

Table 2 : Results from supervised similarity tasks (SICK-R and STSB), measured using Pearson's (r) and Spearman's (ρ) correlation coefficients (r/ρ). The average across unsupervised similarity tasks on Pearson's measures are displayed in the right-most column. <CONT> Results with † taken from #TARGET_REF<USE/> of multilingual training. </CONT> We can see that multilingual training objectives are generally helpful for the trainable downstream tasks.


 36%|███▌      | 54/150 [11:08<16:35, 10.37s/it]

We are also interested in the translation quality to verify the appropriateness of our models with respect to the main objective they are trained for. For this, we adopt the in-domain development and evaluation dataset from the ACL-WMT07 shared task. Sentences are encoded using Byte-Pair Encoding (#REF), with 32,000 merge operations for each language. <CONT> 4 SentEval: Classification tasks Table 1 shows the performance of our models on two popular tasks (SNLI and SICK-E) as in #TARGET_REF<USE/> as well as the average of all 10 SentEval downstream tasks. </CONT> The experiments reveal two important findings:


 37%|███▋      | 55/150 [11:19<16:26, 10.38s/it]

Baseline system in the right-most column. model is provided by a bilingual setting with only one attention head. <CONT> This is in line with the findings of #TARGET_REF<SIMILARITY/> and could also be expected as the model is more strongly pushed into a dense semantic abstraction that is beneficial for measuring similarities without further training. </CONT> More surprising is the negative effect of the multilingual models. We believe that the multilingual information encoded jointly in the attention bridge hampers the results for the monolingual semantic similarity measured with the cosine distance, while it becomes easier in a bilingual scenario where the vector encodes only one source language, English in this case.


 37%|███▋      | 56/150 [11:30<16:37, 10.61s/it]

#REF proposed sequence-to-sequence learning for question generation from text passages. <CONT> #TARGET_REF<BACKGROUND/> utilized the answer-position, and linguistic features such as named entity recognition (NER) and parts of speech (POS) information to further improve the QG performance as the model is aware that for which answer a question need to be generated. </CONT> In the work of a multi-perspective context matching algorithm is employed. #REF use a set of rich linguistic features along with a NQG model. (#REF) used the matching algorithm proposed by to compute the similarity between the target answer and the passage for collecting relevant contextual information under the different perspective, so that contextual information can be better considered by the encoder.


 38%|███▊      | 57/150 [11:41<16:26, 10.61s/it]

<CONT> In previous works #TARGET_REF<BACKGROUND/>; #REF), named entity type features have been used. These features, however, only allow for the encoding of coarse level information such as knowledge of if an entity belongs to a set of predefined categories such as 'PERSON', 'LOCATION' and 'ORGANI-ZATION'. To alleviate this, we use the knowledge in the form of linked entities. </CONT> In our experiments, we use Wikipedia as the knowledge base for which to link entities. This specific task (also known as Wikification (#REF) ) is the task of identifying concepts and entities in text and disambiguation them into the most specific corresponding Wikipedia pages.


 39%|███▊      | 58/150 [11:48<15:01,  9.79s/it]

We evaluated the performance of our approach on SQuAD (#REF) and MS MARCO v2.1 (#REF). SQuAD is composed of more than 100K questions posed by crowd workers on 536 Wikipedia articles. <CONT> We used the same split as #TARGET_REF<USE/>. </CONT> MS MARCO datasets contains 1 million queries with corresponding answers and passages. All questions are sampled from real anonymized user queries and context passages are extracted from real web documents.


 39%|███▉      | 59/150 [12:05<17:44, 11.70s/it]

<CONT> PredPatt 1 #TARGET_REF<BACKGROUND/> ) is a pattern-based framework for predicate-argument extraction. </CONT> It defines a set of interpretable, extensible and non-lexicalized patterns based on Universal Dependencies (UD) (de #REF), and extracts predicates and arguments through these manual patterns. Figure 1 shows the predicates and arguments extracted by PredPatt from the sentence: "Chris, the designer, wants to launch a new brand." The underlying predicate-argument structure constructed by PredPatt is a directed graph, where a special dependency ARG is built between a predicate head token and its arguments' head tokens, and the original UD relations are retained within predicate phrases and argument phrases. For example, Figure 2 shows the directed graph for the predicate-argument extraction (1) and (2) in Figure 1. Compared to other existing systems for predicate-argument extraction (#REF; #REF; #REF), the use of manual language-agnostic patterns on UD makes PredPatt a well-

 40%|████      | 60/150 [12:16<17:13, 11.48s/it]

#REF uses PredPatt to help augmenting data with Universal Decompositional Semantics. #REF adapts PredPatt to data generation for cross-lingual open information extraction. <CONT> However, the evaluation of PredPatt has been restricted to manually-checked extractions over a small set of sentences #TARGET_REF<BACKGROUND/>, which lacks gold annotations to conduct an objective and reproducible evaluation, and inhibits the updates of patterns in PredPatt. </CONT> Chris, the designer, wants to launch a new brand. In this work, we aim to conduct a large-scale and reproducible evaluation of PredPatt by introducing a large set of gold annotations gathered from PropBank (#REF).


 41%|████      | 61/150 [12:27<17:02, 11.49s/it]

<CONT> PredPatt extracts predicates and arguments in four stages #TARGET_REF<BACKGROUND/> : (1) predicate and argument root identification, (2) argument resolution, (3) predicate and argument phrase extraction, and (4) optional post-processing. </CONT> We analyze PredPatt extraction in each of these stages on the held-out set, and make 19 improvements to PredPatt patterns. Due to lack of space, we only highlight one improvement for each stage below. Fixed-MWE-pred: The UD version 2.0 introduces a new dependency relation fixed for identifying fixed function-word "multiword expressions" (MWEs). To accommodate this new feature, we add patterns to identify the MWE predicate and its argument.


 41%|████▏     | 62/150 [12:35<15:18, 10.44s/it]

Neural machine translation (NMT) (#REF; #REF) is rapidly proving itself to be a strong competitor to other statistical machine translation methods. <CONT> However, it still lags behind other statistical methods on very lowresource language pairs #TARGET_REF<BACKGROUND/>; #REF). </CONT> A common strategy to improve learning of lowresource languages is to use resources from related languages (#REF). However, adapting these resources is not trivial. NMT offers some simple ways of doing this.


 42%|████▏     | 63/150 [12:51<17:23, 11.99s/it]

In this paper, we explore the opposite scenario, where the parent language pair is also lowresource, but related to the child language pair. <CONT> We show that, at least in the case of three Turkic languages (Turkish, Uzbek, and Uyghur), the original method of #TARGET_REF<DIFFERENCE/> does not always work, but it is still possible to use the parent model to considerably improve the child model. </CONT> The basic idea is to exploit the relationship between the parent and child language lexicons. Zoph et al.'s original method makes no assumption about the relatedness of the parent and child languages, so it effectively makes a random assignment of the parent-language word embeddings to child-language words. But if we assume that the parent and child lexicons are related, it should be beneficial to transfer source word embeddings from parent-language words to their child-language equivalents. Thus, the problem amounts to finding a representation of the data that ensures a sufficient over

 43%|████▎     | 64/150 [13:00<15:55, 11.11s/it]

<CONT> We follow the transfer learning approach proposed by #TARGET_REF<USE/>. In their work, a parent model is first trained on a high-resource language pair. Then the child model's parameter values are copied from the parent's and are fine-tuned on its low-resource data. The source word embeddings are copied with the rest of the model, with the ith parent-language word embedding being assigned to the ith childlanguage word. Because the parent and child source languages have different vocabularies, this amounts to randomly assigning parent source word embeddings to child source words. </CONT>


 43%|████▎     | 65/150 [13:09<15:01, 10.61s/it]

<CONT> The basic idea of our method is to extend the transfer method of #TARGET_REF<EXTENDS/> to share the parent and child's source vocabularies, so that when source word embeddings are transferred, a word that appears in both vocabularies keeps its embedding. </CONT> In order for this to work, it must be the case that the parent and child languages have considerable vocabulary overlap, and that when a word occurs in both languages, it often has a similar meaning in both languages. Thus, we need to process the data to make these two assumptions hold as much as possible.


 44%|████▍     | 66/150 [13:21<15:21, 10.96s/it]

We also optimized the vocabulary size and the number of BPE operations for the word-based and BPEbased systems, respectively, to maximize the tokenized BLEU on the development set. After translation at test time, we rejoined BPE segments, recased, and detokenized. Finally, we evaluated using case-sensitive BLEU. As a baseline, we trained a child model using BPE but without transfer (that is, with weights randomly initialized). <CONT> We also compared against a word-based baseline (without transfer) and two word-based systems using transfer without vocabulary-sharing, corresponding with the method of #TARGET_REF<USE/> ( §2.2): one where the target word embeddings are fine-tuned, and one where they are frozen. </CONT>


 45%|████▍     | 67/150 [13:30<14:35, 10.54s/it]

<CONT> In this paper, we have shown that the transfer learning method of #TARGET_REF<DIFFERENCE/>, while appealing, might not always work in a low-resource context. However, by combining it with BPE, we can improve NMT performance on a low-resource language pair by exploiting its lexical similarity with another related, low-resource language. </CONT> Our results show consistent improvement in two Turkic languages. Our approach, which relies on segmenting words into subwords, seems well suited to agglutinative languages; further investigation would be needed to confirm whether our method works on other types of languages.


 45%|████▌     | 68/150 [13:39<13:39,  9.99s/it]

However, it still lags behind other statistical methods on very lowresource language pairs (#REF; #REF). A common strategy to improve learning of lowresource languages is to use resources from related languages (#REF). However, adapting these resources is not trivial. NMT offers some simple ways of doing this. <CONT> For example, #TARGET_REF<BACKGROUND/> train a parent model on a (possibly unrelated) high-resource language pair, then use this model to initialize a child model which is further trained on a low-resource language pair. </CONT>


 46%|████▌     | 69/150 [13:51<14:14, 10.55s/it]

It is unclear whether such an approach can be directly applied to E2E models. <CONT> Phoneme-only E2E systems have been shown to have inferior performance compared to grapheme or wordpiece models (WPM) in general #TARGET_REF<BACKGROUND/> 17], but shows better recognition of rare words and proper nouns. </CONT> In this work we propose to incorporate phonemes to a wordpiece E2E model as modeling units and use phoneme-level FST for contextual biasing. We propose a word-frequency based sampling strategy to randomly tokenize rare words into phonemes in the target sequence using a lexicon. This approach also mitigates accuracy regressions that have been observed when using phoneme-only E2E models [16, 17].


 47%|████▋     | 70/150 [14:04<14:51, 11.15s/it]

Phoneme-only E2E systems have been shown to have inferior performance compared to grapheme or wordpiece models (WPM) in general [16, 17], but shows better recognition of rare words and proper nouns. In this work we propose to incorporate phonemes to a wordpiece E2E model as modeling units and use phoneme-level FST for contextual biasing. We propose a word-frequency based sampling strategy to randomly tokenize rare words into phonemes in the target sequence using a lexicon. <CONT> This approach also mitigates accuracy regressions that have been observed when using phoneme-only E2E models #TARGET_REF<BACKGROUND/> 17]. </CONT> We train our model using only American English data and thus its wordpieces and phoneme set (no data from foreign languages).


 47%|████▋     | 71/150 [14:13<14:03, 10.68s/it]

The output of the model is a single softmax whose symbol set is the union of wordpiece and phoneme symbols. We use a pronunciation lexicon to obtain phoneme sequences of words. <CONT> Since phonemes show strength in recognizing rare words #TARGET_REF<USE/>, we want to present these words as phonemes more often. </CONT> In a target sentence, we decide to randomly present the i th word as phonemes with a probability, 1.0) where p0 and T are constants and c(i) is an integer representing the number of time the word appears in our entire training corpus.


 48%|████▊     | 72/150 [14:24<13:48, 10.63s/it]

, 1.0) where p0 and T are constants and c(i) is an integer representing the number of time the word appears in our entire training corpus. Therefore, the words that appear T times or less will be presented as phonemes with probability p0. For words that appear more than T times, the more frequent they are, the less likely they are presented as phonemes 2. Note that the decision of whether to use wordpieces or phonemes is made randomly at each gradient iteration, and thus a given sentence could have different target sequences at different epochs. <CONT> We use context-independent phonemes as in #TARGET_REF<USE/>. </CONT>


 49%|████▊     | 73/150 [14:35<13:50, 10.78s/it]

<CONT> To generate words as outputs, we search through a decoding graph similar to #TARGET_REF<EXTENDS/> but accept both phonemes and wordpieces. </CONT> An example is shown in Figure 2. The decoding FST has wordpiece loops around state 0 (we show only a few for simplicity), but also has a pronunciation section (states 1 through 14). The pronunciation section is a prefix tree with phonemes as inputs, and outputs are wordpieces of the corresponding word produced by the WPM in Section 3.1. Specifically, for each word in the biasing list, we look up pronunciations from the lexicon and split the word into its constituent wordpieces.


 49%|████▉     | 74/150 [14:49<14:53, 11.75s/it]

Secondly, we see in Table 1 that all models performs substantially better with biasing. The WER reductions range from 9%-23% relatively for different models when compared to the no-bias case. Comparing different biasing strategies, we find that the wordpiece-phoneme model performs the best: 16% relatively better than the grapheme model, and 8.3% better than the wordpiece model. <CONT> We attribute the superior per- formance of the wordpiece-phoneme model to the robustness of phonemes to OOV words, as observed in #TARGET_REF<SIMILARITY/>. </CONT> Since the wordpiece-phoneme model contains both wordpieces and phonemes as modeling units, we can further perform wordpiece biasing in addition to phoneme-based biasing by building a wordpiece FST in parallel to the phoneme FST.


 50%|█████     | 75/150 [14:59<14:08, 11.32s/it]

The wordpiecephoneme model performs a little better than the grapheme model, and we attribute that to the higher frequency of wordpieces during training. Compared to the wordpiece model, the wordpiece-phoneme model has a slight degradation (0.1% absolute WER). This is due to the introduction of phonemes in modeling. One potential approach to improve regression is to incorporate an English external language model for phonemes in rescoring, similarly to the wordpiece-based rescoring in [10]. <CONT> However, we note that the regression is significantly smaller than the all-phoneme model in #TARGET_REF<DIFFERENCE/>. </CONT>


 51%|█████     | 76/150 [15:10<13:39, 11.07s/it]

Figure 2 : Decoding graph for the words "crèche" (daycare) with English cross lingual pronunciation "k r\ E S" and "créteil" (a city) with pronunciation "k r\ E t E j". For clarity, we omitted most wordpieces for the state 0. <CONT> Based on #TARGET_REF<USE/>, we add two improvements to the decoding strategy. </CONT> First, during decoding we consume as many input epsilon arcs as possible thus guaranteeing that all wordpieces in word are produced when all corresponding phonemes are seen in the input. Second, we merge paths that have the same output symbols.


 51%|█████▏    | 77/150 [15:36<18:54, 15.55s/it]

This tutorial introduces the advances in deep Bayesian learning with abundant applications for natural language understanding ranging from speech recognition (#REF; #REF) to document summarization (#REF ), text classification (#REF; #REF), text segmentation (#REF), information extraction (#REF), image caption generation (#REF; #REF), sentence generation (#REFb), dialogue control (#REF; #REFa), sentiment classification, recommendation system, question answering (#REF) and machine translation, to name a few. Traditionally, "deep learning" is taken to be a learning process where the inference or optimization is based on the real-valued deterministic model. The "semantic structure" in words, sentences, entities, actions and documents drawn from a large vocabulary may not be well expressed or correctly optimized in mathematical logic or computer programs. The "distribution function" in discrete or continuous latent variable model for natural language may not be properly decomposed or estima

 52%|█████▏    | 78/150 [15:44<15:56, 13.28s/it]

<CONT> Most state-of-the-art event trigger labeling approaches (#REF; #REFb; #REF; #TARGET_REF<BACKGROUND/> follow the standard supervised learning paradigm. </CONT> For each event type, experts first write annotation guidelines. Then, annotators follow them to label event triggers in a large dataset. Finally, a classifier is trained over the annotated triggers to label the target events. The supervised paradigm requires major human efforts both in producing high-quality guidelines and in dataset annotation for each new event type.


 53%|█████▎    | 79/150 [15:58<16:07, 13.62s/it]

Then, whenever a new event type is introduced for labeling, we only need to collect a seed list for it from its description, and provide it as input to the system. <CONT> We developed a seed-based system (Section 3), based on a state-of-the-art fully-supervised event extraction system #TARGET_REF<EXTENDS/>. </CONT> When evaluated on the ACE-2005 dataset, 1 our system outperforms the fully-supervised one (Section 4), even though we don't utilize any annotated triggers of the test events during the labeling phase, and only Figure 1 : Flow of the seed-based approach use the seed triggers appearing in the ACE annotation guidelines. This result contributes to the broader line of research on avoiding or reducing annotation cost in information extraction (Section 5). In particular, it suggests the potential utility of the seed-based approach in scenarios where manual annotation per each new event is too costly.


 53%|█████▎    | 80/150 [16:07<14:10, 12.14s/it]

This section describes the method we designed to implement the seed-based approach. To assess our approach, we compare it (Section 4) with the common fully-supervised approach, which requires annotated triggers for each target event type. <CONT> Therefore, we implemented our system by adapting the state-of-the-art fully-supervised event extraction system of #TARGET_REF<EXTENDS/><USE/>, modifying mechanisms relevant for features and for trigger labels, as described below. </CONT> Hence the systems are comparable with respect to using the same preprocessing and machine learning infrastructure.


 54%|█████▍    | 81/150 [16:20<14:15, 12.40s/it]

<CONT> To implement the seed-based approach for trigger labeling, we adapt only the trigger classification part in the #TARGET_REF<EXTENDS/> fully-supervised system, ignoring arguments. </CONT> Given a set of new target event types T we classify every test sentence once for each event type t ∈ T. Hence, when classifying a sentence for t, the labeling of each token x i is binary, where y i ∈ {, ⊥} marks whether x i is a trigger of type t ( ) or not (⊥). For instance x i ="visited" labeled as when classifying for t=Meet, means x i is labeled as a Meet trigger. To score the binary label assignment (x i, y i ), we use a small set of features that assess the similarity between x i and t's given seed list.


 55%|█████▍    | 82/150 [16:32<13:59, 12.34s/it]

We evaluate our seed-based approach (Section 2) in comparison to the fully-supervised approach implemented by #REF (Section 3). To maintain comparability, we use the ACE-2005 documents with the same split as in (#REF; #REFb; #REF) to 40 test documents and 559 training documents. <CONT> However, some evaluation settings differ: #TARGET_REF<DIFFERENCE/> train a multi-class model for all 33 ACE-2005 event types, and classify all tokens in the test documents into these event types. Our approach, on the other hand, trains an eventindependent binary classifier, while testing on new event types that are different from those utilized for training. </CONT> We next describe how this setup is addressed in our evaluation.


 55%|█████▌    | 83/150 [16:45<14:00, 12.54s/it]

<CONT> The event extraction system of #TARGET_REF<BACKGROUND/> labels triggers and their arguments for a set of target event types L, for which annotated training documents are provided. </CONT> The system utilizes a structured perceptron with beam search (#REF;. To label triggers, the system scans each sentence x, and creates candidate assignments y, that for each token x i assign each possible label y i ∈ L ∪ {⊥} (⊥ meaning x i is not a trigger at all). The score of an assignment (x i, y i ) is calculated as w · f, where f is the binary feature vector calculated for (x i, y i ), and w is the learned feature weight vector. The classifier's features capture various properties of x i and its context, such as its unigram and its containing bigrams.


 56%|█████▌    | 84/150 [16:57<13:38, 12.41s/it]

<CONT> We evaluate our seed-based approach (Section 2) in comparison to the fully-supervised approach implemented by #TARGET_REF<USE/> (Section 3). </CONT> To maintain comparability, we use the ACE-2005 documents with the same split as in (#REF; #REFb; #REF) to 40 test documents and 559 training documents. However, some evaluation settings differ: #REF train a multi-class model for all 33 ACE-2005 event types, and classify all tokens in the test documents into these event types. Our approach, on the other hand, trains an eventindependent binary classifier, while testing on new event types that are different from those utilized for training. We next describe how this setup is addressed in our evaluation.


 57%|█████▋    | 85/150 [17:09<13:19, 12.30s/it]

We evaluate our seed-based approach (Section 2) in comparison to the fully-supervised approach implemented by #REF (Section 3). <CONT> To maintain comparability, we use the ACE-2005 documents with the same split as in (#REF; #REFb; #TARGET_REF<USE/> to 40 test documents and 559 training documents. </CONT> However, some evaluation settings differ: #REF train a multi-class model for all 33 ACE-2005 event types, and classify all tokens in the test documents into these event types. Our approach, on the other hand, trains an eventindependent binary classifier, while testing on new event types that are different from those utilized for training. We next describe how this setup is addressed in our evaluation.


 57%|█████▋    | 86/150 [17:24<14:06, 13.23s/it]

80.6 67.1 73.2 0.04 #REF 73.7 62.3 67.5 - #REF 67.6 53.5 59.7 - Table 2 shows our system's precision, recall and F 1, 7 and the average variance of F 1 within the 10 runs of each test event type. The very low variance indicates that the system's performance does not depend much on the choice of training event types. <CONT> We compare our system's performance to the published trigger classification results of the baseline system of #TARGET_REF<USE/> ) (its globally optimized run, when labeling both triggers and arguments). </CONT> We also compare to the sentence-level system in (#REF) which uses the same dataset split. Our system outperforms the fully-supervised baseline by 5.7% F 1, which is statistically significant (two-tailed Wilcoxon test, p < 0.05).


 58%|█████▊    | 87/150 [17:35<12:56, 12.33s/it]

Representing sentences as numerical vectors while capturing their semantic context is an important and useful intermediate step in natural language processing. Representations that are both general and discriminative can serve as a tool for tackling various NLP tasks. <CONT> While common sentence representation methods are unsupervised in nature, recently, an approach for learning universal sentence representation in a supervised setting was presented in #TARGET_REF<BACKGROUND/>. </CONT> We argue that although promising results were obtained, an improvement can be reached by adding various unsupervised constraints that are motivated by auto-encoders and by language models. We show that by adding such constraints, superior sentence embeddings can be achieved.


 59%|█████▊    | 88/150 [17:47<12:41, 12.28s/it]

FastSent (#REF) learns to predicts a Bag-Of-Word (BOW) representation of adjacent sentences given a BOW representation of some sentence. In (#REF) a Hybrid Gaussian Laplacian density function is fitted to the sentence to derive Fisher Vectors. <CONT> While previous methods train sentence embeddings in an unsupervised manner, a recent work #TARGET_REF<BACKGROUND/> argued that better representations can be achieved via supervised training on a general sentence inference dataset (#REF). </CONT> To this end, the authors use the Stanford Natural Language Inference (SNLI) dataset (#REF) to train different Table 1 : Sentence embedding results. BiLSTM refers to the original BiLSTM followed by MaxPooling implementation of (#REF) which is the baseline for our work.


 59%|█████▉    | 89/150 [17:57<11:56, 11.75s/it]

<CONT> BiLSTM refers to the original BiLSTM followed by MaxPooling implementation of #TARGET_REF<USE/> which is the baseline for our work. </CONT> AE Reg and LM Reg refers to the Auto-Encoder and Language-Model regularization terms described in 2.1 and Combined refers to optimizing with both terms. Bi-AE Reg and Bi-LM Reg refers to the bi-directional Auto-Encoder and bi-directional Language-Model regularization terms described in 2.2. As evident from the results, adding simple unsupervised regularization terms improves the results of the model on almost all the evaluated tasks. sentence embedding methods and compare them on various benchmarks.


 60%|██████    | 90/150 [18:14<13:13, 13.23s/it]

Specifically, we use their BiLSTM model with max pooling. More concretely, given a sequence of T words, {w t } t=1,...,T with given word embedding (#REF; #REF) {v t } t=1,...,T,a bidirectional LSTM computes a set of T vectors {h t } t=1,...,T where each h t is the concatenation of a forward LSTM and a backward LSTM that read the sentences in two opposite directions. We denote { − → h t } and { ← − h t } as the hidden states of the left and right LSTM's respectively, where t = 1,..., T. The final sentence representation is obtained by taking the maximal value of each dimension of the {h t } hidden units (i.e.: max pooling). <CONT> The original model of #TARGET_REF<BACKGROUND/> was trained on the SNLI dataset in a supervised fashion -given pairs of sentences s 1 and s 2, denote their representation bys 1 and s 2. </CONT>


 61%|██████    | 91/150 [18:31<14:12, 14.44s/it]

<CONT> Following #TARGET_REF<USE/> we have tested our approach on a wide array of classification tasks, including sentiment analysis (MR -#REF, SST -#REF ), question-type (TREC -#REF ), product reviews (CR - #REF ), subjectivity/objectivity (SUBJ - #REF ) and opinion polarity (MPQA -#REF ). </CONT> We also tested our approach on semantic textual similarity (STS 14 - #REF ), paraphrase detection (MRPC - #REF ), entailment and semantic relatedness tasks (SICK-R and SICK-E - #REF ), though those tasks are more close in nature to the task of the SNLI dataset which the model was trained on. In our experiments we have set λ from eq. (1) and eq. (2) to be 1 and λ 1, λ 2 from eq. (3) and eq. (4) to be 0.5. All other hyper-parameters and implementation details were left unchanged to provide a fair comparison to the baseline method of (#REF). Our results are summarized in table 1.


 61%|██████▏   | 92/150 [18:44<13:20, 13.80s/it]

Following that, (#REF) proposed a dropout augmented LSTM. We note that there exists a connection between those two problems and try to model it more explicitly. Recently, the incorporation of the hidden states of neural language models in downstream supervised-learning models have been shown to improve the results of the latter (e.g. ElMo -#REF, CoVe -#REF Peters et al. (2017, #REF ) -in this work we jointly train the unsupervised and supervised tasks. <CONT> To this end, we incorporate unsupervised regularization terms motivated by language modeling and auto-encoders in the training framework proposed by #TARGET_REF<USE/>. </CONT> We test our proposed model on a set of NLP tasks and show improved results over the baseline framework of (#REF).


 62%|██████▏   | 93/150 [18:58<13:19, 14.02s/it]

<CONT> Our approach builds upon the previous work of #TARGET_REF<EXTENDS/>. Specifically, we use their BiLSTM model with max pooling. </CONT> More concretely, given a sequence of T words, {w t } t=1,...,T with given word embedding (#REF; #REF) {v t } t=1,...,T,a bidirectional LSTM computes a set of T vectors {h t } t=1,...,T where each h t is the concatenation of a forward LSTM and a backward LSTM that read the sentences in two opposite directions. We denote { − → h t } and { ← − h t } as the hidden states of the left and right LSTM's respectively, where t = 1,..., T. The final sentence representation is obtained by taking the maximal value of each dimension of the {h t } hidden units (i.e.: max pooling).


 63%|██████▎   | 94/150 [19:11<12:38, 13.54s/it]

Following that, (#REF) proposed a dropout augmented LSTM. We note that there exists a connection between those two problems and try to model it more explicitly. Recently, the incorporation of the hidden states of neural language models in downstream supervised-learning models have been shown to improve the results of the latter (e.g. ElMo -#REF, CoVe -#REF Peters et al. (2017, #REF ) -in this work we jointly train the unsupervised and supervised tasks. To this end, we incorporate unsupervised regularization terms motivated by language modeling and auto-encoders in the training framework proposed by (#REF). <CONT> We test our proposed model on a set of NLP tasks and show improved results over the baseline framework of #TARGET_REF<DIFFERENCE/>. </CONT>


 63%|██████▎   | 95/150 [19:22<11:46, 12.85s/it]

<CONT> Leveraging supervision given by some general task aided in obtaining state-of-the-art sentence representations #TARGET_REF<USE/>. </CONT> However, every supervised learning tasks is prone to overfit. In this context, overfitting to the learning task will result in a model which generalizes less well to new tasks. We alleviate this problem by incorporating unsupervised regularization criteria in the model's loss function which are motivated by autoencoders and language models. We note that the added regularization terms do come at the price of increasing the model size by ld parameters (where d and l are the dimensions of the word embedding and the LSTM hidden state, respectively) due to the added linear transformation (see 2.1).


 64%|██████▍   | 96/150 [19:34<11:27, 12.73s/it]

There are many methods for recognising timexes that are widely used in natural language engineering. For English (but not exclusively), in approaches based on supervised learning, sequence labelling methods are often used, especially Conditional Random #REF. <CONT> A review of the methods in the article #TARGET_REF<BACKGROUND/> about the recognition of timexes for English and Spanish has shown a certain shift within the most popular solutions. </CONT> As with the normalisation of timexes, the best results are still achieved with rule-based methods, many new solutions have been introduced in the area of recognition. The best systems listed in [35], called TIP#REF and ClearTK [1], use CRFs for recognition, so initially, we decided to apply the CRF-based approach for this task.


 65%|██████▍   | 97/150 [19:47<11:20, 12.84s/it]

A review of the methods in the article [35] about the recognition of timexes for English and Spanish has shown a certain shift within the most popular solutions. As with the normalisation of timexes, the best results are still achieved with rule-based methods, many new solutions have been introduced in the area of recognition. <CONT> The best systems listed in #TARGET_REF<BACKGROUND/>, called TIP#REF and ClearTK [1], use CRFs for recognition, so initially, we decided to apply the CRF-based approach for this task. </CONT> The results were described in [12, 10]. In recent years, solutions based on deep neural networks, using word representation in the form of word embeddings, created with the use of large linguistic corpus, have begun to dominate in the field of recognition of word expressions.


 65%|██████▌   | 98/150 [19:58<10:25, 12.03s/it]

<CONT> Experiments were carried out by the method proposed in #TARGET_REF<USE/>. </CONT> The first part is described as Task A, the purpose of which is to identify the boundaries of timexes and assign them to one of the following classes: 9 http://nlp.pwr.edu.pl/ 10 https://github.com/CLARIN-PL/PolDeepNer date, time, duration, set. [%]  all  1635  100  train  1227  50  test  408  25   Table 5 : Evaluation data sets (source: KPWr).


 66%|██████▌   | 99/150 [20:04<08:53, 10.45s/it]

<CONT> Then we evaluated these results using more detailed measures for timexes, presented in #TARGET_REF<USE/>. </CONT> The following measures were used to evaluate the quality of boundaries and class recognition, socalled strict match: strict precision (Str.P), strict recall (Str.R) and strict F1-score (Str.F1). A relaxed match (Rel. P, Rel. R, Rel.


 67%|██████▋   | 100/150 [20:13<08:13,  9.87s/it]

F1) evaluation has also been carried out to determine whether there is an overlap between the system entity and gold entity, e.g. [Sunday] and [Sunday morning] [35]. <CONT> If there was an overlap, a relaxed type F1-score (Type.F1) was calculated #TARGET_REF<USE/>. </CONT> The results are presented in Table 9. Table 6 : Evaluation results (precision) for 17 word embeddings models for each TIMEX3 class (date, time, duration and set).


 67%|██████▋   | 101/150 [20:30<09:53, 12.10s/it]

The ability of the model to provide vector representation for the unknown words seems to be the most important. Also, previous models built using KGR10 (EP) are probably less accurate due to an incorrect tokenisation of the corpus. We used WCRFT tagger [29], which utilises #REF to tokenise the input text before the creation of the embeddings model. The comparison of EC1 with previous results obtained using only CRF [9] show the significant improvement across all the tested metrics: 3.6pp increase in strict F1-score, 1.36pp increase in relaxed precision, 5.61pp increase in relaxed recall and 3.51pp increase in relaxed F1-score. <CONT> Table 9 : Evaluation results for all TIMEX3 classes (total) for 9 word embeddings models (3 best models from each embeddings group: EE, EP, EC from Table 8 ) using the following measures from #TARGET_REF<USE/> : strict precision, strict recall, strict F1-score, relaxed precision, relaxed recall, relaxed F1-score, type F1-score. </CONT>


 68%|██████▊   | 102/150 [20:43<09:51, 12.33s/it]

These models are believed to extract features that are robust to cross-domain variations. <CONT> However, while excelling on benchmark domain adaptation tasks such as cross-domain product sentiment classification #TARGET_REF<BACKGROUND/>, the reasons to this success are not entirely understood. </CONT> In the pre-NN era, a prominent approach to domain adaptation in NLP, and particularly in sentiment classification, has been structural correspondence learning (SCL) (#REF (#REF. Following the auxiliary problems approach to semi-supervised learning (#REF), this method identifies correspondences among features from different domains by modeling their correlations with pivot features: features that are frequent in both domains and are important for the NLP task. Non-pivot features from different domains which are correlated with many of the same pivot features are assumed to correspond, providing a bridge between the domains.


 69%|██████▊   | 103/150 [20:57<10:00, 12.78s/it]

However, while excelling on benchmark domain adaptation tasks such as cross-domain product sentiment classification (#REF), the reasons to this success are not entirely understood. <CONT> In the pre-NN era, a prominent approach to domain adaptation in NLP, and particularly in sentiment classification, has been structural correspondence learning (SCL) (#REF #TARGET_REF<BACKGROUND/>. </CONT> Following the auxiliary problems approach to semi-supervised learning (#REF), this method identifies correspondences among features from different domains by modeling their correlations with pivot features: features that are frequent in both domains and are important for the NLP task. Non-pivot features from different domains which are correlated with many of the same pivot features are assumed to correspond, providing a bridge between the domains. Elegant and well motivated as it may be, SCL does not form the state-of-the-art since the neural approaches took over.


 69%|██████▉   | 104/150 [21:09<09:36, 12.54s/it]

Below, we first discuss two prominent ideas in feature representation learning: pivot features and autoencoder neural networks. We then summarize our contribution in light of these approaches. <CONT> Pivot and Non-Pivot Features The definitions of this approach are given in Blitzer et al. (2006 #TARGET_REF<BACKGROUND/>, where SCL is presented in the context of POS tagging and sentiment classification, respectively. </CONT> Fundamentally, the method divides the shared feature space of both the source and the target domains to the set of pivot features that are frequent in both domains and are prominent in the NLP task, and a complementary set of non-pivot features. In this section we abstract away from the actual feature space and its division to pivot and non-pivot subsets.


 70%|███████   | 105/150 [21:21<09:15, 12.34s/it]

<CONT> An important observation of #TARGET_REF<BACKGROUND/>, is that some pivot features are similar to each other to the level that they indicate the same information with respect to the classification task. </CONT> For example, in sentiment classification with word unigram features, the words (unigrams) great and excellent are likely to serve as pivot features, as the meaning of each of them is preserved across domains. At the same time, both features convey very similar (positive) sentiment information to the level that a sentiment classifier should treat them as equals. The AE-SCL-SR model is based on two crucial observations. First, in many NLP tasks the pivot features can be pre-embeded into a vector space where pivots with similar meaning have similar vectors.


 71%|███████   | 106/150 [21:33<09:04, 12.38s/it]

We experiment with a 5-fold cross-validation on the source domain (#REF) : 1600 reviews for training and 400 reviews for development. <CONT> The test set for each target domain of #TARGET_REF<USE/> consists of all 2000 labeled reviews of that domain, and for the Blog domain it consists of the 7086 labeled sentences provided with the task dataset. </CONT> In all five folds half of the training examples and half of the development examples are randomly selected from the positive reviews and the other halves from the negative reviews. We report average results across these five folds, employing the same folds for all models. Hyper-parameter Tuning The details of the hyper-parameter tuning process for all models (including data splits to training, development and test sets) are described in the appendices.


 71%|███████▏  | 107/150 [21:50<09:43, 13.57s/it]

Under this model the probability of the i-th pivot feature to appear in an example is a (non-linear) function of the dot product of the feature's embedding vector and the network's hidden layer vector. As explained in Section 3, this approach encourages the model to learn similar hidden layers for documents that have different pivot features as long as these features have similar meaning. In sentiment classification, for example, although one positive review may use the unigram pivot feature excellent while another positive review uses the pivot great, as long as the embeddings of pivot features with similar meaning are similar (as expected from high quality embeddings) the hidden layers learned for both documents are biased to be similar. <CONT> We experiment with the task of cross-domain product sentiment classification of #TARGET_REF<USE/>, consisting of 4 domains (12 domain pairs) and further add an additional target domain, consisting of sentences extracted from social media blogs

 72%|███████▏  | 108/150 [22:09<10:39, 15.22s/it]

We denote the feature set in our problem with f, the subset of pivot features with f p ⊆ {1,..., |f |} and the subset of non-pivot features with f np ⊆ {1,..., |f |} such that f p ∪ f np = {1,..., |f |} and f p ∩ f np = ∅. We further denote the feature representation of an input example X with x. Following this notation, the vector of pivot features of X is denoted with x p while the vector of non-pivot features is denoted with x np. In order to learn a robust and compact feature representation for X we will aim to learn a nonlinear prediction function from x np to x p. As discussed in Section 4 the task we experiment with is cross-domain sentiment classification. <CONT> Following previous work (e.g. (#REF #TARGET_REF<USE/> #REF) our feature representation consists of binary indicators for the occurrence of word unigrams and bigrams in the represented document. </CONT> In what follows we hence assume that the feature representation x of an example X is a binary vector, and hence so are

 73%|███████▎  | 109/150 [22:21<09:45, 14.28s/it]

In this section we describe our experiments. To facilitate clarity, some details are not given here and instead are provided in the appendices. <CONT> Cross-domain Sentiment Classification To demonstrate the power of our models for domain adaptation we experiment with the task of crossdomain sentiment classification #TARGET_REF<USE/>. </CONT> The data for this task consist of Amazon product reviews from four product domains: Books (B), DVDs (D), Electronic items (E) and Kitchen appliances (K). For each domain 2000 labeled reviews are provided: 1000 are classified as positive and 1000 as negative, and these are augmented with unlabeled reviews: 6000 (B), 34741 (D), 13153 (E) and 16785 (K).


 73%|███████▎  | 110/150 [22:32<08:52, 13.32s/it]

Baselines Cross-domain sentiment classification has been studied in a large number of papers. However, the difference in preprocessing methods, dataset splits to train/dev/test subsets and the different sentiment classifiers make it hard to directly compare between the numbers reported in past. We hence compare our models to three strong baselines, running all models under the same conditions. We aim to select baselines that represent the state-of-the-art in cross-domain sentiment classification in general, and in the two lines of work we focus at: pivot based and autoencoder based representation learning, in particular. <CONT> The first baseline is SCL with pivot features selected using the mutual information criterion (SCL-MI, #TARGET_REF<USE/> ). </CONT>


 74%|███████▍  | 111/150 [22:44<08:26, 13.00s/it]

As in the other models, MSDA-DAN utilizes source domain labeled data as well as unlabeled data from both the source and the target domains at training time. <CONT> We experiment with a 5-fold cross-validation on the source domain #TARGET_REF<USE/> : 1600 reviews for training and 400 reviews for development. </CONT> The test set for each target domain of #REF consists of all 2000 labeled reviews of that domain, and for the Blog domain it consists of the 7086 labeled sentences provided with the task dataset. In all five folds half of the training examples and half of the development examples are randomly selected from the positive reviews and the other halves from the negative reviews. We report average results across these five folds, employing the same folds for all models.


 75%|███████▍  | 112/150 [23:05<09:47, 15.45s/it]

The number of pivots was chosen among {100, 200,..., 500} and the dimensionality of h among {100, 300, 500}. For the features induced by these models we take their w h x np vector. For AE-SCL-SR, embeddings for the unigram and bigram features were learned with word2vec (#REF). Details about the software and the way we learn bigram representations are in the appendices. <CONT> Baselines: For SCL-MI, following #TARGET_REF<USE/> we tuned the number of pivot features (#REF; #REF) between 500 and 1000 and the SVD dimensions among 50,100 and 150. </CONT> For MSDA we tuned the number of reconstructed features among {500, 1000, 2000, 5000, 10000}, the number of model layers among {1, 3, 5} and the corruption probability among {0.1, 0.2,..., 0.5}. For MSDA-DAN, we followed #REF : the λ adaptation parameter is chosen among 9 values between 10 −2 and 1 on a logarithmic scale, the hidden layer size l is chosen among {50, 100, 200} and the learning rate µ is 10 −3.


 75%|███████▌  | 113/150 [23:15<08:26, 13.69s/it]

Variants of the Product Review Data <CONT> There are two releases of the datasets of the #TARGET_REF<BACKGROUND/> cross-domain product review task. </CONT> We use the one from http://www.cs.jhu. edu/˜mdredze/datasets/sentiment/ index2.html where the data is imbalanced, consisting of more positive than negative reviews. We believe that our setup is more realistic as when collecting unlabeled data, it is hard to get a balanced set. Note that #REF used the other release where the unlabeled data consists of the same number of positive and negative reviews.


 76%|███████▌  | 114/150 [23:25<07:31, 12.53s/it]

We use the one from http://www.cs.jhu. edu/˜mdredze/datasets/sentiment/ index2.html where the data is imbalanced, consisting of more positive than negative reviews. We believe that our setup is more realistic as when collecting unlabeled data, it is hard to get a balanced set. <CONT> Note that #TARGET_REF<DIFFERENCE/> used the other release where the unlabeled data consists of the same number of positive and negative reviews. </CONT> Test Set Size While #REF used only 400 target domain reviews for test, we use the entire set of 2000 reviews.


 77%|███████▋  | 115/150 [23:35<06:51, 11.76s/it]

edu/˜mdredze/datasets/sentiment/ index2.html where the data is imbalanced, consisting of more positive than negative reviews. We believe that our setup is more realistic as when collecting unlabeled data, it is hard to get a balanced set. Note that #REF used the other release where the unlabeled data consists of the same number of positive and negative reviews. <CONT> Test Set Size While #TARGET_REF<DIFFERENCE/> used only 400 target domain reviews for test, we use the entire set of 2000 reviews. </CONT> We believe that this decision yields more robust and statistically significant results.


 77%|███████▋  | 116/150 [23:50<07:22, 13.00s/it]

They either use all the NPs as candidate mentions (Björkelund and #REF; #REF; #REF) or use the rule-based mention detector from the Stanford deterministic system (#REF) to extract mentions from NPs, named entity mentions and pronouns (#REF; #REFb). There are only very few studies that attempt to apply neural network approaches to the MD task. <CONT> Lee et al. (2017; #TARGET_REF<BACKGROUND/> first introduced a neural mention detector as a part of their end-to-end coreference system; however, the system does not output intermediate mentions, hence the mention detector cannot be used by other coreference systems directly. </CONT> To the best of our knowledge, #REF introduced the only standalone neural mention detector. By using a modified version of the NER system of #REF, they showed substantial performance gains at mention detection on the benchmark CONLL 2012 data set and on the CRAC 2018 data set when compared with the Stanford deterministic system (#REF).


 78%|███████▊  | 117/150 [24:02<06:59, 12.71s/it]

This move proved very effective; however, as a result the mention detection part of their system needs to be trained jointly with the coreference resolution part, hence can not be used separately. <CONT> The system has been later extended by #REF and #TARGET_REF<BACKGROUND/>. </CONT> #REF added biaffine attention to the coreference part of the #REF system, improving the system by 0.6%. Biaffine attention is also used in one of our approaches (BIAFFINE MD), but in a totally different manner, i.e. we use biaffine attention for mention detection while in #REF biaffine attention was used for computing mention-pair scores. The system is the current state-of-the-art coreference system.


 79%|███████▊  | 118/150 [24:15<06:43, 12.62s/it]

After training the system with the new setting, we get an average F1 of 72.6% (see table 4), which narrows the performance gap between the end-to-end system and the model trained without the joint learning. This confirms our first hypothesis that by downgrading the system to a pipeline setting does harm the overall performance of the coreference resolution. For our second experiment, we used the #REF instead. <CONT> The #TARGET_REF<BACKGROUND/> system is an extended version of the #REF system, hence they share most of the network architecture. </CONT> The #REF has a lower performance on mention detection (93.5% recall when λ = 0.4), which creates a large (4%) difference when compared with the recall of our BIAFFINE MD.


 79%|███████▉  | 119/150 [24:28<06:40, 12.91s/it]

The third system takes the outputs from BERT (#REF) and feeds them into a feed-forward neural network to classify candidates into mentions and non mentions. We evaluate these three models on both the CONLL and the CRAC data sets, with the following results. Firstly, we show that better mention performance of up to 1.5 percentage points 1 can be achieved by training the mention detector alone. Secondly, our best system achieves improvements of 5.3 and 6.5 percentage points when compared with #REF's neural MD system on CONLL and CRAC respectively. <CONT> Thirdly, by using better mentions from our mention detector, we can improve the end-to-end #TARGET_REF<EXTENDS/> system and the Clark and Manning (2016a) pipeline system by up to 0.7% and 1.7% respectively. </CONT>


 80%|████████  | 120/150 [24:39<06:06, 12.21s/it]

For the mention detection evaluation we use the #REF system as baseline. The baseline is trained end-toend on the coreference task and we use as baseline the mentions predicted by the system before carrying out coreference resolution. For the coreference evaluation we use the state-of-the-art #REF system as our baseline for the end-to-end system, and the Clark and Manning (2016a) system as our baseline for the pipeline system. <CONT> During the evaluation, we slightly modified the #TARGET_REF<EXTENDS/> system to allow the system to take the mentions predicted by our model instead of its internal mention detector. </CONT> Other than that we keep the system unchanged.


 81%|████████  | 121/150 [24:52<06:01, 12.45s/it]

(26.5% error reduction). Evaluation on the CRAC data set 3 For the CRAC data set, we train the #REF system end-to-end on the reduced corpus with singleton mentions removed and extract mentions from the system by set λ = 0.4. We then train our models with the same λ but on the full corpus, since our mention detectors naturally support both mention 3 As the #TARGET_REF<USE/> system does not predict singleton mentions, the results on CRAC data set in Table 2 are evaluated without singleton mentions. While the results reported in Table 3 are evaluated with singleton mentions included. 88.0 89.7 89.1 Table 3 : Comparison between our BIAFFINE MD and the top performing systems on the mention detection task using the CONLL and CRAC data sets.


 81%|████████▏ | 122/150 [25:01<05:16, 11.30s/it]

We then integrate the mentions predicted by our best system into the coreference resolution system to evaluate the effects of our better mention detectors on the downstream application. Evaluation with the end-to-end system. <CONT> We first evaluate our BIAFFINE MD in combination with the end-to-end #TARGET_REF<USE/> system. </CONT> We slightly modified the system to feed the system mentions predicted by our mention detector. As a result, the original mention selection function is switched off, we keep all the other settings (include the mention scoring function) unchanged.


 82%|████████▏ | 123/150 [25:14<05:22, 11.93s/it]

Three types of one-mode projections capture relations between sentences, P U, P W and P Acc. P U creates an edge between two sentences if they share at least one entity. P W captures the intuition that the connection between two sentences is stronger the more entities they share by means of weighted edges, where the weights equal the number of entities shared by sentences (#REF). The third type of projection, P Acc, integrates syntactic information in the edge weights calculated by the following formula: <CONT> While the entity grid #TARGET_REF<DIFFERENCE/> uses information about sentences which do not share entities by means of the "--" transition, the entity graph cannot employ this negative information. </CONT> Here, we propose a normalization for the entity graph and its corresponding one-mode projections which is based on the relative importance of entities and, in turn, the relative importance of sentences.


 83%|████████▎ | 124/150 [25:22<04:39, 10.74s/it]

Experimental setup and data follow #REF (61 documents from the English test part of the CoNLL 2012 shared task (#REF) ). For discrimination we use 20 permutations of each text. Table 1 shows the results. <CONT> Results for #REF, G&S, are reproduced, results for #TARGET_REF<USE/>, B&L, and #REF, E&C, were reproduced by #REF. </CONT> The unweighted graph, P U, does not need normalization.


 83%|████████▎ | 125/150 [25:32<04:24, 10.56s/it]

<CONT> In experiments, #TARGET_REF<USE/> assume that articles taken from Encyclopedia Britannica are more difficult to read (less coherent) than the corresponding articles from Encyclopedia Britannica Elementary, its version for children. We follow them with regard to data (107 article pairs), experimental setup and evaluation. </CONT> Sentences in the Britannica Elementary are simpler and shorter than in the Encyclopedia Britannica. The entity graph does not take into account the effect of entities not shared between sentences while the normalized entity graph assigns a lower weight if there are more of these entities. Hence, Britannica Elementary receives a higher cohesion score than Encyclopedia Britannica in our model.


 84%|████████▍ | 126/150 [25:41<04:03, 10.13s/it]

<CONT> We follow #TARGET_REF<USE/> for evaluating whether the normalized entity graph can decide whether automatic or human summaries are more coherent (80 pairs of summaries extracted from DUC 2003). </CONT> Human coherence scores are associated with each pair of summarized documents (#REF). Table 3 displays reported results of B&L and reproduced results of the entity graph and our normalized entity graph. Normalizing significantly improves the results for P W and P Acc. P U is still slightly better than both, but in contrast to the entity graph, this difference is not statistically significant.


 85%|████████▍ | 127/150 [25:50<03:45,  9.81s/it]

We follow #REF for evaluating whether the normalized entity graph can decide whether automatic or human summaries are more coherent (80 pairs of summaries extracted from DUC 2003). <CONT> Human coherence scores are associated with each pair of summarized documents #TARGET_REF<USE/>. </CONT> Table 3 displays reported results of B&L and reproduced results of the entity graph and our normalized entity graph. Normalizing significantly improves the results for P W and P Acc. P U is still slightly better than both, but in contrast to the entity graph, this difference is not statistically significant.


 85%|████████▌ | 128/150 [25:56<03:10,  8.67s/it]

We proposed a normalization method for the entity graph (#REF). <CONT> We compared our model to the entity graph and to the entity grid #TARGET_REF<USE/> and showed that normalization improves the results significantly in most tasks. </CONT> Future work will include adding more linguistic information, stronger weighting schemes and application to other readability datasets (#REF; De #REF).


 86%|████████▌ | 129/150 [26:04<02:54,  8.31s/it]

We aim to detect mentions of concepts without performing co-reference resolution or clustering mentions. <CONT> Therefore, our setting resembles the established task of entity recognition (#REF; #TARGET_REF<DIFFERENCE/>), with the difference being that we focus on un-named entities. </CONT> Contribution. One of the factors impeding progress in common sense information extraction is the lack of training data. It is relatively easy to obtain labeled data for named entities such as companies and people.


 87%|████████▋ | 130/150 [26:18<03:18,  9.94s/it]

We treat sense recognition in text as sequence prediction problem, we would like to estimate: P (y i |x i−k,..., x i+k ; y i−l,..., y i−1 ). where x refers Figure 4: Our neural network architecture for the task of recognizing concepts that are discernible by sensesss. to words, and y refers to BIO labels. <CONT> Conditional Random Fields (CRFs) (#REF ) have been widely used named entity recognition #TARGET_REF<BACKGROUND/>; #REF), a task similar to our own. </CONT> While the CRF models performed reasonably well on our task, we sought to obtain improvements by proposing and training variations of Long Short Memory (LSTM) recurrent neural networks (#REF). We found our variations of LSTM sequence classifiers to do better than the CRF model, and also better than standard LSTMs.


 87%|████████▋ | 131/150 [26:30<03:22, 10.63s/it]

Our task is related to entity recognition however in this paper we focused on novel types of entities, which can be used to improve extraction of common sense knowledge. <CONT> Entity recognition systems are traditionally based on a sequential model, for example a CRF, and involve feature engineering (#REF; #TARGET_REF<BACKGROUND/>. </CONT> More recently, neural approaches have been used for named entity recognition (#REF; #REF; dos Santos and Guimarães, 2015; #REF; #REF). Like other neural approaches, our approach does not require feature engineering, the only features we use are word and character embeddings. Related to our proposed recurrence in the output layer is the work of (#REF) which introduced a CRF on top of LSTM for the task of named entity recognition.


 88%|████████▊ | 132/150 [26:38<02:57,  9.86s/it]

We would like to detect mentions of concepts discernible by sense. In this paper, we focus on mentions of audible (sound) and olfactible (smell) concepts. <CONT> We treat sense recognition in text as a sequence labeling task where each sentence is a sequence of tokens labeled using the BIO tagging scheme #TARGET_REF<USE/>. </CONT> The BIO labels denote tokens at the beginning, inside, and outside of a relevant mention, respectively. Example BIO tagged sentences are shown in Figure 1.


 89%|████████▊ | 133/150 [26:51<03:04, 10.84s/it]

With these α values, the combination approach produced 1,962 and 1,702 training instances for audible and olfactible concepts respectively Performance of the various models is shown in Table 4. The abbreviations denote the following: LSTM refers to a vanilla LSTM model, using only word embeddings as features, + OR refers to the LSTM plus the output recurrence, + CHAR refers to the LSTM plus the character embeddings as features. + OR + CHAR refers to the LSTM plus the output recurrence and character embeddings as features. <CONT> For the CRF, we use the commonly used features for named entity recognition: words, prefix/suffices, and part-of-speech tag #TARGET_REF<USE/>. </CONT> We can see that for both senses, the model that uses both character embedding features, and an output recurrence layer yields the best F1 score.


 89%|████████▉ | 134/150 [27:03<02:57, 11.11s/it]

While these are important factors to consider, a sentence can be true or false regardless of whether it is a rumour (#REF). Existing fact checking systems are capable of detecting fact-check-worthy claims in text (#REFb), returning semantically similar textual claims (#REF) ; and scoring the truth of triples on a knowledge graph through semantic distance (#REF). However, neither of these are suitable for fact checking a claim made in natural language against a database. <CONT> Previous works appropriate for this task operate on a limited domain and are not able to incorporate temporal information when checking time-dependent claims #TARGET_REF<BACKGROUND/>. </CONT> In this paper we introduce our fact checking tool, describe its architecture and design decisions, evaluate its accuracy and discuss future work.


 90%|█████████ | 135/150 [27:12<02:37, 10.52s/it]

Previous works appropriate for this task operate on a limited domain and are not able to incorporate temporal information when checking time-dependent claims (#REF). In this paper we introduce our fact checking tool, describe its architecture and design decisions, evaluate its accuracy and discuss future work. We highlight the ease of incorporating new information sources to fact check, which may be unavailable during training. <CONT> To validate the extensibility of the system, we complete an additional evaluation of the system using claims taken from #TARGET_REF<USE/>. </CONT> We make the source code publicly available to the community.


 91%|█████████ | 136/150 [27:26<02:41, 11.56s/it]

<CONT> We further validate the system by evaluating the ability of this fact checking system to make veracity assessments on simple numerical claims from the data set collected by #TARGET_REF<USE/>. </CONT> Of the 4,255 claims about numerical properties about countries and geographical areas in this data set, our KB contained information to fact check 3,418. The system presented recalled KB entries for 3,045 claims (89.1%). We observed that the system was consistently unable to fact check two properties (undernourishment and renewable freshwater per capita). Analysis of these failure cases revealed too great a lexical difference between the test claims and the training data our system generated; the claims in the test cases were comparative in nature (e. g. country X has higher rate of undernourishment than country Y) whereas the training data generated using the method described in Section 3.2 are absolute claims.


 91%|█████████▏| 137/150 [27:38<02:30, 11.61s/it]

The core capability of the system demonstration we presented is to fact check natural language claims against relations stored in a KB. <CONT> Although the range of claims is limited, the system is a fieldtested prototype and has been evaluated on a published data set #TARGET_REF<BACKGROUND/> and on real-world claims presented as part of the HeroX fact checking challenge. </CONT> In future work, we will extend the semantic parsing technique used and apply our system to more complex claim types. Additionally, further work is required to reduce the number of candidate relations recalled from the KB. While this was not an issue in our case, we believe that ameliorating this issue will enhance the ability of the system to assign a correct truth label where there exist properties with similar numerical values.


 92%|█████████▏| 138/150 [27:52<02:31, 12.59s/it]

We developed our fact-checking approach in the context of the HeroX challenge 2 -a competition organised by the fact checking organization FullFact 3. The types of claims the system presented can fact check was restricted to those which require looking up a value in a KB, similar to the one in Figure 1. <CONT> To learn a model to perform the KB look up (essentially a semantic parsing task), we extend the work of #TARGET_REF<EXTENDS/> who used distant supervision (#REF ) to generate training data, obviating the need for manual labeling. </CONT> In particular, we extend it to handle simple temporal expressions in order to fact check time-dependent claims appropriately, i. e. population in 2015. While the recently proposed semantic parser of #REF is also able to handle temporal expressions, it makes the assumption that the table against which the claim needs to be interpreted is known, which is unrealistic in the context of fact checking.


 93%|█████████▎| 139/150 [28:06<02:20, 12.80s/it]

Part-of-speech (POS) tagging has received a great deal of attention as it is a critical component of most natural language processing systems. As supervised POS tagging accuracies for English (measured on the Wall Street Journal portion of the PennTreebank (#REF) ) have converged to around 97.3% (#REF; #REF), the attention has shifted to unsupervised approaches (#REF). <CONT> In particular, there has been growing interest in both multilingual POS induction ) and cross-lingual POS induction via treebank projection (#REF; #REF; #TARGET_REF<BACKGROUND/>. </CONT> Underlying these studies is the idea that a set of (coarse) syntactic POS categories exist in similar forms across languages. These categories are often called universals to represent their cross-lingual nature (#REF; #REF).


 93%|█████████▎| 140/150 [28:17<02:04, 12.40s/it]

Underlying these studies is the idea that a set of (coarse) syntactic POS categories exist in similar forms across languages. These categories are often called universals to represent their cross-lingual nature (#REF; #REF). For example, used the Multext-East (#REF) corpus to evaluate their multi-lingual POS induction system, because it uses the same tagset for multiple languages. When corpora with common tagsets are unavailable, a standard approach is to manually define a mapping from language and treebank specific fine-grained tagsets to a predefined universal set. <CONT> This was the approach taken by #TARGET_REF<BACKGROUND/> to evaluate their cross-lingual POS projection system for six different languages. </CONT>


 94%|█████████▍| 141/150 [28:30<01:52, 12.49s/it]

Finally, it also permits language technology practitioners to train POS taggers with common tagsets across multiple languages. This in turn facilitates downstream application development as there is no need to maintain language specific rules due to differences in treebank annotation guidelines. In this paper, we specifically highlight two use cases of this resource. First, using our universal tagset and mapping, we run an experiment comparing POS tag accuracies for 25 different treebanks to evaluate POS tagging accuracy on a single tagset. <CONT> Second, we combine the cross-lingual projection part-of-speech taggers of #TARGET_REF<USE/> with the grammar induction system of #REF -which requires a universal tagset -to produce a completely unsupervised grammar induction system for multiple languages, that does not require gold POS tags in the target language. </CONT>


 95%|█████████▍| 142/150 [28:39<01:32, 11.50s/it]

In our experiments, we did not make use of refined categories, as the POS tags induced by #REF were all coarse. <CONT> We present results on the same eight IndoEuropean languages as #TARGET_REF<USE/>, so that we can make use of their automatically projected POS tags. </CONT> For all languages, we used the treebanks released as a part of the CoNLL-X (#REF) shared task. We only considered sentences of length 10 or less, after the removal of punctuations. We performed Bayesian inference on the whole treebank and report dependency attachment accuracy.


 95%|█████████▌| 143/150 [28:49<01:16, 10.90s/it]

However, since this rule is reversed for other languages, we omit it in our tagset. Additionally, they also used refined categories in the form of CoNLL treebank tags. <CONT> In our experiments, we did not make use of refined categories, as the POS tags induced by #TARGET_REF<USE/> were all coarse. </CONT> We present results on the same eight IndoEuropean languages as #REF, so that we can make use of their automatically projected POS tags. For all languages, we used the treebanks released as a part of the CoNLL-X (#REF) shared task.


 96%|█████████▌| 144/150 [28:57<01:00, 10.08s/it]

Optimizers update the model parameters based on the gradients. We provide wrappers around most PyTorch optimizers and an implementation of Adafactor (#REF), which is a memory-efficient variant of Adam. Learning Rate Schedulers update the learning rate over the course of training. <CONT> We provide several popular schedulers, e.g., the inverse square-root scheduler from #TARGET_REF<USE/> and cyclical schedulers based on warm restarts (#REF). </CONT> Reproducibility and forward compatibility.


 97%|█████████▋| 145/150 [29:10<00:55, 11.15s/it]

We provide reference implementations of several popular sequence-to-sequence models which can be used for machine translation, including LSTM (#REF), convolutional models (#REF; #REF) and Transformer (#REF). We evaluate a "big" Transformer encoderdecoder model on two language pairs, WMT English to German (En-De) and WMT English to French (En-Fr). <CONT> For En-De we replicate the setup of #TARGET_REF<USE/> which relies on WMT'16 for training with 4.5M sentence pairs, we validate on newstest13 and test on newstest14. </CONT> The 32K vocabulary is based on a joint source and target byte pair encoding (BPE; #REF ). For En-Fr, we train on WMT'14 and borrow the setup of #REF with 36M training sentence pairs.


 97%|█████████▋| 146/150 [29:19<00:41, 10.50s/it]

The 40K vocabulary is based on a joint source and target BPE. We measure case-sensitive tokenized BLEU with multi-bleu (#REF) and detokenized BLEU with SacreBLEU 1 (#REF). <CONT> All results use beam search with a beam width of 4 and length penalty of 0.6, following #TARGET_REF<USE/>. </CONT> FAIRSEQ results are summarized in Table 2. We reported improved BLEU scores over #REF by training with a bigger batch size and an increased learning rate.


 98%|█████████▊| 147/150 [29:30<00:31, 10.48s/it]

After the FP16 gradients are synchronized between workers, we convert them to FP32, restore the original scale, and update the weights. Inference. <CONT> FAIRSEQ provides fast inference for non-recurrent models (#REF; #TARGET_REF<BACKGROUND/>; #REFb; #REF) through incremental decoding, where the model states of previously generated tokens are cached in each active beam and re-used. </CONT> This can speed up a naïve implementation without caching by up to an order of magnitude, since only new states are computed for each token. For some models, this requires a component-specific caching implementation, e.g., multi-head attention in the Transformer architecture.


 99%|█████████▊| 148/150 [29:43<00:22, 11.41s/it]

<CONT> We provide reference implementations of several popular sequence-to-sequence models which can be used for machine translation, including LSTM (#REF), convolutional models (#REF; #REF) and Transformer #TARGET_REF<BACKGROUND/>. </CONT> We evaluate a "big" Transformer encoderdecoder model on two language pairs, WMT English to German (En-De) and WMT English to French (En-Fr). For En-De we replicate the setup of #REF which relies on WMT'16 for training with 4.5M sentence pairs, we validate on newstest13 and test on newstest14. The 32K vocabulary is based on a joint source and target byte pair encoding (BPE; #REF ). For En-Fr, we train on WMT'14 and borrow the setup of #REF with 36M training sentence pairs.


 99%|█████████▉| 149/150 [29:54<00:11, 11.04s/it]

<CONT> FAIRSEQ supports language modeling with gated convolutional models and Transformer models #TARGET_REF<BACKGROUND/>. </CONT> Models can be trained using a variety of input and output representations, such as standard token embeddings, convolutional character embeddings (Kim 1 En-De En-Fr a. #REF 25.2 40.5 b. #REF 28.4 41.0 c. #REF 28.9 41.4 d. #REF 29 et al., 2016), adaptive softmax (#REF), and adaptive inputs. We also provide tutorials and pre-trained models that replicate the results of and


100%|██████████| 150/150 [30:04<00:00, 12.03s/it]

FAIRSEQ supports language modeling with gated convolutional models and Transformer models (#REF). <CONT> Models can be trained using a variety of input and output representations, such as standard token embeddings, convolutional character embeddings (Kim 1 En-De En-Fr a. #REF 25.2 40.5 b. #TARGET_REF<BACKGROUND/> 28.4 41.0 c. #REF 28.9 41.4 d. #REF 29 et al., 2016), adaptive softmax (#REF), and adaptive inputs. </CONT> We also provide tutorials and pre-trained models that replicate the results of and
Structure failed at function: is_similar_to_input, {'pattern': '<(?:BACKGROUND|USE|DIFFERENCE|SIMILARITY|MOTIVATION|EXTENDS|FUTURE)/>|</?CONT>'}
{'opening': [], 'closing': [], 'self_closing': ['<USE/>']}
Since DROP does not indicate the answer type but only provides the answer string, we therefore adopt the weakly supervised annotation scheme, as suggested in #REF; #TARGET_REF<USE/>. We find all possible annotations that point to the gold answer, including matching spans, arithmetic expres




Structure failed at function: is_similar_to_input, {'pattern': '<(?:BACKGROUND|USE|DIFFERENCE|SIMILARITY|MOTIVATION|EXTENDS|FUTURE)/>|</?CONT>'}
Structure failed at function: is_similar_to_input, {'pattern': '<(?:BACKGROUND|USE|DIFFERENCE|SIMILARITY|MOTIVATION|EXTENDS|FUTURE)/>|</?CONT>'}
{'opening': [], 'closing': [], 'self_closing': ['<USE/>']}
(26.5% error reduction). Evaluation on the CRAC data set 3 For the CRAC data set, we train the #REF system end-to-end on the reduced corpus with singleton mentions removed and extract mentions from the system by set λ = 0.4. We then train our models with the same λ but on the full corpus, since our mention detectors naturally support both mention 3 As the #TARGET_REF<USE/> system does not predict singleton mentions, the results on CRAC data set in Table 2 are evaluated without singleton mentions. While the results reported in Table 3 are evaluated with singleton mentions included. 88.0 89.7 89.1 Table 3 : Comparison between our BIAFFINE MD and