Merge pull request EleutherAI#140 from cfoster0/master

Implement SQuADv2 evaluation

leogao2 committed Mar 28, 2021
2 parents caba51e + 4b133dc commit f984c88

Showing 4 changed files with 73 additions and 20 deletions.
3 changes: 1 addition & 2 deletions lm_eval/evaluator.py
@@ -48,7 +48,6 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):

reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]

for i, req in enumerate(reqs):
requests[req.type].append(req)
# i: index in requests for a single task instance
@@ -90,4 +89,4 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)

return results
return results
2 changes: 1 addition & 1 deletion lm_eval/tasks/__init__.py
@@ -109,7 +109,7 @@
"hellaswag": hellaswag.HellaSwag, # not implemented yet
"openbookqa": openbookqa.OpenBookQA,
# "sat": sat.SATAnalogies, # not implemented yet
# "squad": squad.SQuAD, # not implemented yet
"squad2": squad.SQuAD2,
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
86 changes: 70 additions & 16 deletions lm_eval/tasks/squad.py
@@ -1,7 +1,23 @@
import datasets
from math import exp
from lm_eval.base import rf
from lm_eval.metrics import f1_score, mean
from . common import HFTask
from functools import partial


class SQuAD(HFTask):
def _squad_metric(predictions, references):
squad_metric = datasets.load_metric("squad_v2")
return squad_metric.compute(predictions=predictions, references=references)


def _squad_agg(key, items):
predictions, references = zip(*items)

return _squad_metric(predictions=predictions, references=references)[key]


class SQuAD2(HFTask):
DATASET_PATH = "squad_v2"
DATASET_NAME = None

@@ -15,16 +31,14 @@ def has_test_docs(self):
return False

def training_docs(self):
if self.has_training_docs():
return self.data["train"]
return self.data["train"]

def validation_docs(self):
if self.has_validation_docs():
return self.data["validation"]
return self.data["validation"]

def fewshot_description(self):
# TODO: redo description
return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."
# TODO: figure out description
return ""

def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
@@ -35,7 +49,7 @@ def doc_to_target(self, doc):
answer = answer_list[0]
else:
answer = 'unanswerable'
return answer
return " " + answer

def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
@@ -48,8 +62,9 @@ def construct_requests(self, doc, ctx):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
continuation = rf.greedy_until(ctx, ['\n'])
is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
return continuation, is_unanswerable

def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
@@ -61,23 +76,62 @@ def process_results(self, doc, results):
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
continuation, (logprob_unanswerable, _) = results

no_answer_probability = exp(logprob_unanswerable)

predictions = {
'id': doc['id'],
'prediction_text': continuation,
'no_answer_probability': no_answer_probability,
}

references = {
'id': doc['id'],
'answers': doc['answers'],
}

return {
'exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'HasAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'NoAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'best_exact': (predictions, references), # Best exact match (with varying threshold)
'best_f1': (predictions, references), # Best F1 (with varying threshold)
}

def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
'exact': partial(_squad_agg, 'exact'), # Exact match (the normalized answer exactly match the gold answer)
'f1': partial(_squad_agg, 'f1'), # The F-score of predicted tokens versus the gold answer
'HasAns_exact': partial(_squad_agg, 'HasAns_exact'), # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': partial(_squad_agg, 'HasAns_f1'), # The F-score of predicted tokens versus the gold answer
'NoAns_exact': partial(_squad_agg, 'NoAns_exact'), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': partial(_squad_agg, 'NoAns_f1'), # The F-score of predicted tokens versus the gold answer
'best_exact': partial(_squad_agg, 'best_exact'), # Best exact match (with varying threshold)
'best_f1': partial(_squad_agg, 'best_f1'), # Best F1 (with varying threshold)
}

def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
'exact': True, # Exact match (the normalized answer exactly match the gold answer)
'f1': True, # The F-score of predicted tokens versus the gold answer
'HasAns_exact': True, # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': True, # The F-score of predicted tokens versus the gold answer
'NoAns_exact': True, # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': True, # The F-score of predicted tokens versus the gold answer
'best_exact': True, # Best exact match (with varying threshold)
'best_f1': True, # Best F1 (with varying threshold)
}
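
For reference, the aggregation above defers entirely to the Hugging Face `datasets` implementation of the SQuAD v2 metric. Below is a minimal standalone sketch (not part of this commit; the id, answer text, and offset are made-up placeholders) of the input shape that `process_results` builds per document and that `_squad_agg` feeds to `squad_metric.compute` once the per-document pairs are zipped together:

# Illustrative sketch only -- mirrors the prediction/reference dicts built in process_results above.
import datasets

squad_metric = datasets.load_metric("squad_v2")

predictions = [{
    'id': 'example-0',                      # placeholder id; must match the reference id
    'prediction_text': 'Normandy',          # the greedy continuation returned by the LM
    'no_answer_probability': 0.02,          # exp(loglikelihood of " unanswerable")
}]
references = [{
    'id': 'example-0',
    'answers': {'text': ['Normandy'], 'answer_start': [159]},  # placeholder gold answer
}]

scores = squad_metric.compute(predictions=predictions, references=references)
print(scores['exact'], scores['f1'])        # the same keys _squad_agg selects from
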
2 changes: 1 addition & 1 deletion tests/test_evaluator.py
@@ -29,4 +29,4 @@ def ll_fn(reqs):


lm.loglikelihood = ll_fn
evaluator.evaluate(lm, task_dict, False, 0, 10)
evaluator.evaluate(lm, task_dict, False, 0, 10)
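
As a sanity check outside the unit test, the new task can be exercised end-to-end in the same style as tests/test_evaluator.py. The snippet below is a hypothetical sketch, not part of this commit: DummyLM is a stand-in that answers the two request types SQuAD2 emits (greedy_until and loglikelihood), assuming the evaluator hands each LM method the list of request arguments for its type, as the request-collection loop in evaluator.py suggests. A real run would use one of the harness's LM wrappers instead.

# Hypothetical usage sketch -- not part of the commit.
from lm_eval import evaluator
from lm_eval.tasks import squad


class DummyLM:
    def greedy_until(self, requests):
        # one continuation string per greedy_until request
        return ["unanswerable" for _ in requests]

    def loglikelihood(self, requests):
        # one (logprob, is_greedy) pair per loglikelihood request
        return [(-1.0, False) for _ in requests]


task_dict = {"squad2": squad.SQuAD2()}  # same mapping the registry entry above provides
results = evaluator.evaluate(DummyLM(), task_dict, False, 0, 10)  # limit to 10 docs
print(results["squad2"])
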
