In [1]:
import glob
import logging
import os
import random
import shutil
import json

from typing import Dict, List, Tuple
from argparse import Namespace

import numpy as np
import sklearn

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

from tqdm import tqdm, trange
from tqdm.notebook import tqdm

from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelWithLMHead,
    BertForMultipleChoice,
    RobertaForMultipleChoice,
    XLNetForMultipleChoice,
    GPT2LMHeadModel,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

from dataset import (
    ResponseGenerationDataset,
    ResponseGenerationEvalDataset,
    KnowledgeSelectionDataset,
    SPECIAL_TOKENS
)

from utils.argument import (
    set_default_params,
    set_default_dataset_params,
    update_additional_params,
    verify_args
)

from utils.data import write_selection_preds, write_detection_preds

logger = logging.getLogger(__name__)

## Set arguments for dataset, model, and training

In [2]:
class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

In [3]:
args = Namespace(
    checkpoint = None,
    dataroot = 'data',
    device = "cuda" if torch.cuda.is_available() else "cpu",
    eval_all_snippets = False,
    eval_dataset = 'val',
    eval_desc = '',
    eval_only = False,
    exp_name = 'generation',
    history_max_tokens = -1,
    knowledge_file = 'knowledge.json',
    knowledge_max_tokens = -1,
    labels_file = None,
    local_rank = -1,
    model_name_or_path = 'gpt2',
    model_type = '',
    negative_sample_method = '',
    no_labels = False,
    output_dir = "outputs",
    params_file = "configs/generation_params.json",
    relevance_ratio = '1',
    task_dataset = ''
    )

In [4]:
with open(args.params_file, "r") as f:
    params = json.load(f)
    args = vars(args)

    update_additional_params(params, args)
    args.update(params)
    args = Namespace(**args)

In [5]:
args.params = params # used for saving checkpoints
set_default_params(args)
dataset_args = Namespace(**args.dataset_args)
set_default_dataset_params(dataset_args)
dataset_args.local_rank = args.local_rank
dataset_args.task = args.task

In [6]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# args.n_gpu = torch.cuda.device_count()
# args.device = device

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = 1
args.device = device

In [8]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

## Load the pre-trained model

In [9]:
config = AutoConfig.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
tokenizer.add_special_tokens(SPECIAL_TOKENS)
### Choose the model ###
model = AutoModelWithLMHead.from_pretrained(args.model_name_or_path, config=config)
model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50264, 768)

In [10]:
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

## Load dataset

In [11]:
train_dataset = ResponseGenerationDataset(dataset_args, tokenizer, split_type="train")
eval_dataset = ResponseGenerationDataset(dataset_args, tokenizer, split_type="val")

HBox(children=(FloatProgress(value=0.0, max=72518.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=72518.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




## Train

### Train and eval function

In [12]:
def train(args, train_dataset, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, run_batch_fn_train, run_batch_fn_eval) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        log_dir = os.path.join("outputs", args.exp_name) if args.exp_name else "outputs"
        tb_writer = SummaryWriter(log_dir)
        args.output_dir = log_dir


    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=train_dataset.collate_fn
    )

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    global_step = 0
    model.zero_grad()
    train_iterator = trange(
        0, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # for reproducibility

    for _ in train_iterator:
        local_steps = 0
        tr_loss = 0.0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            loss, _, _, _ = run_batch_fn_train(args, model, batch)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                local_steps += 1
                epoch_iterator.set_postfix(Loss=tr_loss/local_steps)

        results = evaluate(args, eval_dataset, model, tokenizer, run_batch_fn_eval, desc=str(global_step))
        if args.local_rank in [-1, 0]:
            for key, value in results.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
            tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
            tb_writer.add_scalar("loss", tr_loss / local_steps, global_step)

            checkpoint_prefix = "checkpoint"
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training

            logger.info("Saving model checkpoint to %s", output_dir)
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            with open(os.path.join(output_dir, "params.json"), "w") as jsonfile:
                json.dump(args.params, jsonfile, indent=2, default=lambda x: str(x))
            logger.info("Saving model checkpoint to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / local_steps


In [13]:
def evaluate(args, eval_dataset, model, tokenizer, run_batch_fn, desc="") :
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    # eval_batch_size for selection must be 1 to handle variable number of candidates
    if args.task == "selection":
        args.eval_batch_size = 1
    else:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=eval_dataset.collate_fn
    )


    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []

    for batch in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            loss, lm_logits, mc_logits, mc_labels = run_batch_fn(args, model, batch)
            if args.task == "selection":
                data_infos.append(batch[-1])
            all_preds.append(mc_logits.detach().cpu().numpy())
            all_labels.append(mc_labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if args.task.lower() == "generation":
        perplexity = torch.exp(torch.tensor(eval_loss))
        result = {"perplexity": perplexity, "loss": eval_loss}
    elif args.task.lower() == "selection":
        all_labels = np.array(all_labels).reshape(-1)
        all_pred_ids = np.array([np.argmax(logits) for logits in all_preds])
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        logger.info("Avg. # of candidates: %f", sum([len(arr[0]) for arr in all_preds]) / len(all_preds))
        result = {"loss": eval_loss, "accuracy": accuracy}
        if args.output_file:
            sorted_pred_ids = [np.argsort(logits.squeeze())[::-1] for logits in all_preds]
            write_selection_preds(eval_dataset.dataset_walker, args.output_file, data_infos, sorted_pred_ids, topk=5)
    else:
        raise ValueError("args.task not in ['generation', 'selection', 'detection'], got %s" % args.task)

    if args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result

### Functions to train or evaluate for each batch

In [14]:
def run_batch_generation(args, model, batch):
    batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
    input_ids, token_type_ids, lm_labels = batch
    model_outputs = model(input_ids=input_ids, token_type_ids=None, labels=lm_labels)
    loss = model_outputs[0]
    lm_logits = model_outputs[1]
    return loss, lm_logits, torch.tensor([]), torch.tensor([])

### Start training!

In [15]:
global_step, tr_loss = train(args, train_dataset, eval_dataset, model, tokenizer, 
                              run_batch_generation, run_batch_generation)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4796.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=50.0, style=ProgressStyle(description_wi…




Epoch:  50%|█████     | 1/2 [09:54<09:54, 594.05s/it]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4796.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=50.0, style=ProgressStyle(description_wi…




Epoch: 100%|██████████| 2/2 [19:50<00:00, 595.09s/it]


In [16]:
args.per_gpu_eval_batch_size

4

# Evaluation

In [18]:
args.labels_file = "data/val/labels.json"
args.output_file = "pred/val/generation.json"
args.checkpoint = "./outputs/generation/checkpoint-2398"
args.generation_params_file = "configs/generation_eval_params.json"

In [19]:
print(args.labels_file)
print(args.output_file)
print(args.generation_params_file)

data/val/labels.json
pred/val/generation.json
configs/generation_eval_params.json


In [20]:
with open(args.params_file, "r") as f:
    params = json.load(f)
    args = vars(args)
    update_additional_params(params, args)
    args.update(params)
    if len(args["generation_params_file"]) > 0:
        with open(args["generation_params_file"]) as fg:
            generation_params = json.load(fg)
        args.update(generation_params)
    args = Namespace(**args)

args.params = params # used for saving checkpoints
dataset_args = Namespace(**args.dataset_args)
dataset_args.local_rank = args.local_rank
dataset_args.task = args.task

In [21]:
print(args.labels_file)
print(args.output_file)
print(args.generation_params_file)

data/val/labels.json
pred/val/generation.json
configs/generation_eval_params.json


In [22]:
import torch
import torch.nn.functional as F
import logging

logger = logging.getLogger(__name__)

from utils.metrics import (
    UnigramMetric, NGramDiversity,
    CorpusNGramDiversity,
    BLEU, METEOR, ROUGE
)

from utils.data import write_generation_preds

In [23]:
!pip install nltk
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/jennybae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
args.output_dir

'outputs/generation'

In [25]:
args.output_file

'pred/val/generation.json'

In [26]:
args.labels_file

'data/val/labels.json'

In [27]:
def run_batch_generation_sample(args, model, batch, dataset):
    special_tokens_ids = args.tokenizer.convert_tokens_to_ids(dataset.SPECIAL_TOKENS_VALUES)
    current_output = []

    example = batch[0]
    knowledge, history = example["knowledge"], example["history"]
    response_text = example["response_text"]
    dialog_id = example["dialog_id"]

    for i in range(args.max_length):
        instance, sequence = dataset.build_input_from_segments(
            knowledge, history, current_output, with_eos=False
        )

        input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0)

        model_outputs = model(input_ids=input_ids, token_type_ids=token_type_ids)
        logits = model_outputs[0]

        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p)
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    logger.warning("Warning: model generating special token with probability 1! Breaking...")
                    break
                prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())
    
    return current_output, response_text, dialog_id

def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (..., vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value

    return logits

In [28]:
def evaluate(args, eval_dataset, model, tokenizer, desc=""):
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=1, # only support batch_size=1 for sampling right now
        collate_fn=eval_dataset.collate_fn
    )

    metrics = [
        UnigramMetric(metric="recall"),
        UnigramMetric(metric="precision"),
        NGramDiversity(n=1),
        NGramDiversity(n=2),
        NGramDiversity(n=3),
        NGramDiversity(n=4),
        CorpusNGramDiversity(n=1),
        CorpusNGramDiversity(n=2),
        CorpusNGramDiversity(n=3),
        CorpusNGramDiversity(n=4),
        BLEU(),
        METEOR(),
        ROUGE()
    ]

    args.tokenizer = tokenizer
    all_output_texts = []
    dialog_ids = []
    do_evaluate = False
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            sampled_output_ids, ground_truth, dialog_id = run_batch_generation_sample(args, model, batch, eval_dataset)
            sampled_output_text = tokenizer.decode(sampled_output_ids, skip_special_tokens=True)
            all_output_texts.append(sampled_output_text)
            dialog_ids.append(dialog_id)
        if ground_truth.strip() != "":
            do_evaluate = True
            for metric in metrics:
                metric.update((sampled_output_text, ground_truth))

    if args.output_file:
        write_generation_preds(eval_dataset.dataset_walker, args.output_file, dialog_ids, all_output_texts)

    result = dict()
    if do_evaluate and args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for metric in metrics:
                name = metric.name()
                score = metric.compute()
                result[name] = score
                logger.info("  %s = %s", name, str(score))
                writer.write("%s = %s\n" % (name, str(score)))

    return result



In [29]:
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)
model = GPT2LMHeadModel.from_pretrained(args.checkpoint)
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [30]:
eval_dataset = ResponseGenerationEvalDataset(dataset_args, tokenizer, split_type="val", 
                                             labels_file=args.labels_file)
result = evaluate(args, eval_dataset, model, tokenizer, 
                  desc=args.eval_desc or "val")

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=200.0, style=ProgressStyle(description_w…

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()





In [31]:
result

{'UnigramRecall': 0.42050354337099854,
 'UnigramPrecision': 0.43602368513901346,
 '1GramDiversity': 0.9508670903476464,
 '2GramDiversity': 0.9473561393083778,
 '3GramDiversity': 0.8984176590912578,
 '4GramDiversity': 0.8481108636368873,
 'Corpus1GramDiversity': 0.0983125458547322,
 'Corpus2GramDiversity': 0.2619222303741746,
 'Corpus3GramDiversity': 0.3690388848129127,
 'Corpus4GramDiversity': 0.4326241134751773,
 'BLEU': 0.08675389797650139,
 'METEOR': 0.3761632690299168,
 'ROUGE': 0.2895436928846583}

In [35]:
!python scripts/scores.py --dataset val --dataroot data --outfile pred/val/generation.json --scorefile pred/val/generation.score.json

Traceback (most recent call last):
  File "scripts/scores.py", line 1, in <module>
    from utils.dataset_walker import DatasetWalker
ModuleNotFoundError: No module named 'utils'
