In [1]:
import os
# Number GPUs by PCI bus position so that the index selected below is stable across runs.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only GPU index 1 to this process. Both variables are set in this first cell,
# before torch is imported in the next one.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
from tqdm import tqdm
import torch
import re
import torch.nn as nn
from itertools import islice, zip_longest
from c2nl.eval.bleu import corpus_bleu
from c2nl.eval.rouge import Rouge
from c2nl.eval.meteor import Meteor
from c2nl.eval.distinct_n.distinct_ngrams import distinct_n_corpus_level
from c2nl.eval.self_bleu import self_bleu_score
from c2nl.eval.self_bert import self_bert_score
from datasets import load_dataset, load_from_disk, Dataset
from itertools import islice, zip_longest, chain
from evaluate import load
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer, get_linear_schedule_with_warmup, T5ForConditionalGeneration, LogitsProcessorList, BeamSearchScorer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
import gumbel as sbsutils
import numpy as np
import random
import json

In [3]:
def set_seed(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) with
    ``seed_value``, switches cuDNN to deterministic mode with auto-tuning disabled,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    # Exported as a string because environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # cuDNN: pick deterministic kernels and skip the benchmark-based kernel search.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for each library-level generator.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)


# Fix the seed once for the whole notebook; change 42 to use a different seed.
set_seed(42)

In [4]:
# Target programming language of the dataset; selects the checkpoint and the data directory.
lang = "python"
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# torch.cuda.get_device_name() only accepts CUDA devices, so it is queried only when the
# GPU was actually selected; on the CPU fallback the cell displays "cpu" instead of raising.
torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"

Using device: cuda


'NVIDIA GeForce RTX 4090'

In [5]:

# Both the fine-tuned checkpoint directory and the tokenizer name depend on the language.
_is_java = lang == 'java'
base_model = 'codet5p_ft_lang_java_backbone' if _is_java else 'codet5p_ft_lang_python_backbone'
base_model_tokenizer = 'Salesforce/codet5p-220m' if _is_java else 'Salesforce/codet5p-220m-bimodal'

# Checkpoints whose name contains 'bimodal' or 'python' are loaded through AutoModel
# (with remote code enabled); everything else is a plain T5 conditional-generation model.
_needs_auto_model = any(tag in base_model for tag in ('bimodal', 'python'))
if not _needs_auto_model:
    print("using t5 conditional generation model")
    model = T5ForConditionalGeneration.from_pretrained(base_model, trust_remote_code=True).to(device)
else:
    print("using auto model")
    model = AutoModel.from_pretrained(base_model, trust_remote_code=True).to(device)

# Token budgets used when tokenizing the training sources and targets.
max_input_length = 512
max_target_length = 100

using auto model


In [6]:
# Tokenizer that belongs to the selected backbone.
tokenizer = AutoTokenizer.from_pretrained(base_model_tokenizer)

# Parallel corpus files for each split: the source code file and its summary file.
# Despite the `_dir` suffix, every variable below holds a file path, not a directory.
train_source_dir, train_target_dir = (
    "./data/{}/train/code.original".format(lang),
    "./data/{}/train/javadoc.original".format(lang),
)
validation_source_dir, validation_target_dir = (
    "./data/{}/dev/code.original".format(lang),
    "./data/{}/dev/javadoc.original".format(lang),
)
test_source_dir, test_target_dir = (
    "./data/{}/test/code.original".format(lang),
    "./data/{}/test/javadoc.original".format(lang),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Load the parallel training corpus (one code snippet / one summary per line).
# The files are read as UTF-8, like the evaluation helpers do, and are closed as soon
# as their lines have been read instead of leaking the handles.
with open(train_source_dir, 'r', encoding="utf-8") as code_file:
    codes = code_file.readlines()
with open(train_target_dir, 'r', encoding="utf-8") as doc_file:
    docs = doc_file.readlines()

# Sources and targets are padded/truncated to their respective fixed lengths.
train_inputs = tokenizer(codes, max_length=max_input_length, padding="max_length", truncation=True)
labels = tokenizer(docs, max_length=max_target_length, padding="max_length", truncation=True)

# Padding positions in the labels are replaced with -100 (the default `ignore_index` of
# PyTorch's cross-entropy loss) so that they do not contribute to the training loss.
train_inputs["labels"] = [
    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
]
# Kept separately because `train` passes it to the model as `decoder_attention_mask`.
train_inputs["labels_attention_mask"] = labels["attention_mask"].copy()

In [8]:
# Wrap the tokenized columns in a `datasets.Dataset` and have the four listed columns
# returned as torch tensors.
train_data = Dataset.from_dict(train_inputs)
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'labels_attention_mask'])
# Bare expression: shows the dataset summary as the cell output.
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'labels_attention_mask'],
    num_rows: 55538
})

In [9]:
# Shuffled mini-batches of 15 examples consumed by `train`.
train_loader = DataLoader(train_data, batch_size=15, shuffle=True)

In [10]:
# Optimisation hyper-parameters used by the optimizer/scheduler cell and by `train`.
learning_rate = 5e-5
warmup_steps = 500
num_epochs = 10

# `train` steps the scheduler once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_loader) * num_epochs

In [11]:
def eval_bleu(model, device, src_dir, tgt_dir, tokenizer):
    """Generate one summary per source line and return the corpus-level BLEU score.

    Args:
        model: Sequence-to-sequence model exposing ``generate``; switched to eval mode here.
        device: Device the tokenized inputs are moved to.
        src_dir: Path of the source file (one code snippet per line, UTF-8).
        tgt_dir: Path of the reference file (one summary per line, UTF-8), aligned with ``src_dir``.
        tokenizer: Tokenizer used to encode the sources and decode the generated ids.

    Returns:
        The corpus BLEU value, i.e. the second element returned by ``corpus_bleu``.
    """
    model.eval()
    # Context managers close the files right after reading (the handles used to leak).
    with open(src_dir, encoding="utf-8") as src_file:
        source_codes = src_file.readlines()
    with open(tgt_dir, encoding="utf-8") as tgt_file:
        targets = tgt_file.readlines()
    all_summaries = []
    batch_size = 32
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        source = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = source["input_ids"].to(device)
        attention_mask = source["attention_mask"].to(device)
        generated_ids = model.generate(input_ids,
                                       attention_mask=attention_mask,
                                       max_length=50)
        summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        all_summaries.extend(summaries)
    # NOTE(review): the final character of every hypothesis is dropped unconditionally before
    # ' .' is appended - presumably to re-tokenise a trailing period; confirm this is intended
    # for summaries that do not end with '.'.
    hypotheses = dict(enumerate([[summary.rstrip().lower()[:-1]+' .'] for summary in all_summaries]))
    references = dict(enumerate([[target.rstrip().lower()] for target in targets]))
    # Only the corpus-level score is needed; the other two return values are discarded.
    _, bleu, _ = corpus_bleu(hypotheses, references)
    return bleu
    

In [12]:
# measure model performance before fine-tuning
# eval_bleu(model, device, test_source_dir, test_target_dir, tokenizer)

In [13]:
# AdamW over all model parameters, with a linear schedule that takes `warmup_steps` as its
# warm-up length and `total_steps` as its total number of training steps.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [14]:
def train(model, device, train_loader, optimizer, scheduler, num_epochs):
    """Fine-tune ``model`` for ``num_epochs`` passes over ``train_loader``.

    For every batch the tensors are moved to ``device``, a forward pass with labels yields
    the loss, gradients are clipped to a norm of 1.0, and the optimizer and the scheduler
    are each stepped once. After every fifth epoch the validation BLEU is printed; it is
    computed by ``eval_bleu`` with the notebook-level ``validation_source_dir``,
    ``validation_target_dir`` and ``tokenizer``. The model is updated in place and nothing
    is returned.
    """
    tensor_keys = ('input_ids', 'attention_mask', 'labels', 'labels_attention_mask')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0

        # tqdm wrapper so the per-batch loss can be shown next to the progress bar.
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Move the four tensors the model consumes onto the target device.
            on_device = {key: batch[key].to(device) for key in tensor_keys}

            # Forward pass; the model computes the loss itself from `labels`.
            model.zero_grad()
            outputs = model(input_ids=on_device['input_ids'],
                            attention_mask=on_device['attention_mask'],
                            labels=on_device['labels'],
                            decoder_attention_mask=on_device['labels_attention_mask'])
            loss = outputs.loss

            # Backward pass with gradient clipping, then one optimizer and one scheduler step.
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        avg_epoch_loss = running_loss / len(train_loader)
        # Validation is run on epochs 5, 10, 15, ...
        is_validation_epoch = (epoch+1) % 5 == 0
        if is_validation_epoch:
            bleu = eval_bleu(model, device, validation_source_dir, validation_target_dir, tokenizer)
            print("validation BLEU: ", bleu)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_epoch_loss}")

In [15]:
# Start training
# train(model, device, train_loader, optimizer, scheduler, num_epochs)

In [16]:
# measure model performance after fine-tuning
# eval_bleu(model, device, test_source_dir, test_target_dir, tokenizer)

In [17]:
# Save the model
# model.save_pretrained("codet5p_ft_epoch_{}_lang_{}".format(num_epochs, lang))

In [18]:
class TextDataset(TorchDataset):
    """Index-aligned (hypothesis, reference) string pairs for batched scoring.

    Both constructor arguments are dicts whose values are lists of strings; the lists are
    concatenated in dict order, so item ``i`` pairs the i-th hypothesis with the i-th reference.
    """

    def __init__(self, hypotheses, references):
        # Flatten {key: [text, ...]} into one list per side.
        self.hypotheses = [text for group in hypotheses.values() for text in group]
        self.references = [text for group in references.values() for text in group]

    def __len__(self):
        # The dataset length follows the hypotheses side.
        return len(self.hypotheses)

    def __getitem__(self, idx):
        hypothesis = self.hypotheses[idx]
        reference = self.references[idx]
        return hypothesis, reference

In [19]:
def eval_with_my_beam_search(model,
                             device,
                             src_dir,
                             tgt_dir,
                             tokenizer,
                             beam_search,
                             batch_size=16,
                             beam_size=4,
                             num_return_sequences=4,
                             temperature=1.0):
    """Decode every source with ``beam_search`` and record diversity and oracle quality metrics.

    Args:
        model: Model handed to ``beam_search``; switched to eval mode here.
        device: Device the tokenized sources are moved to.
        src_dir: Path of the source file (one code snippet per line, UTF-8).
        tgt_dir: Path of the reference file (one summary per line, UTF-8), aligned with ``src_dir``.
        tokenizer: Tokenizer used for encoding; also passed on to ``beam_search``.
        beam_search: Callable invoked as ``beam_search(model, input_ids, attention_mask,
            tokenizer, beam_size, num_return_sequences, temperature=temperature)``. It must
            return a flat list of decoded strings containing ``num_return_sequences``
            consecutive summaries for each source of the batch.
        batch_size: Number of sources decoded per call.
        beam_size: Beam width forwarded to ``beam_search``.
        num_return_sequences: Number of summaries produced per source.
        temperature: Temperature forwarded to ``beam_search``; also part of the output file names.

    Returns:
        ``(hypotheses, references)``: dicts keyed by a running index, each value a one-element
        list holding a normalised summary / the matching (repeated) reference.

    Side effects:
        Writes the hypotheses and references as JSON below
        ``diverse_decoding_results/generated/<lang>/`` (``lang`` is the notebook-level
        variable) and appends one summary line to
        ``diverse_decoding_results/stochastic_beam_search_results.txt``. The directories are
        created when missing.
    """
    def _normalise_hypothesis(summary):
        # Lower-case, collapse newline/tab/carriage-return runs into a space and end with ' .'.
        # NOTE(review): the final character is dropped unconditionally before ' .' is
        # appended - confirm this is intended for summaries that do not end with '.'.
        return re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", summary.strip().lower()[:-1]+' .')

    model.eval()
    bertscore = load("bertscore")
    # Context managers close the input files right after reading (the handles used to leak).
    with open(src_dir, encoding="utf-8") as src_file:
        source_codes = [code.rstrip() for code in src_file]
    with open(tgt_dir, encoding="utf-8") as tgt_file:
        targets = [target.rstrip() for target in tgt_file]
    separated_hypotheses = []
    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        source = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=350)
        input_ids = source["input_ids"].to(device)
        attention_mask = source["attention_mask"].to(device)
        summaries = beam_search(model,
                                input_ids,
                                attention_mask,
                                tokenizer,
                                beam_size,
                                num_return_sequences,
                                temperature=temperature)
        # Diversity is a per-source property, so the batch output is split into one group of
        # `num_return_sequences` summaries per source. Grouping per batch (the previous
        # behaviour) mixed summaries of different sources whenever batch_size > 1; for
        # batch_size == 1 the result is unchanged.
        normalised = [_normalise_hypothesis(summary) for summary in summaries]
        for start in range(0, len(normalised), num_return_sequences):
            separated_hypotheses.append(normalised[start:start + num_return_sequences])
        all_summaries.extend(summaries)

    # Diversity metrics, computed per source and then averaged.
    all_distinct_unigrams_ratio = [distinct_n_corpus_level(preds, n=1) for preds in separated_hypotheses]
    all_distinct_bigrams_ratio = [distinct_n_corpus_level(preds, n=2) for preds in separated_hypotheses]
    all_self_bleu = [self_bleu_score(preds) for preds in separated_hypotheses]
    all_self_bert_precision = []
    all_self_bert_recall = []
    all_self_bert_f1 = []
    for preds in separated_hypotheses:
        bert_precision, bert_recall, bert_f1 = self_bert_score(preds, bertscore)
        all_self_bert_precision.append(bert_precision)
        all_self_bert_recall.append(bert_recall)
        all_self_bert_f1.append(bert_f1)
    average_distinct_unigrams_ratio = np.mean(all_distinct_unigrams_ratio)
    average_distinct_bigrams_ratio = np.mean(all_distinct_bigrams_ratio)
    average_self_bleu = np.mean(all_self_bleu)
    average_self_bert_precision = np.mean(all_self_bert_precision)
    average_self_bert_recall = np.mean(all_self_bert_recall)
    average_self_bert_f1 = np.mean(all_self_bert_f1)

    hypotheses = dict(enumerate([[_normalise_hypothesis(summary)] for summary in all_summaries]))
    # Repeat every target once per generated sequence so hypotheses and references align.
    repeated_targets = []
    for target in targets:
        repeated_targets.extend([target]*num_return_sequences)
    references = dict(enumerate([[re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", target.strip().lower())] for target in repeated_targets]))

    # Oracle scores: best of the `num_return_sequences` candidates of each source.
    _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
    reshaped_bleu = np.array(list(ind_bleu.values())).reshape(-1, num_return_sequences)
    oracle_bleu = np.max(reshaped_bleu, axis=1)
    rouge_calculator = Rouge()
    rouge_l, ind_rouge = rouge_calculator.compute_score(references, hypotheses)
    reshaped_rouge = np.array(list(ind_rouge.values())).reshape(-1, num_return_sequences)
    oracle_rouge = np.max(reshaped_rouge, axis=1)
    meteor_calculator = Meteor()
    meteor, ind_meteor = meteor_calculator.compute_score(references, hypotheses)
    reshaped_meteor = np.array(list(ind_meteor)).reshape(-1, num_return_sequences)
    oracle_meteor = np.max(reshaped_meteor, axis=1)
    print("Oracle scores, bleu: ", np.mean(oracle_bleu) * 100, " rouge-l: ", np.mean(oracle_rouge) * 100, " meteor: ", np.mean(oracle_meteor) * 100)

    bert_precision = []
    bert_recall = []
    bert_f1 = []
    # BERTScore is computed in large chunks; a dedicated name keeps the `batch_size`
    # parameter from being overwritten.
    bert_batch_size = 5120
    dataset = TextDataset(hypotheses, references)
    dataloader = DataLoader(dataset, batch_size=bert_batch_size, shuffle=False)
    for hypotheses_batch, references_batch in tqdm(dataloader, desc="Processing batches"):
        batch_results = bertscore.compute(predictions=hypotheses_batch,
                                          references=references_batch,
                                          lang="en")
        bert_precision.extend(list(batch_results['precision']))
        bert_recall.extend(list(batch_results['recall']))
        bert_f1.extend(list(batch_results['f1']))

    bert_precision_array = np.array(bert_precision).reshape(-1, num_return_sequences)
    best_bert_precision = np.max(bert_precision_array, axis=1)

    bert_recall_array = np.array(bert_recall).reshape(-1, num_return_sequences)
    best_bert_recall = np.max(bert_recall_array, axis=1)

    bert_f1_array = np.array(bert_f1).reshape(-1, num_return_sequences)
    best_bert_f1 = np.max(bert_f1_array, axis=1)

    # Write results; the output directories are created first so a fresh checkout does not fail.
    generated_dir = os.path.join("diverse_decoding_results", "generated", lang)
    os.makedirs(generated_dir, exist_ok=True)
    run_tag = f"stochastic_beam_search_results_beam_{num_return_sequences}_temperature_{temperature}"
    with open(os.path.join(generated_dir, f"{run_tag}_hypotheses.json"), 'w') as hyp_fw:
        json.dump(hypotheses, hyp_fw, indent=4)
    with open(os.path.join(generated_dir, f"{run_tag}_references.json"), 'w') as ref_fw:
        json.dump(references, ref_fw, indent=4)

    file_name = "stochastic_beam_search_results.txt"
    with open(os.path.join("diverse_decoding_results", file_name), "a") as f:
        f.write(f"temperature: {temperature}, num_return_sequences: {num_return_sequences}, distinct_unigrams_ratio: {average_distinct_unigrams_ratio * 100: .4f}, distinct_bigrams_ratio: {average_distinct_bigrams_ratio * 100: .4f}, self_bleu: {average_self_bleu * 100: .4f}, self_bert_precision: {average_self_bert_precision * 100: .4f}, self_bert_recall: {average_self_bert_recall * 100: .4f}, self_bert_f1: {average_self_bert_f1 * 100: .4f}, average_bleu: {np.mean(list(ind_bleu.values())) * 100: .4f}, average_rouge: {np.mean(list(ind_rouge.values())) * 100: .4f}, average_meteor: {np.mean(list(ind_meteor)) * 100: .4f}, oracle_bleu: {np.mean(oracle_bleu) * 100: .4f}, oracle_rouge: {np.mean(oracle_rouge) * 100: .4f}, oracle_meteor: {np.mean(oracle_meteor) * 100: .4f}, oracle_bert_precision: {np.mean(best_bert_precision) * 100: .4f}, oracle_bert_recall: {np.mean(best_bert_recall) * 100: .4f}, oracle_bert_f1: {np.mean(best_bert_f1) * 100: .4f}\n")

    return hypotheses, references

In [20]:
def my_beam_search(model,
                   encoder_input_ids,
                   encoder_attention_mask,
                   tokenizer,
                   num_beams=4,
                   num_return_seq=4,
                   max_length=70,
                   temperature=1.0):
    """Deterministic beam search built on ``BeamSearchScorer``; returns decoded strings.

    Args:
        model: Encoder-decoder model providing ``get_encoder()`` and a forward pass that
            accepts ``encoder_outputs`` / ``decoder_input_ids`` and returns ``.logits``.
        encoder_input_ids: Tokenized sources, first dimension is the batch.
        encoder_attention_mask: Padding mask matching ``encoder_input_ids``.
        tokenizer: Supplies ``pad_token_id`` / ``eos_token_id`` and decodes the result.
        num_beams: Beam width.
        num_return_seq: Hypotheses kept per source (``num_beam_hyps_to_keep``).
        max_length: Maximum decoder length, including the start token.
        temperature: Divides the logits before the log-softmax. The default of 1.0 leaves
            the scores unchanged. The keyword exists so this function can be passed as the
            ``beam_search`` argument of ``eval_with_my_beam_search``, which always forwards
            ``temperature=``.

    Returns:
        List of decoded strings for the sequences returned by ``BeamSearchScorer.finalize``.
    """
    model.eval()
    beam_scorer = BeamSearchScorer(batch_size=encoder_input_ids.size(0),
                                   num_beams=num_beams,
                                   device=encoder_input_ids.device,
                                   num_beam_hyps_to_keep=num_return_seq,
                                   max_length=max_length,)
    # Empty processor list: scores pass through unchanged, the call is kept as an extension hook.
    logits_processor = LogitsProcessorList()
    pad_token_id = tokenizer.pad_token_id
    eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    # Decoding starts from a single pad token per sequence.
    decoder_input_ids = torch.full((encoder_input_ids.size(0), 1), pad_token_id, dtype=torch.long, device=encoder_input_ids.device)

    # Every source is replicated once per beam.
    expanded_encoder_input_ids = encoder_input_ids.repeat_interleave(num_beams, dim=0)
    expanded_encoder_attention_mask = encoder_attention_mask.repeat_interleave(num_beams, dim=0)
    expanded_decoder_input_ids = decoder_input_ids.repeat_interleave(num_beams, dim=0)

    batch_size = encoder_input_ids.size(0)
    batch_beam_size, cur_len = expanded_decoder_input_ids.shape

    assert num_beams * batch_size == batch_beam_size

    beam_indices = (tuple(() for _ in range(batch_beam_size)))

    # All beams of a source start identical, so only the first one gets a live score;
    # otherwise the first top-k would select the same token once per beam.
    beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=expanded_decoder_input_ids.device)
    beam_scores[:, 1:] = -1e9
    beam_scores = beam_scores.view((batch_size * num_beams,))

    decoder_prompt_len = expanded_decoder_input_ids.shape[-1]
    # The encoder runs once; no_grad avoids building an autograd graph during inference.
    with torch.no_grad():
        encoder_output = model.get_encoder()(expanded_encoder_input_ids, expanded_encoder_attention_mask)

    while cur_len < max_length:
        with torch.no_grad():
            # The encoder padding mask is forwarded so that cross-attention ignores padded
            # source positions when a batch contains sources of different lengths.
            outputs = model(encoder_outputs=encoder_output,
                            attention_mask=expanded_encoder_attention_mask,
                            decoder_input_ids=expanded_decoder_input_ids)
        next_token_logits = outputs.logits[:, -1, :]
        next_token_scores = nn.functional.log_softmax(
            next_token_logits / temperature, dim=-1
        )  # (batch_size * num_beams, vocab_size)
        next_token_scores_processed = logits_processor(expanded_decoder_input_ids, next_token_scores)
        # Accumulate: running beam score + log-probability of each candidate token.
        next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
            next_token_scores_processed
        )
        vocab_size = next_token_scores.shape[-1]
        next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
        # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
        n_eos_tokens = len(eos_token_id) if eos_token_id else 0
        next_token_scores, next_tokens = torch.topk(
            next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
        )
        # Split the flat (beam * vocab) index back into source beam and token id.
        next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
        next_tokens = next_tokens % vocab_size
        # stateless
        beam_outputs = beam_scorer.process(
            expanded_decoder_input_ids,
            next_token_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            beam_indices=beam_indices,
            decoder_prompt_len=decoder_prompt_len,
        )

        beam_scores = beam_outputs["next_beam_scores"]
        beam_next_tokens = beam_outputs["next_beam_tokens"]
        beam_idx = beam_outputs["next_beam_indices"]
        # Reorder the prefixes to the surviving beams and append their new tokens.
        expanded_decoder_input_ids = torch.cat([expanded_decoder_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
        cur_len = cur_len + 1
        if beam_scorer.is_done:
            break
    sequence_outputs = beam_scorer.finalize(
        expanded_decoder_input_ids,
        beam_scores,
        next_tokens,
        next_indices,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        max_length=max_length,
        beam_indices=beam_indices,
        decoder_prompt_len=decoder_prompt_len,
    )
    decoded = tokenizer.batch_decode(sequence_outputs["sequences"], skip_special_tokens=True)
    return decoded

In [21]:
def stochastic_beam_search(model,
                           encoder_input_ids,
                           encoder_attention_mask,
                           tokenizer,
                           num_beams=4,
                           num_return_seq=4,
                           max_length=60,
                           temperature=1.,):
    """Beam search whose candidates are ranked by Gumbel-perturbed scores.

    The decoding loop is the ``BeamSearchScorer``-based loop of ``my_beam_search``, except
    that the accumulated log-probabilities are perturbed with the helpers of the ``gumbel``
    module (imported as ``sbsutils``) before the top-k selection.

    Args:
        model: Encoder-decoder model providing ``get_encoder()`` and a forward pass that
            accepts ``encoder_outputs`` / ``decoder_input_ids`` and returns ``.logits``.
        encoder_input_ids: Tokenized sources, first dimension is the batch.
        encoder_attention_mask: Padding mask matching ``encoder_input_ids``.
        tokenizer: Supplies ``pad_token_id`` / ``eos_token_id`` and decodes the result.
        num_beams: Beam width.
        num_return_seq: Hypotheses kept per source (``num_beam_hyps_to_keep``).
        max_length: Maximum decoder length, including the start token.
        temperature: Divides the logits before the log-softmax.

    Returns:
        List of decoded strings for the sequences returned by ``BeamSearchScorer.finalize``.
    """
    model.eval()
    beam_scorer = BeamSearchScorer(batch_size=encoder_input_ids.size(0),
                                   num_beams=num_beams,
                                   device=encoder_input_ids.device,
                                   num_beam_hyps_to_keep=num_return_seq,
                                   max_length=max_length,)
    # Empty processor list: scores pass through unchanged, the call is kept as an extension hook.
    logits_processor = LogitsProcessorList()
    pad_token_id = tokenizer.pad_token_id
    eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    # Decoding starts from a single pad token per sequence.
    decoder_input_ids = torch.full((encoder_input_ids.size(0), 1), pad_token_id, dtype=torch.long, device=encoder_input_ids.device)

    # Every source is replicated once per beam.
    expanded_encoder_input_ids = encoder_input_ids.repeat_interleave(num_beams, dim=0)
    expanded_encoder_attention_mask = encoder_attention_mask.repeat_interleave(num_beams, dim=0)
    expanded_decoder_input_ids = decoder_input_ids.repeat_interleave(num_beams, dim=0)

    batch_size = encoder_input_ids.size(0)
    batch_beam_size, cur_len = expanded_decoder_input_ids.shape

    assert num_beams * batch_size == batch_beam_size

    beam_indices = (tuple(() for _ in range(batch_beam_size)))

    # All beams of a source start identical, so only the first one gets a live score.
    beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=expanded_decoder_input_ids.device)
    beam_scores[:, 1:] = -1e9
    beam_scores = beam_scores.view((batch_size * num_beams,))
    # Column t stores the (perturbed) beam scores selected at decoding length t.
    gumbel_scores = torch.zeros((batch_size * num_beams, max_length), dtype=torch.float, device=expanded_decoder_input_ids.device)

    decoder_prompt_len = expanded_decoder_input_ids.shape[-1]
    # The encoder runs once; no_grad avoids building an autograd graph during inference.
    with torch.no_grad():
        encoder_output = model.get_encoder()(expanded_encoder_input_ids, expanded_encoder_attention_mask)

    while cur_len < max_length:
        with torch.no_grad():
            # The encoder padding mask is forwarded so that cross-attention ignores padded
            # source positions when a batch contains sources of different lengths.
            outputs = model(encoder_outputs=encoder_output,
                            attention_mask=expanded_encoder_attention_mask,
                            decoder_input_ids=expanded_decoder_input_ids)
        next_token_logits = outputs.logits[:, -1, :]
        next_token_scores = nn.functional.log_softmax(
            next_token_logits / temperature, dim=-1
        )  # (batch_size * num_beams, vocab_size)
        next_token_scores_processed = logits_processor(expanded_decoder_input_ids, next_token_scores)
        next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
            next_token_scores_processed
        )
        if cur_len == 1:
            # First step: add noise from sbsutils.gumbel_like to the scores.
            cand_scores = sbsutils.gumbel_like(next_token_scores) + next_token_scores
        else:
            # Later steps: perturb per beam, passing the previous step's stored beam scores
            # as the second argument.
            # NOTE(review): presumably gumbel_with_maximum conditions the children's Gumbel
            # noise on the parent's value - confirm against the `gumbel` module.
            cand_scores, _ = sbsutils.gumbel_with_maximum(next_token_scores.view(batch_size, num_beams, -1), gumbel_scores.view(batch_size, num_beams, -1)[:, :, cur_len-1], -1)
            cand_scores = cand_scores.view(batch_size * num_beams, -1)
        next_token_scores = cand_scores
        vocab_size = next_token_scores.shape[-1]
        next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
        # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
        n_eos_tokens = len(eos_token_id) if eos_token_id else 0
        next_token_scores, next_tokens = torch.topk(
            next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
        )
        # Split the flat (beam * vocab) index back into source beam and token id.
        next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
        next_tokens = next_tokens % vocab_size
        # stateless
        beam_outputs = beam_scorer.process(
            expanded_decoder_input_ids,
            next_token_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            beam_indices=beam_indices,
            decoder_prompt_len=decoder_prompt_len,
        )

        beam_scores = beam_outputs["next_beam_scores"]
        beam_next_tokens = beam_outputs["next_beam_tokens"]
        beam_idx = beam_outputs["next_beam_indices"]
        # Remember the selected scores; they are read back at the next step.
        gumbel_scores[:, cur_len] = beam_scores
        # Reorder the prefixes to the surviving beams and append their new tokens.
        expanded_decoder_input_ids = torch.cat([expanded_decoder_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
        cur_len = cur_len + 1
        if beam_scorer.is_done:
            break
    sequence_outputs = beam_scorer.finalize(
        expanded_decoder_input_ids,
        beam_scores,
        next_tokens,
        next_indices,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        max_length=max_length,
        beam_indices=beam_indices,
        decoder_prompt_len=decoder_prompt_len,
    )
    decoded = [tokenizer.decode(output, skip_special_tokens=True) for output in sequence_outputs["sequences"]]
    return decoded

In [22]:
# num_return_sequences_list = [10, 20]
# temperature_list = [0.3]
# for temperature in temperature_list:
#     for num_return_sequences in num_return_sequences_list:
#         print("temperature: ", temperature, " num_return_sequences: ", num_return_sequences)
#         hypotheses, references = eval_with_my_beam_search(model,
#                                                           device,
#                                                           test_source_dir,
#                                                           test_target_dir,
#                                                           tokenizer,
#                                                           stochastic_beam_search,
#                                                           batch_size=1,
#                                                           beam_size=num_return_sequences,
#                                                           num_return_sequences=num_return_sequences,
#                                                           temperature=temperature)

In [23]:
def distinct_with_beam_search(model,
                              device,
                              src_dir,
                              tgt_dir,
                              tokenizer,
                              batch_size=16,
                              beam_size=10,
                              num_return_sequences=8):
    """Decode every source with ``model.generate`` beam search and record diversity and oracle metrics.

    Args:
        model: Model exposing ``generate``; switched to eval mode here.
        device: Device the tokenized sources are moved to.
        src_dir: Path of the source file (one code snippet per line, UTF-8).
        tgt_dir: Path of the reference file (one summary per line, UTF-8), aligned with ``src_dir``.
        tokenizer: Tokenizer used to encode the sources and decode the generated ids.
        batch_size: Number of sources decoded per ``generate`` call.
        beam_size: Value passed to ``generate`` as ``num_beams``.
        num_return_sequences: Number of summaries returned per source.

    Returns:
        ``(hypotheses, references)``: dicts keyed by a running index, each value a one-element
        list holding a normalised summary / the matching (repeated) reference.

    Side effects:
        Writes the hypotheses and references as JSON below
        ``diverse_decoding_results/generated/<lang>/`` (``lang`` is the notebook-level
        variable) and appends one summary line to
        ``diverse_decoding_results/beam_search_results.txt``. The directories are created
        when missing.
    """
    def _normalise_hypothesis(summary):
        # Lower-case, collapse newline/tab/carriage-return runs into a space and end with ' .'.
        # NOTE(review): the final character is dropped unconditionally before ' .' is
        # appended - confirm this is intended for summaries that do not end with '.'.
        return re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", summary.strip().lower()[:-1]+' .')

    model.eval()
    bertscore = load("bertscore")
    # Context managers close the input files right after reading (the handles used to leak).
    with open(src_dir, encoding="utf-8") as src_file:
        source_codes = [code.rstrip() for code in src_file]
    with open(tgt_dir, encoding="utf-8") as tgt_file:
        targets = [target.rstrip() for target in tgt_file]
    separated_hypotheses = []
    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        source = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = source["input_ids"].to(device)
        attention_mask = source["attention_mask"].to(device)
        generated_ids = model.generate(input_ids,
                                       attention_mask=attention_mask,
                                       max_length=100,
                                       num_beams=beam_size,
                                       num_return_sequences=num_return_sequences)
        summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # Diversity is a per-source property, so the batch output is split into one group of
        # `num_return_sequences` summaries per source. Grouping per batch (the previous
        # behaviour) mixed summaries of different sources whenever batch_size > 1; for
        # batch_size == 1 the result is unchanged.
        normalised = [_normalise_hypothesis(summary) for summary in summaries]
        for start in range(0, len(normalised), num_return_sequences):
            separated_hypotheses.append(normalised[start:start + num_return_sequences])
        all_summaries.extend(summaries)

    # Diversity metrics, computed per source and then averaged.
    all_distinct_unigrams_ratio = [distinct_n_corpus_level(preds, n=1) for preds in separated_hypotheses]
    all_distinct_bigrams_ratio = [distinct_n_corpus_level(preds, n=2) for preds in separated_hypotheses]
    all_self_bleu = [self_bleu_score(preds) for preds in separated_hypotheses]
    all_self_bert_precision = []
    all_self_bert_recall = []
    all_self_bert_f1 = []
    for preds in separated_hypotheses:
        bert_precision, bert_recall, bert_f1 = self_bert_score(preds, bertscore)
        all_self_bert_precision.append(bert_precision)
        all_self_bert_recall.append(bert_recall)
        all_self_bert_f1.append(bert_f1)
    average_distinct_unigrams_ratio = np.mean(all_distinct_unigrams_ratio)
    average_distinct_bigrams_ratio = np.mean(all_distinct_bigrams_ratio)
    average_self_bleu = np.mean(all_self_bleu)
    average_self_bert_precision = np.mean(all_self_bert_precision)
    average_self_bert_recall = np.mean(all_self_bert_recall)
    average_self_bert_f1 = np.mean(all_self_bert_f1)

    hypotheses = dict(enumerate([[_normalise_hypothesis(summary)] for summary in all_summaries]))
    # Repeat every target once per generated sequence so hypotheses and references align.
    repeated_targets = []
    for target in targets:
        repeated_targets.extend([target]*num_return_sequences)
    references = dict(enumerate([[re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", target.strip().lower())] for target in repeated_targets]))

    # Oracle scores: best of the `num_return_sequences` candidates of each source.
    _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
    reshaped_bleu = np.array(list(ind_bleu.values())).reshape(-1, num_return_sequences)
    oracle_bleu = np.max(reshaped_bleu, axis=1)
    rouge_calculator = Rouge()
    rouge_l, ind_rouge = rouge_calculator.compute_score(references, hypotheses)
    reshaped_rouge = np.array(list(ind_rouge.values())).reshape(-1, num_return_sequences)
    oracle_rouge = np.max(reshaped_rouge, axis=1)
    meteor_calculator = Meteor()
    meteor, ind_meteor = meteor_calculator.compute_score(references, hypotheses)
    reshaped_meteor = np.array(list(ind_meteor)).reshape(-1, num_return_sequences)
    oracle_meteor = np.max(reshaped_meteor, axis=1)
    print("Oracle scores, bleu: ", np.mean(oracle_bleu) * 100, " rouge-l: ", np.mean(oracle_rouge) * 100, " meteor: ", np.mean(oracle_meteor) * 100)

    bert_precision = []
    bert_recall = []
    bert_f1 = []
    # BERTScore is computed in large chunks; a dedicated name keeps the `batch_size`
    # parameter from being overwritten.
    bert_batch_size = 5120
    dataset = TextDataset(hypotheses, references)
    dataloader = DataLoader(dataset, batch_size=bert_batch_size, shuffle=False)
    for hypotheses_batch, references_batch in tqdm(dataloader, desc="Processing batches"):
        batch_results = bertscore.compute(predictions=hypotheses_batch,
                                          references=references_batch,
                                          lang="en")
        bert_precision.extend(list(batch_results['precision']))
        bert_recall.extend(list(batch_results['recall']))
        bert_f1.extend(list(batch_results['f1']))

    bert_precision_array = np.array(bert_precision).reshape(-1, num_return_sequences)
    best_bert_precision = np.max(bert_precision_array, axis=1)

    bert_recall_array = np.array(bert_recall).reshape(-1, num_return_sequences)
    best_bert_recall = np.max(bert_recall_array, axis=1)

    bert_f1_array = np.array(bert_f1).reshape(-1, num_return_sequences)
    best_bert_f1 = np.max(bert_f1_array, axis=1)

    # Write results; the output directories are created first so a fresh checkout does not fail.
    generated_dir = os.path.join("diverse_decoding_results", "generated", lang)
    os.makedirs(generated_dir, exist_ok=True)
    run_tag = f"beam_search_results_beam_{num_return_sequences}"
    with open(os.path.join(generated_dir, f"{run_tag}_hypotheses.json"), 'w') as hyp_fw:
        json.dump(hypotheses, hyp_fw, indent=4)
    with open(os.path.join(generated_dir, f"{run_tag}_references.json"), 'w') as ref_fw:
        json.dump(references, ref_fw, indent=4)

    file_name = "beam_search_results.txt"
    with open(os.path.join("diverse_decoding_results", file_name), "a") as f:
        f.write(f"num_return_sequences: {num_return_sequences}, distinct_unigrams_ratio: {average_distinct_unigrams_ratio * 100: .4f}, distinct_bigrams_ratio: {average_distinct_bigrams_ratio * 100: .4f}, self_bleu: {average_self_bleu * 100: .4f}, self_bert_precision: {average_self_bert_precision * 100: .4f}, self_bert_recall: {average_self_bert_recall * 100: .4f}, self_bert_f1: {average_self_bert_f1 * 100: .4f}, average_bleu: {np.mean(list(ind_bleu.values())) * 100: .4f}, average_rouge: {np.mean(list(ind_rouge.values())) * 100: .4f}, average_meteor: {np.mean(list(ind_meteor)) * 100: .4f}, oracle_bleu: {np.mean(oracle_bleu) * 100: .4f}, oracle_rouge: {np.mean(oracle_rouge) * 100: .4f}, oracle_meteor: {np.mean(oracle_meteor) * 100: .4f}, oracle_bert_precision: {np.mean(best_bert_precision) * 100: .4f}, oracle_bert_recall: {np.mean(best_bert_recall) * 100: .4f}, oracle_bert_f1: {np.mean(best_bert_f1) * 100: .4f}\n")

    return hypotheses, references

In [None]:
# Evaluate standard beam search on the test split for 10 and for 20 returned summaries
# per source; the beam width is set equal to the number of returned sequences.
# After the loop, `hypotheses` / `references` hold the results of the last setting only.
num_return_sequences_list = [10, 20]
for num_return_sequences in num_return_sequences_list:
    hypotheses, references = distinct_with_beam_search(model,
                                                       device,
                                                       test_source_dir,
                                                       test_target_dir,
                                                       tokenizer,
                                                       batch_size=4,
                                                       beam_size=num_return_sequences,
                                                       num_return_sequences=num_return_sequences)

100%|██████████| 4626/4626 [26:50<00:00,  2.87it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Oracle scores, bleu:  45.000365738693944  rouge-l:  61.78330220564606  meteor:  42.34021246836098


Processing batches: 100%|██████████| 37/37 [02:20<00:00,  3.79s/it]
100%|██████████| 4626/4626 [42:25<00:00,  1.82it/s] 
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def distinct_with_sampling(model,
                           device,
                           src_dir,
                           tgt_dir,
                           tokenizer,
                           batch_size=1,
                           num_sample=100,
                           temperature=1.0,
                           top_k=50,
                           top_p=1,
                           num_distinct_summary_list=(4, 8, 12, 16, 20)):
    """Sample summaries for each source line and score their quality and diversity.

    For every line of ``src_dir`` the model draws ``num_sample`` sampled
    summaries. For each budget in ``num_distinct_summary_list`` the distinct
    samples of every input (in the order they were generated) are capped at
    that budget and evaluated:

    * diversity inside each input's set: distinct-1 / distinct-2 ratios,
      self-BLEU and self-BERTScore;
    * quality against the matching line of ``tgt_dir``: per-hypothesis and
      oracle (best hypothesis per input) BLEU, ROUGE, METEOR and BERTScore.

    Side effects, per budget: writes the hypotheses and references as JSON under
    ``diverse_decoding_results/generated/<lang>/`` (created if missing) and
    appends one summary line to ``diverse_decoding_results/sampling_results.txt``.
    ``lang`` is read from the notebook's global scope.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        device: Device the tokenized inputs are moved to.
        src_dir: Path to the text file holding one source snippet per line.
        tgt_dir: Path to the text file holding one reference summary per line.
        tokenizer: Tokenizer used to encode the sources and decode the samples.
        batch_size: Number of source snippets passed to ``generate`` at once.
        num_sample: Number of sampled sequences requested per source snippet.
        temperature: Sampling temperature passed to ``generate``.
        top_k: Top-k cutoff passed to ``generate``.
        top_p: Nucleus (top-p) cutoff passed to ``generate``.
        num_distinct_summary_list: Budgets of distinct summaries kept per input.

    Returns:
        A list with one entry per budget; each entry is a list (one item per
        input) of the normalized distinct summaries kept for that input.

    Raises:
        ValueError: If the source and target files differ in number of lines.
    """
    def read_lines(path):
        # Context manager so the handle is closed even if decoding fails.
        with open(path, encoding="utf-8") as handle:
            return [line.rstrip() for line in handle]

    def replace_breaks(text):
        # Each run of newlines, tabs or carriage returns becomes one space.
        return re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", text)

    def normalize_hypothesis(summary):
        # Drops the final character and appends " ." -- presumably to detach a
        # trailing period as its own token. NOTE(review): a summary that does
        # not end in "." loses its last character; confirm this is acceptable.
        return replace_breaks(summary.strip().lower()[:-1] + ' .')

    def normalize_reference(target):
        return replace_breaks(target.strip().lower())

    def oracle_scores(scores, group_sizes):
        # ``scores`` is flat, laid out input by input; take the best score of
        # each input's group. An empty group scores 0.
        remaining = iter(scores)
        return np.array([max(islice(remaining, size), default=0.0)
                         for size in group_sizes])

    model.eval()
    bertscore = load("bertscore")
    source_codes = read_lines(src_dir)
    targets = read_lines(tgt_dir)
    if len(source_codes) != len(targets):
        raise ValueError(
            f"source/target line count mismatch: {len(source_codes)} sources "
            f"vs {len(targets)} targets")

    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        source = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = source["input_ids"].to(device)
        attention_mask = source["attention_mask"].to(device)
        # top_k / top_p are forwarded so that the values logged to the results
        # file are the ones the sampler actually used.
        generated_ids = model.generate(input_ids,
                                       attention_mask=attention_mask,
                                       max_length=80,
                                       do_sample=True,
                                       top_k=top_k,
                                       top_p=top_p,
                                       temperature=temperature,
                                       num_return_sequences=num_sample)
        all_summaries.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

    # One group of ``num_sample`` samples per input. dict.fromkeys removes
    # duplicates while keeping generation order, so the truncation below is
    # reproducible (a set would keep an arbitrary subset) and smaller budgets
    # are prefixes of larger ones. Computed once: it is independent of the budget.
    unique_per_input = [list(dict.fromkeys(all_summaries[i:i+num_sample]))
                        for i in range(0, len(all_summaries), num_sample)]

    output_dir = os.path.join("diverse_decoding_results", "generated", lang)
    os.makedirs(output_dir, exist_ok=True)
    # Batch size for BERTScore only; kept separate from the generation batch size.
    bert_batch_size = 5120

    predictions = []
    for num_distinct_summary in num_distinct_summary_list:
        # keep at most num_distinct_summary distinct summaries per input
        kept_per_input = [unique[:num_distinct_summary] for unique in unique_per_input]
        distinct_guesses = [len(kept) for kept in kept_per_input]
        separated_hypotheses = [[normalize_hypothesis(summary) for summary in kept]
                                for kept in kept_per_input]
        predictions.append(separated_hypotheses)

        # Diversity metrics are computed within each input's own set.
        all_distinct_unigrams_ratio = [distinct_n_corpus_level(preds, n=1) for preds in separated_hypotheses]
        all_distinct_bigrams_ratio = [distinct_n_corpus_level(preds, n=2) for preds in separated_hypotheses]
        all_self_bleu = [self_bleu_score(preds) for preds in separated_hypotheses]
        all_self_bert_precision = []
        all_self_bert_recall = []
        all_self_bert_f1 = []
        for preds in separated_hypotheses:
            self_precision, self_recall, self_f1 = self_bert_score(preds, bertscore)
            all_self_bert_precision.append(self_precision)
            all_self_bert_recall.append(self_recall)
            all_self_bert_f1.append(self_f1)
        average_distinct_unigrams_ratio = np.mean(all_distinct_unigrams_ratio)
        average_distinct_bigrams_ratio = np.mean(all_distinct_bigrams_ratio)
        average_self_bleu = np.mean(all_self_bleu)
        average_self_bert_precision = np.mean(all_self_bert_precision)
        average_self_bert_recall = np.mean(all_self_bert_recall)
        average_self_bert_f1 = np.mean(all_self_bert_f1)

        # Flat id -> [text] mappings; each reference is repeated once per kept
        # hypothesis of its input so both dicts share the same keys.
        hypotheses = dict(enumerate([[hyp] for preds in separated_hypotheses for hyp in preds]))
        references = dict(enumerate([[normalize_reference(target)]
                                     for target, count in zip(targets, distinct_guesses)
                                     for _ in range(count)]))

        # calculate oracle scores
        print("average distinct guesses: ", np.mean(distinct_guesses))
        _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
        oracle_bleu = oracle_scores(ind_bleu.values(), distinct_guesses)

        rouge_calculator = Rouge()
        rouge_l, ind_rouge = rouge_calculator.compute_score(references, hypotheses)
        oracle_rouge = oracle_scores(ind_rouge.values(), distinct_guesses)

        meteor_calculator = Meteor()
        meteor, ind_meteor = meteor_calculator.compute_score(references, hypotheses)
        oracle_meteor = oracle_scores(ind_meteor, distinct_guesses)
        print("Oracle scores, bleu: ", np.mean(oracle_bleu) * 100, " rouge-l: ", np.mean(oracle_rouge) * 100, " meteor: ", np.mean(oracle_meteor) * 100)

        # BERTScore of every hypothesis against its reference, in large batches.
        bert_precision = []
        bert_recall = []
        bert_f1 = []
        dataset = TextDataset(hypotheses, references)
        dataloader = DataLoader(dataset, batch_size=bert_batch_size, shuffle=False)
        for hypotheses_batch, references_batch in tqdm(dataloader, desc="Processing batches"):
            batch_results = bertscore.compute(predictions=hypotheses_batch,
                                              references=references_batch,
                                              lang="en")
            bert_precision.extend(list(batch_results['precision']))
            bert_recall.extend(list(batch_results['recall']))
            bert_f1.extend(list(batch_results['f1']))

        best_bert_precision = oracle_scores(bert_precision, distinct_guesses)
        best_bert_recall = oracle_scores(bert_recall, distinct_guesses)
        best_bert_f1 = oracle_scores(bert_f1, distinct_guesses)

        with open(os.path.join(output_dir, f"sampling_results_temperature_{temperature}_num_distinct_summary_{num_distinct_summary}_hypotheses.json"), 'w') as hyp_fw:
            json.dump(hypotheses, hyp_fw, indent=4)

        with open(os.path.join(output_dir, f"sampling_results_temperature_{temperature}_num_distinct_summary_{num_distinct_summary}_references.json"), 'w') as ref_fw:
            json.dump(references, ref_fw, indent=4)

        file_name = "sampling_results.txt"
        with open(os.path.join("diverse_decoding_results", file_name), "a") as f:
            f.write(f"temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, num_distinct_summary: {num_distinct_summary}, distinct_guesses: {np.mean(distinct_guesses)}, distinct_unigrams_ratio: {average_distinct_unigrams_ratio * 100: .4f}, distinct_bigrams_ratio: {average_distinct_bigrams_ratio * 100: .4f}, self_bleu: {average_self_bleu * 100: .4f}, self_bert_precision: {average_self_bert_precision * 100: .4f}, self_bert_recall: {average_self_bert_recall * 100: .4f}, self_bert_f1: {average_self_bert_f1 * 100: .4f}, average_bleu: {np.mean(list(ind_bleu.values())) * 100: .4f}, average_rouge: {np.mean(list(ind_rouge.values())) * 100: .4f}, average_meteor: {np.mean(list(ind_meteor)) * 100: .4f}, oracle_bleu: {np.mean(oracle_bleu) * 100: .4f}, oracle_rouge: {np.mean(oracle_rouge) * 100: .4f}, oracle_meteor: {np.mean(oracle_meteor) * 100: .4f}, oracle_bert_precision: {np.mean(best_bert_precision) * 100: .4f}, oracle_bert_recall: {np.mean(best_bert_recall) * 100: .4f}, oracle_bert_f1: {np.mean(best_bert_f1) * 100: .4f}\n")

    return predictions


In [None]:
# Sampling run: draw 100 samples per input at each temperature and evaluate
# the first 10 and 20 distinct summaries. top_k / top_p are not passed here,
# so the function's own defaults apply.
for temperature in [1.0]:
    hypotheses = distinct_with_sampling(
        model,
        device,
        src_dir=test_source_dir,
        tgt_dir=test_target_dir,
        tokenizer=tokenizer,
        batch_size=1,
        num_sample=100,
        temperature=temperature,
        num_distinct_summary_list=(10, 20),
    )

In [None]:
def distinct_with_diverse_beam_search(model,
                                      device,
                                      src_dir,
                                      tgt_dir,
                                      tokenizer,
                                      batch_size=16,
                                      beam_size=10,
                                      num_return_sequences=8,
                                      diversity_penalty=10.0,
                                      num_beam_groups=2):
    """Decode with diverse (group) beam search and score quality and diversity.

    For every line of ``src_dir`` the model returns ``num_return_sequences``
    summaries. The function then computes:

    * diversity inside each input's set: distinct-1 / distinct-2 ratios,
      self-BLEU and self-BERTScore;
    * quality against the matching line of ``tgt_dir``: per-hypothesis and
      oracle (best hypothesis per input) BLEU, ROUGE, METEOR and BERTScore.

    Side effects: writes the hypotheses and references as JSON under
    ``diverse_decoding_results/generated/<lang>/`` (created if missing) and
    appends one summary line to
    ``diverse_decoding_results/diverse_beam_search_results.txt``. ``lang`` is
    read from the notebook's global scope. The ``beam_`` field of the JSON
    filenames carries ``num_return_sequences`` (not ``beam_size``); this is kept
    so existing result files keep their names.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        device: Device the tokenized inputs are moved to.
        src_dir: Path to the text file holding one source snippet per line.
        tgt_dir: Path to the text file holding one reference summary per line.
        tokenizer: Tokenizer used to encode the sources and decode the outputs.
        batch_size: Number of source snippets passed to ``generate`` at once.
        beam_size: Total number of beams (``num_beams``).
        num_return_sequences: Number of summaries returned per source snippet.
        diversity_penalty: Diversity penalty passed to ``generate``.
        num_beam_groups: Number of beam groups passed to ``generate``.

    Returns:
        ``(hypotheses, references)``: two dicts keyed by a running index, each
        value a one-element list holding the normalized text. References are
        repeated ``num_return_sequences`` times to line up with the hypotheses.

    Raises:
        ValueError: If the source and target files differ in number of lines,
            or if ``beam_size`` is not divisible by ``num_beam_groups``.
    """
    def read_lines(path):
        # Context manager so the handle is closed even if decoding fails.
        with open(path, encoding="utf-8") as handle:
            return [line.rstrip() for line in handle]

    def replace_breaks(text):
        # Each run of newlines, tabs or carriage returns becomes one space.
        return re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", text)

    def normalize_hypothesis(summary):
        # Drops the final character and appends " ." -- presumably to detach a
        # trailing period as its own token. NOTE(review): a summary that does
        # not end in "." loses its last character; confirm this is acceptable.
        return replace_breaks(summary.strip().lower()[:-1] + ' .')

    def best_of_each_input(scores):
        # Scores are flat with ``num_return_sequences`` consecutive entries per
        # input; reshape to (inputs, candidates) and take the row-wise maximum.
        return np.max(np.array(list(scores)).reshape(-1, num_return_sequences), axis=1)

    if beam_size % num_beam_groups != 0:
        raise ValueError(
            f"beam_size ({beam_size}) must be divisible by "
            f"num_beam_groups ({num_beam_groups})")

    model.eval()
    bertscore = load("bertscore")
    source_codes = read_lines(src_dir)
    targets = read_lines(tgt_dir)
    if len(source_codes) != len(targets):
        raise ValueError(
            f"source/target line count mismatch: {len(source_codes)} sources "
            f"vs {len(targets)} targets")

    separated_hypotheses = []
    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        source = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = source["input_ids"].to(device)
        attention_mask = source["attention_mask"].to(device)
        generated_ids = model.generate(input_ids,
                                       attention_mask=attention_mask,
                                       max_length=100,
                                       diversity_penalty=diversity_penalty,
                                       num_beam_groups=num_beam_groups,
                                       num_beams=beam_size,
                                       num_return_sequences=num_return_sequences)
        summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # A batch holds the candidates of several inputs back to back. Split it
        # per input so the diversity metrics never mix summaries that belong to
        # different source snippets.
        for start in range(0, len(summaries), num_return_sequences):
            candidates = summaries[start:start + num_return_sequences]
            separated_hypotheses.append([normalize_hypothesis(summary) for summary in candidates])
        all_summaries.extend(summaries)

    # Diversity metrics are computed within each input's own candidate set.
    all_distinct_unigrams_ratio = [distinct_n_corpus_level(preds, n=1) for preds in separated_hypotheses]
    all_distinct_bigrams_ratio = [distinct_n_corpus_level(preds, n=2) for preds in separated_hypotheses]
    all_self_bleu = [self_bleu_score(preds) for preds in separated_hypotheses]
    all_self_bert_precision = []
    all_self_bert_recall = []
    all_self_bert_f1 = []
    for preds in separated_hypotheses:
        self_precision, self_recall, self_f1 = self_bert_score(preds, bertscore)
        all_self_bert_precision.append(self_precision)
        all_self_bert_recall.append(self_recall)
        all_self_bert_f1.append(self_f1)
    average_distinct_unigrams_ratio = np.mean(all_distinct_unigrams_ratio)
    average_distinct_bigrams_ratio = np.mean(all_distinct_bigrams_ratio)
    average_self_bleu = np.mean(all_self_bleu)
    average_self_bert_precision = np.mean(all_self_bert_precision)
    average_self_bert_recall = np.mean(all_self_bert_recall)
    average_self_bert_f1 = np.mean(all_self_bert_f1)

    hypotheses = dict(enumerate([[normalize_hypothesis(summary)] for summary in all_summaries]))
    # repeat targets for each generated sequence
    references = dict(enumerate([[replace_breaks(target.strip().lower())]
                                 for target in targets
                                 for _ in range(num_return_sequences)]))

    # calculate oracle scores
    _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
    oracle_bleu = best_of_each_input(ind_bleu.values())
    rouge_calculator = Rouge()
    rouge_l, ind_rouge = rouge_calculator.compute_score(references, hypotheses)
    oracle_rouge = best_of_each_input(ind_rouge.values())
    meteor_calculator = Meteor()
    meteor, ind_meteor = meteor_calculator.compute_score(references, hypotheses)
    oracle_meteor = best_of_each_input(ind_meteor)
    print("Oracle scores, bleu: ", np.mean(oracle_bleu) * 100, " rouge-l: ", np.mean(oracle_rouge) * 100, " meteor: ", np.mean(oracle_meteor) * 100)

    # BERTScore of every hypothesis against its reference, in large batches.
    bert_precision = []
    bert_recall = []
    bert_f1 = []
    # Batch size for BERTScore only; kept separate from the generation batch size.
    bert_batch_size = 5120
    dataset = TextDataset(hypotheses, references)
    dataloader = DataLoader(dataset, batch_size=bert_batch_size, shuffle=False)
    for hypotheses_batch, references_batch in tqdm(dataloader, desc="Processing batches"):
        batch_results = bertscore.compute(predictions=hypotheses_batch,
                                          references=references_batch,
                                          lang="en")
        bert_precision.extend(list(batch_results['precision']))
        bert_recall.extend(list(batch_results['recall']))
        bert_f1.extend(list(batch_results['f1']))

    best_bert_precision = best_of_each_input(bert_precision)
    best_bert_recall = best_of_each_input(bert_recall)
    best_bert_f1 = best_of_each_input(bert_f1)

    output_dir = os.path.join("diverse_decoding_results", "generated", lang)
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, f"diverse_beam_search_results_beam_{num_return_sequences}_diversity_penalty_{diversity_penalty}_num_beam_groups_{num_beam_groups}_hypotheses.json"), 'w') as hyp_fw:
        json.dump(hypotheses, hyp_fw, indent=4)

    with open(os.path.join(output_dir, f"diverse_beam_search_results_beam_{num_return_sequences}_diversity_penalty_{diversity_penalty}_num_beam_groups_{num_beam_groups}_references.json"), 'w') as ref_fw:
        json.dump(references, ref_fw, indent=4)

    file_name = "diverse_beam_search_results.txt"

    with open(os.path.join("diverse_decoding_results", file_name), "a") as f:
        f.write(f"num_return_sequences: {num_return_sequences}, num_beam_groups: {num_beam_groups}, beam_size: {beam_size}, diversity_penalty: {diversity_penalty}, distinct_unigrams_ratio: {average_distinct_unigrams_ratio * 100: .4f}, distinct_bigrams_ratio: {average_distinct_bigrams_ratio * 100: .4f}, self_bleu: {average_self_bleu * 100: .4f}, self_bert_precision: {average_self_bert_precision * 100: .4f}, self_bert_recall: {average_self_bert_recall * 100: .4f}, self_bert_f1: {average_self_bert_f1 * 100: .4f}, average_bleu: {np.mean(list(ind_bleu.values())) * 100: .4f}, average_rouge: {np.mean(list(ind_rouge.values())) * 100: .4f}, average_meteor: {np.mean(list(ind_meteor)) * 100: .4f}, oracle_bleu: {np.mean(oracle_bleu) * 100: .4f}, oracle_rouge: {np.mean(oracle_rouge) * 100: .4f}, oracle_meteor: {np.mean(oracle_meteor) * 100: .4f}, oracle_bert_precision: {np.mean(best_bert_precision) * 100: .4f}, oracle_bert_recall: {np.mean(best_bert_recall) * 100: .4f}, oracle_bert_f1: {np.mean(best_bert_f1) * 100: .4f}\n")

    return hypotheses, references

In [None]:
# Diverse beam search sweep: every (penalty, candidate-set size) combination
# gets its own evaluation pass. The beam width is tied to the number of
# returned sequences, and inputs are decoded one at a time.
diversity_penalty_list = [25.]
num_return_sequences_list = [10, 20]
for diversity_penalty in diversity_penalty_list:
    for num_return_sequences in num_return_sequences_list:
        print("diversity penalty: ", diversity_penalty, " num_return_sequences: ", num_return_sequences)
        # Only the outputs of the last combination stay bound to these names.
        hypotheses, references = distinct_with_diverse_beam_search(
            model,
            device,
            src_dir=test_source_dir,
            tgt_dir=test_target_dir,
            tokenizer=tokenizer,
            batch_size=1,
            beam_size=num_return_sequences,
            num_return_sequences=num_return_sequences,
            diversity_penalty=diversity_penalty,
        )