In [None]:
pip install huggingface

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1


In [None]:
pip install datasets transformers evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5

In [None]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

# Define the data directory inside your Google Drive
# data_dir = "/content/drive/My Drive/Colab Notebooks/corpora"
data_dir = "/content/drive/My Drive/266 Data Project/corpora"

Mounted at /content/drive


In [None]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset, load_dataset
import torch.nn.functional as F
import os
import json
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

def load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens):
    # Load tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(special_tokens))
    })

    # Load base model
    model = MarianMTModel.from_pretrained(model_name)

    # Create custom model, CustomMarianMTModel will create a BiomedicalEncoder object in init()
    custom_model = CustomMarianMTModel(
        config=model.config,
        hidden_size=hidden_size,
        special_token_size=len(special_tokens),
    )


    # Resize token embeddings
    custom_model.resize_token_embeddings(len(tokenizer))

    return tokenizer, custom_model

class BiomedicalEncoder(nn.Module):
    def __init__(self, hidden_size, special_token_size):
        super(BiomedicalEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Adjust the linear layer to match input dimensions
        self.linear = nn.Linear(special_token_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, entity_embeddings):
        # Reshape entity embeddings if necessary
        original_shape = entity_embeddings.shape

        # Flatten the tensor if it has more than 2 dimensions
        if len(original_shape) > 2:
            entity_embeddings = entity_embeddings.view(-1, original_shape[-1])

        # Ensure the input matches the expected dimension
        if entity_embeddings.size(1) != self.special_token_size:
            # If the input doesn't match, pad or truncate
            if entity_embeddings.size(1) < self.special_token_size:
                # Pad with zeros
                padding = torch.zeros(
                    entity_embeddings.size(0),
                    self.special_token_size - entity_embeddings.size(1),
                    device=entity_embeddings.device
                )
                entity_embeddings = torch.cat([entity_embeddings, padding], dim=1)
            else:
                # Truncate
                entity_embeddings = entity_embeddings[:, :self.special_token_size]

        # Apply linear transformation and activation
        encoded = self.linear(entity_embeddings)
        return self.activation(encoded)

class CustomMarianMTModel(MarianMTModel):
    def __init__(self, config, hidden_size=512, special_token_size=206573, biomedicalEncoder=None):
        super().__init__(config)
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Initialize biomedical encoder within the model
        if biomedicalEncoder == None:
            self.biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
        else:
            self.biomedical_encoder = biomedicalEncoder

        # Entity embedding for special tokens
        self.entity_embedding = nn.Embedding(special_token_size + 1, hidden_size)  # +1 for padding token

        # Projection layer to match vocabulary size
        self.entity_projection = nn.Linear(hidden_size, config.vocab_size)

    def save_custom(self, save_directory):
        # Create save directory if it doesn't exist
        os.makedirs(save_directory, exist_ok=True)

        model_save_path = os.path.join(save_directory, "model")
        print(model_save_path)
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        os.makedirs(model_save_path, exist_ok=True)
        os.makedirs(tokenizer_save_path, exist_ok=True)

        # Save the model and its configuration
        self.save_pretrained(model_save_path)

        # Save the biomedical encoder's state_dict
        torch.save(self.biomedical_encoder.state_dict(), os.path.join(model_save_path, "biomedical_encoder.pth"))

        # Save custom attributes in a JSON file
        custom_config = {
            "hidden_size": self.hidden_size,
            "special_token_size": self.special_token_size,
        }
        with open(os.path.join(model_save_path, "custom_config.json"), "w") as f:
            json.dump(custom_config, f)

        if tokenizer is not None:
            tokenizer.save_pretrained(tokenizer_save_path)

    @classmethod
    def from_custom(cls, save_directory):
        model_save_path = os.path.join(save_directory, "model")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load custom attributes from JSON
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Load base model configuration
        model = MarianMTModel.from_pretrained(model_save_path)

        # Create a new CustomMarianMTModel with the loaded configuration
        new_model = cls(
            config=model.config,
            hidden_size=custom_config["hidden_size"],
            special_token_size=custom_config["special_token_size"]
        )

        # Load the biomedical encoder state dict
        biomedical_encoder_path = os.path.join(model_save_path, "biomedical_encoder.pth")
        biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)
        new_model.biomedical_encoder.load_state_dict(biomedical_encoder_state_dict)

        # Load the main model weights
        state_dict = model.state_dict()
        new_model_state_dict = new_model.state_dict()

        # Update the state dictionary, keeping the biomedical encoder weights
        for key, value in state_dict.items():
            if key in new_model_state_dict:
                new_model_state_dict[key] = value

        new_model.load_state_dict(new_model_state_dict, strict=False)

        # Load tokenizer
        tokenizer = MarianTokenizer.from_pretrained(tokenizer_save_path)

        return new_model, tokenizer

    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
        # Perform base MarianMT forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

        # Process entity information if provided
        if entity_ids is not None:
            try:
                # Ensure entity_ids is a tensor with 2 dimensions
                if len(entity_ids.shape) == 1:
                    entity_ids = entity_ids.unsqueeze(0)

                # Get batch size, sequence length, and vocab size from outputs
                batch_size = outputs.logits.size(0)
                sequence_length = outputs.logits.size(1)
                vocab_size = outputs.logits.size(2)

                # Ensure entity_ids is on the same device as outputs.logits
                entity_ids = entity_ids.to(outputs.logits.device)

                # Limit entity_ids to current batch size
                entity_ids = entity_ids[:batch_size]

                if torch.any(entity_ids >= self.entity_embedding.num_embeddings):
                    print(f"Invalid entity IDs detected: {entity_ids}")
                    raise ValueError("Entity IDs are out of bounds for the embedding layer")


                # Get embeddings for entity special tokens
                entity_embeddings = self.entity_embedding(entity_ids)

                # Ensure embeddings are on the correct device
                entity_embeddings = entity_embeddings.to(outputs.logits.device)

                # Process through biomedical encoder
                original_shape = entity_embeddings.shape
                entity_features = self.biomedical_encoder(entity_embeddings.view(-1, original_shape[-1]))

                # Reshape back to original batch and entity dimension
                entity_features = entity_features.view(original_shape[0], original_shape[1], -1)

                # Project entity features to match logits dimensionality
                entity_logits = self.entity_projection(entity_features)

                # Ensure logits are on the correct device
                entity_logits = entity_logits.to(outputs.logits.device)

                # Create a tensor of zeros with the same shape as outputs.logits
                expanded_entity_logits = torch.zeros_like(outputs.logits)

                # Adjust logits shape to match the entity length
                min_entities = min(entity_logits.size(1), expanded_entity_logits.size(1))
                min_vocab = min(entity_logits.size(2), expanded_entity_logits.size(2))

                expanded_entity_logits[:, :min_entities, :min_vocab] = entity_logits[:, :min_entities, :min_vocab]

                # Add entity-based logits to original logits
                outputs.logits = outputs.logits + expanded_entity_logits

            except Exception as e:
                print(f"Error in forward method: {e}")
                raise
        torch.cuda.synchronize()
        return outputs



def prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english", max_entities=5):
    def preprocess_function(examples):
        # Ensure inputs are lists
        src_sentences = examples[src_lang]
        tgt_sentences = examples[tgt_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])

        processed_src = []
        processed_entities = []

        for sentence, entities in zip(src_sentences, entities_list):
            # Ensure sentence is a string and remove existing spaces
            sentence = str(sentence).replace(" ", "")

            # Add special tokens for entities
            for entity in entities:
                sentence = sentence.replace(entity, f"<<{entity}>>")

            processed_src.append(sentence)

            # Convert entities to token IDs
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>")
                for entity in entities
            ]

            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))

            # Debugging: Log entity ids and padding
            # print(f"Entity IDs (after padding/truncation): {entity_ids}")

            processed_entities.append(entity_ids)

        # Tokenize source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Tokenize target sentences
        labels = tokenizer(
            tgt_sentences,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Add labels to model inputs
        model_inputs["labels"] = labels["input_ids"]

        # Convert entity_ids to tensor
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)

        return model_inputs

    # Apply preprocessing to the dataset
    processed_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    return processed_dataset

def fine_tune_custom_model(custom_model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        save_safetensors=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
        fp16=False  # Disable mixed precision
    )

    # Create trainer
    trainer = Seq2SeqTrainer(
        model=custom_model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()


In [None]:
model_save_path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/before-training"
# Loading custom model
custom_model, tokenizer = CustomMarianMTModel.from_custom(model_save_path)

  biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)


In [None]:
tokenized_dataset = load_dataset("parquet", data_files="/content/drive/MyDrive/266 Data Project/corpora/nejm/zh-en-tokenized-train-working-model.parquet")["train"]

output_dir = "/content/drive/MyDrive/266 Data Project/corpora/nejm/custom_models/"

# Before training
# Move all components to GPU before training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)
# tokenizer = tokenizer.to(device)

# Add explicit error checking
torch.cuda.empty_cache()  # Clear GPU memory before training

# Fine-tune the model
fine_tune_custom_model(
    custom_model,
    tokenizer,
    tokenized_dataset,
    output_dir
)

Generating train split: 0 examples [00:00, ? examples/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,1.4312,1.321974
2,1.1628,1.096767
3,1.1025,1.036248


In [None]:
# Saving custom model
model_save_path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/after-training"
custom_model.save_custom(model_save_path)


/content/drive/MyDrive/266 Data Project/corpora/nejm/after-training/model


In [None]:
from evaluate import load
import torch

def add_special_tokens(tokenizer, entities):
    """
    Adds new entity tokens to the tokenizer if they are not already present.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to update.
        entities (list of str): List of entity names to add as special tokens.

    Returns:
        None
    """
    special_tokens = [f"<<{entity}>>" for entity in entities]
    added_tokens = [token for token in special_tokens if token not in tokenizer.get_vocab()]
    if added_tokens:
        tokenizer.add_special_tokens({'additional_special_tokens': added_tokens})
        print(f"Added new special tokens: {added_tokens}")


def translate_tokenized_dataset(model, tokenizer, tokenized_dataset, batch_size=32):
    translations = []

    model.eval()

    for i in range(0, len(tokenized_dataset), batch_size):
        # Extract batch data
        input_ids = tokenized_dataset["input_ids"][i:i + batch_size]
        attention_mask = tokenized_dataset["attention_mask"][i:i + batch_size]
        entity_ids = tokenized_dataset["entity_ids"][i:i + batch_size]

        # Convert to tensors with explicit type and device handling
        input_ids = torch.tensor(input_ids, dtype=torch.long).to(model.device)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(model.device)
        entity_ids = torch.tensor(entity_ids, dtype=torch.long).to(model.device)

        # Debug print statements
        print(f"Batch {i//batch_size + 1}:")
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention Mask shape: {attention_mask.shape}")
        print(f"Entity IDs shape: {entity_ids.shape}")
        print(f"Entity IDs min: {entity_ids.min()}, max: {entity_ids.max()}")
        print(f"Model entity embedding size: {model.entity_embedding.num_embeddings}")

        # Validate entity_ids before generation
        try:
            # Check if all entity IDs are within the valid range
            assert torch.all(entity_ids >= 0), "Negative entity IDs found"
            assert torch.all(entity_ids < model.entity_embedding.num_embeddings), "Out-of-bound entity IDs"
        except AssertionError as e:
            print(f"Entity ID validation error: {e}")
            # Skip this batch or handle the error as needed
            continue

        # Generate translations
        try:
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    entity_ids=entity_ids
                )
        except Exception as e:
            print(f"Generation error in batch {i//batch_size + 1}: {e}")
            continue

        # Decode translations
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)

    return translations


# Define evaluation metrics
def evaluate_model_metrics(predictions, references, save_path=None):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)

    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)

    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)

    # Extract summary statistics for BERTScore
    bertscore_summary = {
        "mean": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
        "median": sorted(bertscore_result["f1"])[len(bertscore_result["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))**2 for x in bertscore_result["f1"]) / len(bertscore_result["f1"]))**0.5
    }

    # Consolidate results
    results = {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_summary,
        "TER": ter_result,
    }

    return results

In [None]:
def preprocess_function(examples, tokenizer, src_lang="chinese", max_entities=5):
    src_sentences = examples[src_lang]
    entities_list = examples.get("entities", [[] for _ in src_sentences])

    processed_src = []
    processed_entities = []

    for sentence, entities in zip(src_sentences, entities_list):
        # Process source sentence (add markers for entities in vocabulary)
        for entity in entities:
            if f"<<{entity}>>" in tokenizer.get_vocab():
                sentence = sentence.replace(entity, f"<<{entity}>>")
        processed_src.append(sentence)

        # Convert entities to token IDs (if in vocab)
        entity_ids = [
            tokenizer.convert_tokens_to_ids(f"<<{entity}>>")
            if f"<<{entity}>>" in tokenizer.get_vocab() else 0
            for entity in entities
        ]

        # Pad or truncate entity_ids
        entity_ids = entity_ids[:max_entities]
        entity_ids += [0] * (max_entities - len(entity_ids))  # Pad with zeros
        processed_entities.append(entity_ids)

    # Tokenize the processed source sentences
    model_inputs = tokenizer(
        processed_src,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    # Add entity_ids as a tensor to the inputs
    model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)

    return model_inputs

def preprocess_test_data(test_dataset, tokenizer, src_lang="chinese", max_entities=5):
    """
    Preprocess test data to tokenize inputs and add entity_ids for entity-based embeddings.
    """
    # Wrap preprocess_function with fixed arguments
    def wrapped_preprocess_function(examples):
        return preprocess_function(
            examples, tokenizer=tokenizer, src_lang=src_lang, max_entities=max_entities
        )

    # Apply the preprocessing function to the test dataset
    processed_test_dataset = test_dataset.map(
        wrapped_preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
    )

    return processed_test_dataset


def evaluate_model(model, tokenized_test_dataset, tokenizer, batch_size=16):
    """
    Evaluate the model on the tokenized test dataset.
    """
    # Prepare DataLoader for test data
    test_loader = torch.utils.data.DataLoader(
        tokenized_test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: tokenizer.pad(batch, return_tensors="pt")
    )

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            # Move inputs to GPU if available
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}

            # Generate predictions
            outputs = model.generate(**inputs)
            predictions.extend(outputs)

    return predictions


In [None]:
# Load your test dataset
test_dataset = load_dataset("parquet", data_files={"test": "/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_test_entities.parquet"})["test"]

# Preprocess and tokenize test data
tokenized_test_dataset = preprocess_test_data(test_dataset, tokenizer)

# Evaluate model on test data
predictions = evaluate_model(custom_model, tokenized_test_dataset, tokenizer)
predictions[0]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]



tensor([65000,     7,  1813,     7, 20212,    86, 15398,     4,     7,  8528,
         1552,   748,     7,     2,     7,  1813,     7,  4435, 21845,     0],
       device='cuda:0')

In [None]:
pd.DataFrame(data={"predicted_english_tokens": [x.cpu().numpy() for x in predictions]}).to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/post_training_predictions.parquet")

In [None]:
# Decode predictions to text
decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
print("Predictions:", decoded_predictions[0:5])

Predictions: ['an interaction of protein , an immu', 'efficacy antibody therapy with an', 'safety profile of antibody @-@', 'in this phase 1 trial , phase 1', 'the primary end point was determined by the']


In [None]:
pd.DataFrame(data={"predicted_english": decoded_predictions}).to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/post_training_predictions_detokenized.parquet")

In [None]:
results = evaluate_model_metrics(decoded_predictions, test_dataset["english"])
results

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': {'bleu': 0.002118818585985407,
  'precisions': [0.4381300219138057,
   0.16612012426648257,
   0.0851063829787234,
   0.045742814734853594],
  'brevity_penalty': 0.0163311288042505,
  'length_ratio': 0.19551556698086262,
  'translation_length': 13690,
  'reference_length': 70020},
 'ROUGE': {'rouge1': 0.1560821290834139,
  'rouge2': 0.04536898385243797,
  'rougeL': 0.14204262472165466,
  'rougeLsum': 0.14213386198079658},
 'BERTScore': {'mean': 0.8180702171305267,
  'median': 0.8156791925430298,
  'std': 0.027420137574809416},
 'TER': {'score': 92.98824531257128,
  'num_edits': 61150,
  'ref_length': 65761.0}}

In [None]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.0.0 sacrebleu-2.4.3


In [None]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=3a821853eb6d5d97cda169be67a5529f46a6c4f0f2a7881fa49a73598be9274f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
temp = tokenizer.convert_ids_to_tokens(predictions[0])
temp

['<pad>',
 '▁',
 'an',
 '▁',
 'inter',
 'a',
 'ction',
 '▁of',
 '▁',
 'pro',
 'te',
 'in',
 '▁',
 ',',
 '▁',
 'an',
 '▁',
 'im',
 'mu',
 '</s>']