In [1]:
pip install huggingface rouge_score bert_score sacrebleu

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m62.1 kB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (1

In [2]:
pip install datasets transformers evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [3]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

# Define the data directory inside your Google Drive
# data_dir = "/content/drive/My Drive/Colab Notebooks/corpora"
data_dir = "/content/drive/My Drive/266 Data Project/corpora"

Mounted at /content/drive


In [4]:
import os

# Set the environment variable
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Import PyTorch after setting the environment variable
import torch

# Optionally check if the variable is set correctly
print("PYTORCH_CUDA_ALLOC_CONF:", os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))



PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True


In [5]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset, load_dataset
import torch.nn.functional as F
import os
import json
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

def load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens):
    # Load tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(special_tokens))
    })

    # Load base model
    model = MarianMTModel.from_pretrained(model_name)

    # Create custom model, CustomMarianMTModel will create a BiomedicalEncoder object in init()
    custom_model = CustomMarianMTModel(
        config=model.config,
        hidden_size=hidden_size,
        special_token_size=len(special_tokens),
    )

    # Resize token embeddings
    custom_model.resize_token_embeddings(len(tokenizer))

    return tokenizer, custom_model

class BiomedicalEncoder(nn.Module):
    def __init__(self, hidden_size, special_token_size):
        super(BiomedicalEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Adjust the linear layer to match input dimensions
        self.linear = nn.Linear(special_token_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, entity_embeddings):
        # Reshape entity embeddings if necessary
        original_shape = entity_embeddings.shape

        # Flatten the tensor if it has more than 2 dimensions
        if len(original_shape) > 2:
            entity_embeddings = entity_embeddings.view(-1, original_shape[-1])

        # Ensure the input matches the expected dimension
        if entity_embeddings.size(1) != self.special_token_size:
            # If the input doesn't match, pad or truncate
            if entity_embeddings.size(1) < self.special_token_size:
                # Pad with zeros
                padding = torch.zeros(
                    entity_embeddings.size(0),
                    self.special_token_size - entity_embeddings.size(1),
                    device=entity_embeddings.device
                )
                entity_embeddings = torch.cat([entity_embeddings, padding], dim=1)
            else:
                # Truncate
                entity_embeddings = entity_embeddings[:, :self.special_token_size]

        # Apply linear transformation and activation
        encoded = self.linear(entity_embeddings)
        return self.activation(encoded)

class CustomMarianMTModel(MarianMTModel):
    def __init__(self, config, hidden_size=512, special_token_size=206573, biomedicalEncoder=None):
        super().__init__(config)
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Initialize biomedical encoder
        if biomedicalEncoder is None:
            self.biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
        else:
            self.biomedical_encoder = biomedicalEncoder

        # Entity embedding layer
        self.entity_embedding = nn.Embedding(special_token_size + 1, hidden_size)  # +1 for padding token

    def save_custom(self, save_directory):
        # Save the model and its configuration
        model_save_path = os.path.join(save_directory, "model")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")
        os.makedirs(model_save_path, exist_ok=True)
        os.makedirs(tokenizer_save_path, exist_ok=True)

        self.save_pretrained(model_save_path)
        torch.save(self.biomedical_encoder.state_dict(), os.path.join(model_save_path, "biomedical_encoder.pth"))

        # Save custom config in JSON
        custom_config = {
            "hidden_size": self.hidden_size,
            "special_token_size": self.special_token_size,
        }
        with open(os.path.join(model_save_path, "custom_config.json"), "w") as f:
            json.dump(custom_config, f)

    @classmethod
    def from_custom(cls, save_directory):
        model_save_path = os.path.join(save_directory, "model")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load custom attributes from JSON
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Load base model and tokenizer
        model = MarianMTModel.from_pretrained(model_save_path)
        new_model = cls(config=model.config, hidden_size=custom_config["hidden_size"], special_token_size=custom_config["special_token_size"])
        biomedical_encoder_path = os.path.join(model_save_path, "biomedical_encoder.pth")
        biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)
        new_model.biomedical_encoder.load_state_dict(biomedical_encoder_state_dict)

        # Load the main model weights
        state_dict = model.state_dict()
        new_model_state_dict = new_model.state_dict()
        for key, value in state_dict.items():
            if key in new_model_state_dict:
                new_model_state_dict[key] = value
        new_model.load_state_dict(new_model_state_dict, strict=False)

        # Load tokenizer
        tokenizer = MarianTokenizer.from_pretrained(tokenizer_save_path)
        return new_model, tokenizer

    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
      # Get token embeddings (from MarianMTModel's embeddings)
      token_embeddings = self.model.encoder.embed_tokens(input_ids)

      # Manually align entity embeddings with token embeddings
      if entity_ids is not None:
          entity_embeddings = self.entity_embedding(entity_ids)  # (batch_size, max_entities, hidden_size)
          entity_features = self.biomedical_encoder(entity_embeddings)  # (batch_size, max_entities, hidden_size)

          # Ensure entity_features match the sequence length of token embeddings
          batch_size, seq_len, _ = token_embeddings.size()
          if entity_features.size(1) != seq_len:
              # Adjust entity features to match sequence length (e.g., by repeating or slicing)
              entity_features = entity_features[:, :seq_len, :]  # Truncate to match sequence length

          # Add the processed entity embeddings to the token embeddings
          token_embeddings += entity_features

      # Forward the augmented embeddings to the encoder
      encoder_outputs = self.model.encoder(
          inputs_embeds=token_embeddings,
          attention_mask=attention_mask,
          return_dict=True
      )

      # Pass encoder outputs to the decoder
      outputs = self.model.decoder(
          input_ids=None,
          attention_mask=attention_mask,
          encoder_hidden_states=encoder_outputs.last_hidden_state,
          encoder_attention_mask=attention_mask,
          labels=labels,
          return_dict=True
      )

      return outputs


def preprocess_function(examples, tokenizer, max_entities=5, src_lang="chinese", tgt_lang="english"):
    # Extract sentences and entities
    src_sentences = examples[src_lang]
    tgt_sentences = examples[tgt_lang]

    # Get entities or assign empty lists if not provided
    entities_list = examples.get("entities", [None for _ in src_sentences])  # None if entities are missing

    # Tokenize source and target sentences
    model_inputs = tokenizer(
        src_sentences,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    labels = tokenizer(
        tgt_sentences,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels"][model_inputs["labels"] == tokenizer.pad_token_id] = -100

    # Process the entities for each sentence
    processed_entities = []
    for entities in entities_list:
        if entities is None:
            # If no entities, pad with zeros
            processed_entities.append([0] * max_entities)
        else:
            # Instead of aligning the entities manually, just get the entity IDs
            entity_ids = []
            for entity in entities:
                # Convert each entity to its corresponding token ID (special token handling)
                entity_token_ids = tokenizer.convert_tokens_to_ids([entity])
                entity_ids.extend(entity_token_ids)

            # Pad or truncate entity list to match max_entities
            if len(entity_ids) < max_entities:
                # Pad with zeros if fewer than max_entities
                entity_ids += [0] * (max_entities - len(entity_ids))
            else:
                # Truncate if more than max_entities
                entity_ids = entity_ids[:max_entities]

            processed_entities.append(entity_ids)

    # Add entity IDs to the model inputs
    model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)

    return model_inputs

def prepare_dataset(dataset, tokenizer, max_entities=5, src_lang="chinese", tgt_lang="english"):
    # Apply the preprocessing function with batch processing and parallelism
    processed_dataset = dataset.map(
        lambda examples: preprocess_function(examples, tokenizer, max_entities, src_lang, tgt_lang),
        batched=True,
        remove_columns=dataset.column_names
    )

    return processed_dataset



def fine_tune_model(model, tokenizer, dataset, output_dir):
  # Split dataset
    dataset = dataset.train_test_split(test_size=0.1)
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1.5,
        weight_decay=0.01,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    trainer.train()




In [6]:
# Load pretrained MarianMT model with custom biomedical encoder
output_dir = "/content/drive/MyDrive/266 Data Project/corpora/nejm/custom_models-mk4/"
named_entities_df = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/zh-entities.parquet")
special_tokens = ["".join(x) for x in named_entities_df["tokens"].tolist()]
hidden_size = 512
tokenizer, model = load_marian_with_biomedical_layer('Helsinki-NLP/opus-mt-zh-en', hidden_size, special_tokens)

# Load dataset
dataset = load_dataset("parquet", data_files="/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_train_entities.parquet")["train"]

# Prepare the dataset
# prepared_dataset = prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english")

# prepared_dataset.to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/mk4_tokenized.parquet")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# custom_model = custom_model.to(device)
# torch.cuda.empty_cache()  # Clear GPU memory before training
# # Fine-tune the model
# fine_tune_model(model, tokenizer, prepared_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Generating train split: 0 examples [00:00, ? examples/s]



Map (num_proc=4):   0%|          | 0/62127 [00:00<?, ? examples/s]

TimeoutError: 

In [25]:
model_save_path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/before-training-mk4"
# Loading custom model
custom_model.save_custom(model_save_path)



In [6]:
model_save_path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/before-training-mk4"
custom_model, tokenizer = CustomMarianMTModel.from_custom(model_save_path)

  biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)


OSError: Can't load tokenizer for '/content/drive/MyDrive/266 Data Project/corpora/nejm/before-training-mk4/tokenizer'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/content/drive/MyDrive/266 Data Project/corpora/nejm/before-training-mk4/tokenizer' is the correct path to a directory containing all relevant files for a MarianTokenizer tokenizer.

In [13]:
# prepared_dataset = prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english")

# prepared_dataset.to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/mk4_tokenized.parquet")

path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/mk4_tokenized.parquet"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = model
custom_model = custom_model.to(device)
torch.cuda.empty_cache()  # Clear GPU memory before training
# Fine-tune the model
# fine_tune_model(model, tokenizer, prepared_dataset, output_dir)


Map:   0%|          | 0/62127 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/63 [00:00<?, ?ba/s]



KeyError: "Column train not in the dataset. Current columns in the dataset: ['input_ids', 'attention_mask', 'labels', 'entity_ids']"

In [22]:
fine_tune_model(model, tokenizer, prepared_dataset, output_dir)

  trainer = Seq2SeqTrainer(


RuntimeError: The size of tensor a (16) must match the size of tensor b (80) at non-singleton dimension 0

In [None]:
# Saving custom model
model_save_path = "/content/drive/MyDrive/266 Data Project/corpora/nejm/after-training-mk4"
custom_model.save_custom(model_save_path)


In [None]:
from evaluate import load
import torch

def translate_tokenized_dataset(model, tokenizer, tokenized_dataset, batch_size=32):
    translations = []

    model.eval()

    for i in range(0, len(tokenized_dataset), batch_size):
        # Extract batch data
        input_ids = tokenized_dataset["input_ids"][i:i + batch_size]
        attention_mask = tokenized_dataset["attention_mask"][i:i + batch_size]
        entity_ids = tokenized_dataset["entity_ids"][i:i + batch_size]

        # Convert to tensors with explicit type and device handling
        input_ids = torch.tensor(input_ids, dtype=torch.long).to(model.device)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(model.device)
        entity_ids = torch.tensor(entity_ids, dtype=torch.long).to(model.device)

        # Debug print statements
        print(f"Batch {i//batch_size + 1}:")
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention Mask shape: {attention_mask.shape}")
        print(f"Entity IDs shape: {entity_ids.shape}")
        print(f"Entity IDs min: {entity_ids.min()}, max: {entity_ids.max()}")
        print(f"Model entity embedding size: {model.entity_embedding.num_embeddings}")

        # Validate entity_ids before generation
        try:
            # Check if all entity IDs are within the valid range
            assert torch.all(entity_ids >= 0), "Negative entity IDs found"
            assert torch.all(entity_ids < model.entity_embedding.num_embeddings), "Out-of-bound entity IDs"
        except AssertionError as e:
            print(f"Entity ID validation error: {e}")
            # Skip this batch or handle the error as needed
            continue

        # Generate translations
        try:
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    entity_ids=entity_ids,
                    max_length=50,  # Increase this to the desired total length
                    num_beams=5,  # Optional: For beam search
                    do_sample=False  # Optional: To control randomness
                )

        except Exception as e:
            print(f"Generation error in batch {i//batch_size + 1}: {e}")
            continue

        # Decode translations
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)

    return translations


# Define evaluation metrics
def evaluate_model_metrics(predictions, references, save_path=None):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)

    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)

    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)

    # Extract summary statistics for BERTScore
    bertscore_summary = {
        "mean": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
        "median": sorted(bertscore_result["f1"])[len(bertscore_result["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))**2 for x in bertscore_result["f1"]) / len(bertscore_result["f1"]))**0.5
    }

    # Consolidate results
    results = {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_summary,
        "TER": ter_result,
    }

    return results

In [None]:
def preprocess_function(examples, tokenizer, src_lang="chinese", max_entities=5):
    src_sentences = examples[src_lang]
    entities_list = examples.get("entities", [[] for _ in src_sentences])

    processed_src = []
    processed_entities = []

    for sentence, entities in zip(src_sentences, entities_list):
        processed_src.append(sentence)

        # Convert entities to token IDs (if in vocab)
        entity_ids = [
            tokenizer.convert_tokens_to_ids(entity)
            if entity in tokenizer.get_vocab() else 0
            for entity in entities
        ]

        # Pad or truncate entity_ids
        entity_ids = entity_ids[:max_entities]
        entity_ids += [0] * (max_entities - len(entity_ids))  # Pad with zeros
        processed_entities.append(entity_ids)

    # Tokenize the processed source sentences
    model_inputs = tokenizer(
        processed_src,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    # Add entity_ids as a tensor to the inputs
    model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)

    return model_inputs

def preprocess_test_data(test_dataset, tokenizer, src_lang="chinese", max_entities=5):
    """
    Preprocess test data to tokenize inputs and add entity_ids for entity-based embeddings.
    """
    # Wrap preprocess_function with fixed arguments
    def wrapped_preprocess_function(examples):
        return preprocess_function(
            examples, tokenizer=tokenizer, src_lang=src_lang, max_entities=max_entities
        )

    # Apply the preprocessing function to the test dataset
    processed_test_dataset = test_dataset.map(
        wrapped_preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
    )

    return processed_test_dataset

def evaluate_model(model, tokenized_test_dataset, tokenizer, batch_size=16, max_length=50, num_beams=5, do_sample=False):
    """
    Evaluate the model on the tokenized test dataset.

    Args:
        model: The model to evaluate.
        tokenized_test_dataset: Tokenized test dataset.
        tokenizer: The tokenizer used for tokenization.
        batch_size: Batch size for evaluation.
        max_length: The maximum total sequence length (input + output).
        num_beams: Number of beams for beam search.
        do_sample: Whether to sample during generation.

    Returns:
        List of predictions.
    """
    # Prepare DataLoader for test data
    test_loader = torch.utils.data.DataLoader(
        tokenized_test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: tokenizer.pad(batch, return_tensors="pt")
    )

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            # Move inputs to GPU if available
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}

            # Generate predictions with specified parameters
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample
            )
            predictions.extend(outputs)

    return predictions

In [None]:
# Load your test dataset
test_dataset = load_dataset("parquet", data_files={"test": "/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_test_entities.parquet"})["test"]

# Preprocess and tokenize test data
tokenized_test_dataset = preprocess_test_data(test_dataset, tokenizer)

# # Evaluate model on test data
# predictions = evaluate_model(custom_model, tokenized_test_dataset, tokenizer)
# predictions[0]

In [None]:
# Evaluate model on test data
predictions = evaluate_model(custom_model, tokenized_test_dataset, tokenizer)
predictions[0]

In [None]:
temp = tokenizer.convert_ids_to_tokens(predictions[0])
temp

In [None]:
pd.DataFrame(data={"predicted_english_tokens": [x.cpu().numpy() for x in predictions]}).to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/mk4_post_training_predictions.parquet")

In [None]:
# Decode predictions to text
decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
print("Predictions:", decoded_predictions[0:5])

In [None]:
pd.DataFrame(data={"predicted_english": decoded_predictions}).to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/mk4_post_training_predictions_detokenized.parquet")

In [None]:
results = evaluate_model_metrics(decoded_predictions, test_dataset["english"])
results