In [None]:
pip install huggingface rouge_score bert_score sacrebleu datasets transformers evaluate

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Colle

In [None]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

data_dir = "/content/drive/My Drive/266 Data Project/corpora"

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import transformers
import gensim
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd
import os
import json
from tqdm.autonotebook import trange, tqdm

In [None]:
# import torch
# import torch.nn.functional as F
# def extract_entities(text, model, tokenizer):
#     # Tokenize the input text
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

#     # Perform inference
#     with torch.no_grad():
#         outputs = model(**inputs).logits

#     # Get the predicted token labels
#     predictions = torch.argmax(outputs, dim=-1)

#     # Convert token IDs back to tokens
#     tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

#     # Get model labels (you can adjust based on the model’s configuration)
#     labels = model.config.id2label

#     # Extract named entities and their scores
#     entities = []
#     for token, prediction, logits in zip(tokens, predictions[0], outputs[0]):
#         if prediction != 0:  # Assuming '0' is the 'O' tag for non-entities
#             # Get the score (probability) for the predicted class
#             class_score = F.softmax(logits, dim=-1)[prediction.item()].item()
#             entities.append((token, labels[prediction.item()], class_score))

#     return entities

In [None]:
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# import pandas as pd
# import tqdm
# import os

# # Load the first model: venkatd/BioMed_NER
# model1_name = "venkatd/BioMed_NER"
# tokenizer1 = AutoTokenizer.from_pretrained(model1_name)
# model1 = AutoModelForTokenClassification.from_pretrained(model1_name)

# # Sample biomedical text in Chinese
# df = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_train.parquet")
# texts = df.english.tolist()
# # Extract entities
# english_entities = []
# for text in tqdm.tqdm(texts):
#     # print(f"Text: {text}")
#     entities = extract_entities(text, model1, tokenizer1)
#     for i in entities:
#         if i[2] >= .20:
#             english_entities.append(entities)

# df_entities_en = pd.DataFrame({"entities": english_entities})
# df_entities_en.to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/en-entities.parquet")

In [None]:
english_entities = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/eng-list.parquet").entity.unique().tolist()

In [None]:
class TranslationDataset:
    """
    Prepare tokenized datasets for training and evaluation without relying on DataLoader.
    """
    @staticmethod
    def prepare_dataset(english_texts, chinese_texts, tokenizer):
        # Tokenize parallel corpus
        tokenized_data = {
            "source": tokenizer(
                english_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ),
            "target": tokenizer(
                chinese_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            )
        }

        # Prepare data dictionary for Hugging Face Dataset
        dataset_dict = {
            "input_ids": tokenized_data["source"]["input_ids"],
            "attention_mask": tokenized_data["source"]["attention_mask"],
            "labels": tokenized_data["target"]["input_ids"]
        }

        # Convert to Hugging Face Dataset
        return Dataset.from_dict({key: value.tolist() for key, value in dataset_dict.items()})


class BiomedicalMarianMTEnhancer(nn.Module):
    """
    Wraps MarianMT with additional medical term embeddings.
    """
    def __init__(self, base_model, tokenizer, biowordvec_path):
        super().__init__()
        self.base_model = base_model
        self.tokenizer = tokenizer

        # Load BioWordVec embeddings
        self.biowordvec = gensim.models.KeyedVectors.load_word2vec_format(
            biowordvec_path,
            binary=True
        )

        # Create a custom embedding layer for medical terms
        embedding_dim = self.biowordvec.vector_size
        vocab_size = base_model.config.vocab_size

        # Create a custom embedding layer
        self.medical_embedding_layer = nn.Embedding(
            vocab_size,
            embedding_dim
        )

        # Initialize medical embedding layer
        self._init_medical_embeddings()

        # Additional projection layer to align embeddings
        self.projection = nn.Linear(
            embedding_dim,
            base_model.config.d_model
        )

    def _init_medical_embeddings(self):
        weight = self.medical_embedding_layer.weight.data

        for token, idx in self.tokenizer.get_vocab().items():
            clean_token = token.replace('▁', '').strip()

            try:
                # Try to get embedding for the token
                vec = self.biowordvec[clean_token]
                weight[idx] = torch.tensor(vec)
            except KeyError:
                # Fallback to default initialization
                nn.init.xavier_uniform_(weight[idx].unsqueeze(0))

    def forward(self, input_ids, labels=None, attention_mask=None):
        # Get base model embeddings
        base_embeddings = self.base_model.model.get_input_embeddings()(input_ids)

        # Get medical term embeddings
        medical_embeddings = self.medical_embedding_layer(input_ids)

        # Project medical embeddings
        projected_medical_embeddings = self.projection(medical_embeddings)

        # Combine base and medical embeddings
        combined_embeddings = base_embeddings + projected_medical_embeddings

        # Continue with standard MarianMT forward pass
        outputs = self.base_model(
            inputs_embeds=combined_embeddings,
            attention_mask=attention_mask,
            labels=labels
        )

        return outputs

    def generate(self, input_ids=None, attention_mask=None, **kwargs):
        """
        Generate translations with custom embeddings and pass them into MarianMT method as input_embeddings
        """
        if input_ids is not None:
            # Compute the base embeddings
            base_embeddings = self.base_model.model.get_input_embeddings()(input_ids)

            # Compute the medical term embeddings
            medical_embeddings = self.medical_embedding_layer(input_ids)

            # Project medical embeddings
            projected_medical_embeddings = self.projection(medical_embeddings)

            # Combine base and medical embeddings
            combined_embeddings = base_embeddings + projected_medical_embeddings

            # Use the combined embeddings for generation
            return self.base_model.generate(
                inputs_embeds=combined_embeddings,
                attention_mask=attention_mask,
                **kwargs
            )
        else:
            raise ValueError("`input_ids` must be provided for generating embeddings.")

    def save_custom(self, save_directory, tokenizer=None):
        """
        Save the model and custom embeddings.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Paths
        model_save_path = os.path.join(save_directory, "model")
        embedding_save_path = os.path.join(model_save_path, "medical_embeddings.pth")
        projection_save_path = os.path.join(model_save_path, "projection_layer.pth")
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        os.makedirs(model_save_path, exist_ok=True)

        # Save the base model
        self.base_model.save_pretrained(model_save_path)

        # Save the medical embedding and projection layer
        torch.save(self.medical_embedding_layer.state_dict(), embedding_save_path)
        torch.save(self.projection.state_dict(), projection_save_path)

        # Save custom configuration
        custom_config = {
            "embedding_dim": self.medical_embedding_layer.embedding_dim,
            "vocab_size": self.medical_embedding_layer.num_embeddings
        }
        with open(custom_config_path, "w") as f:
            json.dump(custom_config, f)

        # Save tokenizer
        if tokenizer is not None:
            tokenizer.save_pretrained(tokenizer_save_path)

    def from_custom(cls, save_directory):
        """
        Load the model and custom embeddings.
        """
        # Paths
        model_save_path = os.path.join(save_directory, "model")
        embedding_save_path = os.path.join(model_save_path, "medical_embeddings.pth")
        projection_save_path = os.path.join(model_save_path, "projection_layer.pth")
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load the base model
        base_model = transformers.MarianMTModel.from_pretrained(model_save_path)

        # Load custom configuration
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Extract custom configuration values
        embedding_dim = custom_config.get("embedding_dim")
        vocab_size = custom_config.get("vocab_size")

        # Create an instance of the enhanced model
        enhancer = cls(
            base_model=base_model,
            tokenizer=None,  # Replace with tokenizer if required
            biowordvec_path=None  # BioWordVec is not reloaded here
        )

        # Resize and initialize the medical embedding layer based on the saved config
        enhancer.medical_embedding_layer = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )

        # Load the medical embedding and projection layer states
        medical_embedding_state = torch.load(embedding_save_path)
        projection_state = torch.load(projection_save_path)
        enhancer.medical_embedding_layer.load_state_dict(medical_embedding_state)
        enhancer.projection.load_state_dict(projection_state)

        # Load tokenizer
        tokenizer = transformers.MarianTokenizer.from_pretrained(tokenizer_save_path)

        return enhancer, tokenizer



def train_biomedical_translation_model(
    base_model,
    tokenizer,
    english_texts,
    chinese_texts,
    biowordvec_path,
    test_size=0.1,
    batch_size=16,
    learning_rate=1e-4,
    num_train_epochs=3,
    output_dir="./results"
):
    # Prepare datasets
    full_dataset = TranslationDataset.prepare_dataset(english_texts, chinese_texts, tokenizer)
    split_dataset = full_dataset.train_test_split(test_size=test_size, seed=42)

    # Wrap the base model with the enhancer
    enhanced_model = BiomedicalMarianMTEnhancer(
        base_model,
        tokenizer,
        biowordvec_path
    )

    # Define Seq2Seq training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_safetensors=False,
        num_train_epochs=num_train_epochs,
        logging_dir="./logs",
        logging_steps=500,
        predict_with_generate=True,  # This is essential for seq2seq tasks like translation
        generation_num_beams=3,  # Beam search during generation
        # load_best_model_at_end=True
    )

    # Initialize Seq2SeqTrainer
    trainer = Seq2SeqTrainer(
        model=enhanced_model,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    return enhanced_model


In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from tokenizers import normalizers
from tokenizers.processors import TemplateProcessing

# Example new vocabulary (replace this with your actual vocabulary)
new_vocab = english_entities

# Example of training data: list of sentences (replace with your dataset)
dataset = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_train.parquet")
english_texts = dataset["english"].tolist()
corpus = english_texts

# Initialize a tokenizer model (e.g., BPE or WordPiece)
tokenizer = transformers.MarianTokenizer.from_pretrained(model_name)

# Set normalizer to handle case sensitivity (you can modify as needed)
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])

# Pre-tokenizer to split text into words
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Add new vocabulary to tokenizer
# Make sure to add <unk> for unknown tokens and any special tokens you want
tokenizer.add_special_tokens(new_vocab)

# Initialize a trainer for the tokenizer
trainer = trainers.BpeTrainer(
    vocab_size=65001+5000,  # Set a desired vocab size
)

# Train the tokenizer using the corpus (you may also use a large dataset here)
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Save the trained tokenizer
tokenizer.save("my_tokenizer.json")

# Load and test the tokenizer
tokenizer = Tokenizer.from_file("my_tokenizer.json")

# Example sentence to tokenize
sentence = "COVID is transforming the field of biomedicine."
encoded = tokenizer.encode(sentence)
print("Encoded sentence:", encoded.tokens)

# Decode it back to verify
decoded = tokenizer.decode(encoded.ids)
print("Decoded sentence:", decoded)


In [None]:
# Main execution
# Load pretrained MarianMT model
model_name = "Helsinki-NLP/opus-mt-en-zh"
base_model = transformers.MarianMTModel.from_pretrained(model_name)

# Add entities as vocab

tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(english_entities))
    })
base_model.resize_token_embeddings(len(tokenizer))


# Load your parallel corpus
dataset = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_train.parquet")
english_texts = dataset["english"].tolist()
chinese_texts = dataset["chinese"].tolist()

# Train the biomedical translation model
enhanced_model = train_biomedical_translation_model(
    base_model,
    tokenizer,
    english_texts,
    chinese_texts,
    biowordvec_path='/content/drive/MyDrive/266 Data Project/corpora/nejm/BioWordVec_PubMed_MIMICIII_d200.vec.bin',
    num_train_epochs=3
)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.8567,0.824842


Epoch,Training Loss,Validation Loss
1,0.8567,0.824842
2,0.4964,0.459455
3,0.4013,0.405166


In [None]:
save_dir = "/content/drive/MyDrive/266 Data Project/corpora/nejm/word-vec-model-ner"
enhanced_model.save_custom(save_dir, tokenizer)



In [None]:
from evaluate import load

def evaluate_model_metrics(predictions, references, save_path=None):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)

    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)

    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)

    # Extract summary statistics for BERTScore
    bertscore_summary = {
        "mean": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
        "median": sorted(bertscore_result["f1"])[len(bertscore_result["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))**2 for x in bertscore_result["f1"]) / len(bertscore_result["f1"]))**0.5
    }

    # Consolidate results
    results = {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_summary,
        "TER": ter_result,
    }

    return results

In [None]:
class BiomedicalTranslationEvaluator:
    """
    Evaluate the performance of a biomedical translation model.
    """
    def __init__(self, model, tokenizer, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to the specified device
        self.model.to(self.device)

    def prepare_dataset(self, english_texts, chinese_texts, max_length=512):
        """
        Prepare a dataset for evaluation.
        """
        # Tokenize source (English) texts
        source_encodings = self.tokenizer(
            english_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Tokenize target (Chinese) texts for comparison (optional)
        target_encodings = self.tokenizer(
            chinese_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move all tensors to the appropriate device
        return {
            "source_input_ids": source_encodings["input_ids"].to(self.device),
            "source_attention_mask": source_encodings["attention_mask"].to(self.device),
            "target_input_ids": target_encodings["input_ids"].to(self.device)
        }

    def generate_translations(self, dataset, batch_size=16):
        translations = []
        for i in trange(0, len(dataset["source_input_ids"]), batch_size):
            batch_input_ids = dataset["source_input_ids"][i:i + batch_size].to(self.device)
            batch_attention_mask = dataset["source_attention_mask"][i:i + batch_size].to(self.device)

            # Generate translations for the batch
            outputs = self.model.generate(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                num_beams=3,
                max_length=128,  # Adjust if needed
            )
            translations.extend(self.tokenizer.batch_decode(outputs, skip_special_tokens=True))

        return translations


    def run_evaluation(self, english_texts, chinese_texts):
        """
        Run the evaluation process.
        """
        # Prepare dataset
        dataset = self.prepare_dataset(english_texts, chinese_texts)

        # Generate translations
        translations = self.generate_translations(dataset)

        # Decode target inputs for human-readable comparison
        # target_texts = self.tokenizer.batch_decode(
        #     dataset["target_input_ids"].to("cpu"), skip_special_tokens=True
        # )

        return {
            "translations": translations,
            "targets": chinese_texts
        }


In [None]:
# Initialize the evaluator
evaluator = BiomedicalTranslationEvaluator(
    enhanced_model,
    tokenizer
)

test_dataset = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_test.parquet")
english_test_texts = test_dataset["english"].tolist()
chinese_test_texts = test_dataset["chinese"].tolist()

torch.cuda.empty_cache()
# Run evaluation
results = evaluator.run_evaluation(
    english_test_texts,  # List of English sentences
    chinese_test_texts   # List of Chinese reference translations
)

# Print results
print("Translations:")
print(results["translations"][0:5])
print("\nTargets:")
print(results["targets"][0:5])


  0%|          | 0/132 [00:00<?, ?it/s]

Translations:
['是 一种    它 在', '和   的', '目前 尚 不 知晓     患者 的  和', '在 这项 为期 年 的 剂量  研究 中 我们 纳入 了 例 慢性  患者 和 例    患者 这些 患者 有 至少  之前 接受 过     治疗 (   ) 的  或 不可 接受 的', '主要 目的 是 确定 最大  剂量 或 推荐 的  剂量 ( 或 两者 )']

Targets:
['asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构抑制 剂 , 它 可 通过 不同于 所有 其他 ABL 激酶 抑制剂 的 机制 将 BCR - ABL1 锁定 在 非 活性 构象 .', 'asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括 看门 基因 ( gatekeeper ) T315I 突变体 .', 'asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性 尚未 明确 .', '在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 加速 期 慢性 髓系 白血病 ( CML ) 患者 , 这些 患者 既往 对 至少 两种 ATP 竞争性 酪氨酸 激酶 抑制剂 ( TKI ) 耐药 或 发生 不可 接受 的 副作用 .', '本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量 ( 或 这 两者 ) .']


In [None]:
evaluate_model_metrics(results["translations"], results["targets"])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': {'bleu': 0.08483225657980646,
  'precisions': [0.6239937748202211,
   0.24910455398260276,
   0.11569124841456786,
   0.0553249010331177],
  'brevity_penalty': 0.4776560140919169,
  'length_ratio': 0.5750879575334856,
  'translation_length': 37268,
  'reference_length': 64804},
 'ROUGE': {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0},
 'BERTScore': {'mean': 0.8898050120396347,
  'median': 0.8951306939125061,
  'std': 0.07368598900462432},
 'TER': {'score': 70.41777084957131,
  'num_edits': 45173,
  'ref_length': 64150.0}}

In [None]:
pd.DataFrame({"predicted_chinese": results["translations"], "chinese": results["targets"]}).to_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/en_to_zh_cleaned_embeddings_output.parquet")