In [1]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset, load_dataset
import torch.nn.functional as F
import os
import json
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

def load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens):
    # Load tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(special_tokens))
    })

    # Load base model
    model = MarianMTModel.from_pretrained(model_name)
    
    # Create biomedical encoder
    special_token_size = len(special_tokens)
    biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
    
    # Create custom model
    custom_model = CustomMarianMTModel(
        model.config,
        biomedical_encoder,
        hidden_size, 
        special_token_size, 
    )

    # Resize token embeddings
    custom_model.resize_token_embeddings(len(tokenizer))
    
    return tokenizer, custom_model

class BiomedicalEncoder(nn.Module):
    def __init__(self, hidden_size, special_token_size):
        super(BiomedicalEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size
        
        # Adjust the linear layer to match input dimensions
        self.linear = nn.Linear(special_token_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, entity_embeddings):
        # Reshape entity embeddings if necessary
        original_shape = entity_embeddings.shape
        
        # Flatten the tensor if it has more than 2 dimensions
        if len(original_shape) > 2:
            entity_embeddings = entity_embeddings.view(-1, original_shape[-1])
        
        # Ensure the input matches the expected dimension
        if entity_embeddings.size(1) != self.special_token_size:
            # If the input doesn't match, pad or truncate
            if entity_embeddings.size(1) < self.special_token_size:
                # Pad with zeros
                padding = torch.zeros(
                    entity_embeddings.size(0), 
                    self.special_token_size - entity_embeddings.size(1),
                    device=entity_embeddings.device
                )
                entity_embeddings = torch.cat([entity_embeddings, padding], dim=1)
            else:
                # Truncate
                entity_embeddings = entity_embeddings[:, :self.special_token_size]
        
        # Apply linear transformation and activation
        encoded = self.linear(entity_embeddings)
        return self.activation(encoded)

class CustomMarianMTModel(MarianMTModel):
    def __init__(self, config, hidden_size=512, special_token_size=206573, biomedicalEncoder=None):
        super().__init__(config)
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Initialize biomedical encoder within the model
        if biomedicalEncoder == None:
            self.biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
        else:
            self.biomedical_encoder = biomedicalEncoder

        # Entity embedding for special tokens
        self.entity_embedding = nn.Embedding(special_token_size + 1, hidden_size)  # +1 for padding token

        # Projection layer to match vocabulary size
        self.entity_projection = nn.Linear(hidden_size, config.vocab_size)

    def save_custom(self, save_directory):
        # Create save directory if it doesn't exist
        os.makedirs(save_directory, exist_ok=True)

        # Save the model and its configuration
        self.save_pretrained(save_directory)

        # Save the biomedical encoder's state_dict
        torch.save(self.biomedical_encoder.state_dict(), os.path.join(save_directory, "biomedical_encoder.pth"))

        # Save custom attributes in a JSON file
        custom_config = {
            "hidden_size": self.hidden_size,
            "special_token_size": self.special_token_size,
        }
        with open(os.path.join(save_directory, "custom_config.json"), "w") as f:
            json.dump(custom_config, f)

    @classmethod
    def from_custom(cls, save_directory):
        # Load base model
        model = MarianMTModel.from_pretrained(save_directory)

        # Load custom attributes from JSON
        custom_config_path = os.path.join(save_directory, "custom_config.json")
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Load biomedical encoder state_dict
        biomedical_encoder_path = os.path.join(save_directory, "biomedical_encoder.pth")
        biomedicalEncoder = BiomedicalEncoder(custom_config["hidden_size"], custom_config["special_token_size"]).load_state_dict(torch.load(biomedical_encoder_path))

        model.hidden_size = custom_config["hidden_size"]
        model.special_token_size = custom_config["special_token_size"]

        new_model = CustomMarianMTModel(
            config = model.config, 
            hidden_size=model.hidden_size,
            special_token_size=model.special_token_size,
            biomedicalEncoder=biomedicalEncoder
            )
        
        # Load the MarianMT model weights into the custom model
        new_model.load_state_dict(model.state_dict(), strict=False)

        tokenizer = MarianTokenizer.from_pretrained(save_directory)

        return new_model, tokenizer

    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
        # Perform base MarianMT forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

        # Process entity information if provided
        if entity_ids is not None:
            try:
                # Ensure entity_ids is a tensor with 2 dimensions
                if len(entity_ids.shape) == 1:
                    entity_ids = entity_ids.unsqueeze(0)

                # Get batch size, sequence length, and vocab size from outputs
                batch_size = outputs.logits.size(0)
                sequence_length = outputs.logits.size(1)
                vocab_size = outputs.logits.size(2)

                # Ensure entity_ids is on the same device as outputs.logits
                entity_ids = entity_ids.to(outputs.logits.device)

                # Limit entity_ids to current batch size
                entity_ids = entity_ids[:batch_size]

                if torch.any(entity_ids >= self.entity_embedding.num_embeddings):
                    print(f"Invalid entity IDs detected: {entity_ids}")
                    raise ValueError("Entity IDs are out of bounds for the embedding layer")


                # Get embeddings for entity special tokens
                entity_embeddings = self.entity_embedding(entity_ids)

                # Ensure embeddings are on the correct device
                entity_embeddings = entity_embeddings.to(outputs.logits.device)

                # Process through biomedical encoder
                original_shape = entity_embeddings.shape
                entity_features = self.biomedical_encoder(entity_embeddings.view(-1, original_shape[-1]))

                # Reshape back to original batch and entity dimension
                entity_features = entity_features.view(original_shape[0], original_shape[1], -1)

                # Project entity features to match logits dimensionality
                entity_logits = self.entity_projection(entity_features)

                # Ensure logits are on the correct device
                entity_logits = entity_logits.to(outputs.logits.device)

                # Create a tensor of zeros with the same shape as outputs.logits
                expanded_entity_logits = torch.zeros_like(outputs.logits)

                # Adjust logits shape to match the entity length
                min_entities = min(entity_logits.size(1), expanded_entity_logits.size(1))
                min_vocab = min(entity_logits.size(2), expanded_entity_logits.size(2))

                expanded_entity_logits[:, :min_entities, :min_vocab] = entity_logits[:, :min_entities, :min_vocab]

                # Add entity-based logits to original logits
                outputs.logits = outputs.logits + expanded_entity_logits

            except Exception as e:
                print(f"Error in forward method: {e}")
                raise
        torch.cuda.synchronize()
        return outputs



def prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english", max_entities=5):
    def preprocess_function(examples):
        # Ensure inputs are lists
        src_sentences = examples[src_lang]
        tgt_sentences = examples[tgt_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])

        processed_src = []
        processed_entities = []
        
        for sentence, entities in zip(src_sentences, entities_list):
            # Ensure sentence is a string and remove existing spaces
            sentence = str(sentence).replace(" ", "")
            
            # Add special tokens for entities
            for entity in entities:
                sentence = sentence.replace(entity, f"<<{entity}>>")
            
            processed_src.append(sentence)
            
            # Convert entities to token IDs
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>") 
                for entity in entities
            ]
            
            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))
            
            # Debugging: Log entity ids and padding
            # print(f"Entity IDs (after padding/truncation): {entity_ids}")

            processed_entities.append(entity_ids)

        # Tokenize source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Tokenize target sentences
        labels = tokenizer(
            tgt_sentences,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Add labels to model inputs
        model_inputs["labels"] = labels["input_ids"]
        
        # Convert entity_ids to tensor
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)
        
        return model_inputs

    # Apply preprocessing to the dataset
    processed_dataset = dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=dataset.column_names
    )

    return processed_dataset

def fine_tune_custom_model(custom_model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        save_safetensors=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
        fp16=False  # Disable mixed precision
    )

    # Create trainer
    trainer = Seq2SeqTrainer(
        model=custom_model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration
model_name = "Helsinki-NLP/opus-mt-zh-en"
output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2"
hidden_size = 512

# Load named entities
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]
special_token_size = len(special_tokens)

# Load tokenizer and custom model
tokenizer, custom_model = load_marian_with_biomedical_layer(
    model_name, 
    hidden_size, 
    special_tokens
)


TypeError: empty(): argument 'size' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got BiomedicalEncoder"

In [1]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset, load_dataset
import torch.nn.functional as F
import os
import json
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

def load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens):
    # Load tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(special_tokens))
    })

    # Load base model
    model = MarianMTModel.from_pretrained(model_name)
    
    # Create custom model, CustomMarianMTModel will create a BiomedicalEncoder object in init()
    custom_model = CustomMarianMTModel(
        config=model.config,
        hidden_size=hidden_size,
        special_token_size=len(special_tokens), 
    )


    # Resize token embeddings
    custom_model.resize_token_embeddings(len(tokenizer))
    
    return tokenizer, custom_model

class BiomedicalEncoder(nn.Module):
    def __init__(self, hidden_size, special_token_size):
        super(BiomedicalEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size
        
        # Adjust the linear layer to match input dimensions
        self.linear = nn.Linear(special_token_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, entity_embeddings):
        # Reshape entity embeddings if necessary
        original_shape = entity_embeddings.shape
        
        # Flatten the tensor if it has more than 2 dimensions
        if len(original_shape) > 2:
            entity_embeddings = entity_embeddings.view(-1, original_shape[-1])
        
        # Ensure the input matches the expected dimension
        if entity_embeddings.size(1) != self.special_token_size:
            # If the input doesn't match, pad or truncate
            if entity_embeddings.size(1) < self.special_token_size:
                # Pad with zeros
                padding = torch.zeros(
                    entity_embeddings.size(0), 
                    self.special_token_size - entity_embeddings.size(1),
                    device=entity_embeddings.device
                )
                entity_embeddings = torch.cat([entity_embeddings, padding], dim=1)
            else:
                # Truncate
                entity_embeddings = entity_embeddings[:, :self.special_token_size]
        
        # Apply linear transformation and activation
        encoded = self.linear(entity_embeddings)
        return self.activation(encoded)

class CustomMarianMTModel(MarianMTModel):
    def __init__(self, config, hidden_size=512, special_token_size=206573, biomedicalEncoder=None):
        super().__init__(config)
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Initialize biomedical encoder within the model
        if biomedicalEncoder == None:
            self.biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
        else:
            self.biomedical_encoder = biomedicalEncoder

        # Entity embedding for special tokens
        self.entity_embedding = nn.Embedding(special_token_size + 1, hidden_size)  # +1 for padding token

        # Projection layer to match vocabulary size
        self.entity_projection = nn.Linear(hidden_size, config.vocab_size)

    def save_custom(self, save_directory):
        # Create save directory if it doesn't exist
        os.makedirs(save_directory, exist_ok=True)

        model_save_path = os.path.join(save_directory, "model")
        print(model_save_path)
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        os.makedirs(model_save_path, exist_ok=True)
        os.makedirs(tokenizer_save_path, exist_ok=True)

        # Save the model and its configuration
        self.save_pretrained(model_save_path)

        # Save the biomedical encoder's state_dict
        torch.save(self.biomedical_encoder.state_dict(), os.path.join(model_save_path, "biomedical_encoder.pth"))

        # Save custom attributes in a JSON file
        custom_config = {
            "hidden_size": self.hidden_size,
            "special_token_size": self.special_token_size,
        }
        with open(os.path.join(model_save_path, "custom_config.json"), "w") as f:
            json.dump(custom_config, f)

        if tokenizer is not None:
            tokenizer.save_pretrained(tokenizer_save_path)

    @classmethod
    def from_custom(cls, save_directory):
        model_save_path = os.path.join(save_directory, "model")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load custom attributes from JSON
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Load base model configuration
        model = MarianMTModel.from_pretrained(model_save_path)

        # Create a new CustomMarianMTModel with the loaded configuration
        new_model = cls(
            config=model.config, 
            hidden_size=custom_config["hidden_size"],
            special_token_size=custom_config["special_token_size"]
        )

        # Load the biomedical encoder state dict
        biomedical_encoder_path = os.path.join(model_save_path, "biomedical_encoder.pth")
        biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)
        new_model.biomedical_encoder.load_state_dict(biomedical_encoder_state_dict)

        # Load the main model weights
        state_dict = model.state_dict()
        new_model_state_dict = new_model.state_dict()
        
        # Update the state dictionary, keeping the biomedical encoder weights
        for key, value in state_dict.items():
            if key in new_model_state_dict:
                new_model_state_dict[key] = value
        
        new_model.load_state_dict(new_model_state_dict, strict=False)

        # Load tokenizer
        tokenizer = MarianTokenizer.from_pretrained(tokenizer_save_path)

        return new_model, tokenizer

    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
        # Perform base MarianMT forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

        # Process entity information if provided
        if entity_ids is not None:
            try:
                # Ensure entity_ids is a tensor with 2 dimensions
                if len(entity_ids.shape) == 1:
                    entity_ids = entity_ids.unsqueeze(0)

                # Get batch size, sequence length, and vocab size from outputs
                batch_size = outputs.logits.size(0)
                sequence_length = outputs.logits.size(1)
                vocab_size = outputs.logits.size(2)

                # Ensure entity_ids is on the same device as outputs.logits
                entity_ids = entity_ids.to(outputs.logits.device)

                # Limit entity_ids to current batch size
                entity_ids = entity_ids[:batch_size]

                if torch.any(entity_ids >= self.entity_embedding.num_embeddings):
                    print(f"Invalid entity IDs detected: {entity_ids}")
                    raise ValueError("Entity IDs are out of bounds for the embedding layer")


                # Get embeddings for entity special tokens
                entity_embeddings = self.entity_embedding(entity_ids)

                # Ensure embeddings are on the correct device
                entity_embeddings = entity_embeddings.to(outputs.logits.device)

                # Process through biomedical encoder
                original_shape = entity_embeddings.shape
                entity_features = self.biomedical_encoder(entity_embeddings.view(-1, original_shape[-1]))

                # Reshape back to original batch and entity dimension
                entity_features = entity_features.view(original_shape[0], original_shape[1], -1)

                # Project entity features to match logits dimensionality
                entity_logits = self.entity_projection(entity_features)

                # Ensure logits are on the correct device
                entity_logits = entity_logits.to(outputs.logits.device)

                # Create a tensor of zeros with the same shape as outputs.logits
                expanded_entity_logits = torch.zeros_like(outputs.logits)

                # Adjust logits shape to match the entity length
                min_entities = min(entity_logits.size(1), expanded_entity_logits.size(1))
                min_vocab = min(entity_logits.size(2), expanded_entity_logits.size(2))

                expanded_entity_logits[:, :min_entities, :min_vocab] = entity_logits[:, :min_entities, :min_vocab]

                # Add entity-based logits to original logits
                outputs.logits = outputs.logits + expanded_entity_logits

            except Exception as e:
                print(f"Error in forward method: {e}")
                raise
        torch.cuda.synchronize()
        return outputs



def prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english", max_entities=5):
    def preprocess_function(examples):
        # Ensure inputs are lists
        src_sentences = examples[src_lang]
        tgt_sentences = examples[tgt_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])

        processed_src = []
        processed_entities = []
        
        for sentence, entities in zip(src_sentences, entities_list):
            # Ensure sentence is a string and remove existing spaces
            sentence = str(sentence).replace(" ", "")
            
            # Add special tokens for entities
            for entity in entities:
                sentence = sentence.replace(entity, f"<<{entity}>>")
            
            processed_src.append(sentence)
            
            # Convert entities to token IDs
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>") 
                for entity in entities
            ]
            
            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))
            
            # Debugging: Log entity ids and padding
            # print(f"Entity IDs (after padding/truncation): {entity_ids}")

            processed_entities.append(entity_ids)

        # Tokenize source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Tokenize target sentences
        labels = tokenizer(
            tgt_sentences,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Add labels to model inputs
        model_inputs["labels"] = labels["input_ids"]
        
        # Convert entity_ids to tensor
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)
        
        return model_inputs

    # Apply preprocessing to the dataset
    processed_dataset = dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=dataset.column_names
    )

    return processed_dataset

def fine_tune_custom_model(custom_model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        save_safetensors=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
        fp16=False  # Disable mixed precision
    )

    # Create trainer
    trainer = Seq2SeqTrainer(
        model=custom_model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\before-training"
# Loading custom model
custom_model_base, tokenizer_base = CustomMarianMTModel.from_custom(model_save_path)

  biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)


In [None]:
from evaluate import load
import torch

def add_special_tokens(tokenizer, entities):
    """
    Adds new entity tokens to the tokenizer if they are not already present.
    
    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to update.
        entities (list of str): List of entity names to add as special tokens.
        
    Returns:
        None
    """
    special_tokens = [f"<<{entity}>>" for entity in entities]
    added_tokens = [token for token in special_tokens if token not in tokenizer.get_vocab()]
    if added_tokens:
        tokenizer.add_special_tokens({'additional_special_tokens': added_tokens})
        print(f"Added new special tokens: {added_tokens}")


def translate_tokenized_dataset(model, tokenizer, tokenized_dataset, batch_size=32):
    translations = []
    
    model.eval()

    for i in range(0, len(tokenized_dataset), batch_size):
        # Extract batch data
        input_ids = tokenized_dataset["input_ids"][i:i + batch_size]
        attention_mask = tokenized_dataset["attention_mask"][i:i + batch_size]
        entity_ids = tokenized_dataset["entity_ids"][i:i + batch_size]

        # Convert to tensors with explicit type and device handling
        input_ids = torch.tensor(input_ids, dtype=torch.long).to(model.device)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(model.device)
        entity_ids = torch.tensor(entity_ids, dtype=torch.long).to(model.device)

        # Debug print statements
        print(f"Batch {i//batch_size + 1}:")
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention Mask shape: {attention_mask.shape}")
        print(f"Entity IDs shape: {entity_ids.shape}")
        print(f"Entity IDs min: {entity_ids.min()}, max: {entity_ids.max()}")
        print(f"Model entity embedding size: {model.entity_embedding.num_embeddings}")

        # Validate entity_ids before generation
        try:
            # Check if all entity IDs are within the valid range
            assert torch.all(entity_ids >= 0), "Negative entity IDs found"
            assert torch.all(entity_ids < model.entity_embedding.num_embeddings), "Out-of-bound entity IDs"
        except AssertionError as e:
            print(f"Entity ID validation error: {e}")
            # Skip this batch or handle the error as needed
            continue

        # Generate translations
        try:
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    entity_ids=entity_ids
                )
        except Exception as e:
            print(f"Generation error in batch {i//batch_size + 1}: {e}")
            continue

        # Decode translations
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)

    return translations


# Define evaluation metrics
def evaluate_model(predictions, references):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    # The expected format for BLEU, ROUGE, and BERTScore is a list of lists of strings
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)
    
    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)
    
    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")
    
    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)
    
    return {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_result,
        "TER": ter_result,
    }


In [None]:
def preprocess_test_data(test_dataset, tokenizer, src_lang="chinese", max_entities=5):
    """
    Preprocess test data to tokenize inputs and add entity_ids for entity-based embeddings.
    """
    def preprocess_function(examples):
        src_sentences = examples[src_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])
        
        processed_src = []
        processed_entities = []
        
        for sentence, entities in zip(src_sentences, entities_list):
            # Process source sentence (add markers for entities in vocabulary)
            for entity in entities:
                if f"<<{entity}>>" in tokenizer.get_vocab():
                    sentence = sentence.replace(entity, f"<<{entity}>>")
            processed_src.append(sentence)
            
            # Convert entities to token IDs (if in vocab)
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>") 
                if f"<<{entity}>>" in tokenizer.get_vocab() else 0
                for entity in entities
            ]
            
            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))  # Pad with zeros
            processed_entities.append(entity_ids)
        
        # Tokenize the processed source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        
        # Add entity_ids as a tensor to the inputs
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)
        
        return model_inputs
    
    # Apply the preprocessing function to the test dataset
    processed_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
    )
    
    return processed_test_dataset


def evaluate_model(model, tokenized_test_dataset, tokenizer, batch_size=16):
    """
    Evaluate the model on the tokenized test dataset.
    """
    # Prepare DataLoader for test data
    test_loader = torch.utils.data.DataLoader(
        tokenized_test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: tokenizer.pad(batch, return_tensors="pt")
    )
    
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            # Move inputs to GPU if available
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}
            
            # Generate predictions
            outputs = model.generate(**inputs)
            predictions.extend(outputs)
    
    return predictions


In [3]:
from datasets import load_dataset 
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_test_entities.parquet"})["test"]

In [None]:
import pandas as pd
data = pd.read_parquet("nejm/nejm_test_entities.parquet")

: 

In [4]:
test_dataset[0]

{'chinese': 'asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构抑制 剂 , 它 可 通过 不同于 所有 其他 ABL 激酶 抑制剂 的 机制 将 BCR - ABL1 锁定 在 非 活性 构象 .',
 'english': 'asciminib is an allosteric inhibitor that binds a myristoyl site of the BCR @-@ ABL1 protein , locking BCR @-@ ABL1 into an inactive conformation through a mechanism distinct from those for all other ABL kinase inhibitors .',
 'entities': ['as##ci##mini##b',
  'bc##r-ab##l##1蛋白',
  '豆蔻酰位点',
  '抑制剂',
  'ab##l激酶抑制剂',
  'bc##r-ab##l##1']}

In [None]:
# Load your test dataset
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_test_entities.parquet"})["test"]

# Preprocess and tokenize test data
tokenized_test_dataset = preprocess_test_data(test_dataset, tokenizer)

# Evaluate model on test data
predictions = evaluate_model(custom_model, tokenized_test_dataset, tokenizer)
predictions[0]

In [None]:
# # Saving custom model
# model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\before-training"
# custom_model.save_custom(model_save_path)

# # Loading custom model
# custom_model_2, tokenizer_2 = CustomMarianMTModel.from_custom(model_save_path, config=custom_model.config, biomedical_encoder=custom_model.biomedical_encoder, hidden_size=512, special_token_size=special_token_size)


RuntimeError: Error(s) in loading state_dict for CustomMarianMTModel:
	size mismatch for entity_projection.weight: copying a param with shape torch.Size([65001, 512]) from checkpoint, the shape in current model is torch.Size([118379, 512]).
	size mismatch for entity_projection.bias: copying a param with shape torch.Size([65001]) from checkpoint, the shape in current model is torch.Size([118379]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
# Saving custom model
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2\\checkpoint-10485"
custom_model.save_custom(model_save_path)

# Loading custom model
custom_model_2 = CustomMarianMTModel.from_custom(model_save_path)


  biomedicalEncoder = BiomedicalEncoder(custom_config["hidden_size"], custom_config["special_token_size"]).load_state_dict(torch.load(biomedical_encoder_path))


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2\\checkpoint-10485\\biomedical_encoder.pth'

In [18]:
# Saving custom model
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\custom_data_saver-2"
custom_model.save_custom(model_save_path)

# Loading custom model
custom_model_2 = CustomMarianMTModel.from_custom(model_save_path, config=custom_model.config, biomedical_encoder=custom_model.biomedical_encoder, hidden_size=512, special_token_size=special_token_size)


RuntimeError: Error(s) in loading state_dict for CustomMarianMTModel:
	size mismatch for entity_projection.weight: copying a param with shape torch.Size([65001, 512]) from checkpoint, the shape in current model is torch.Size([118379, 512]).
	size mismatch for entity_projection.bias: copying a param with shape torch.Size([65001]) from checkpoint, the shape in current model is torch.Size([118379]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
from transformers import AutoModelForTokenClassification, BertTokenizerFast, pipeline

tokenizer = BertTokenizerFast.from_pretrained('iioSnail/bert-base-chinese-medical-ner')
model = AutoModelForTokenClassification.from_pretrained("iioSnail/bert-base-chinese-medical-ner")

sentences = ["瘦脸针、水光针和玻尿酸详解！", "半月板钙化的病因有哪些？"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True, add_special_tokens=False)
outputs = model(**inputs)
outputs = outputs.logits.argmax(-1) * inputs['attention_mask']

print(outputs)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tensor([[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 4, 4],
        [1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 4, 0, 0]])


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# or import your specific custom model if it's not a standard Hugging Face model

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\pre-training-tokenizer")

# Load the custom model
custom_model = AutoModelForSequenceClassification.from_pretrained(
    "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\pre-training-model"
)

# If you're using your own custom class, use:
# custom_model = YourCustomModel.from_pretrained(...)

# Optionally adjust model's embedding size if needed
custom_model.resize_token_embeddings(len(tokenizer))

# Now the model and tokenizer are ready to use
custom_model.eval()  # Set the model to evaluation mode


In [7]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [15]:
# Apply NER
entities = nlp(sentences[0])

# Extract and display biomedical-related entities
print(entities)

[{'entity': 'M', 'score': np.float32(0.9654682), 'index': 1, 'word': '瘦', 'start': 0, 'end': 1}, {'entity': 'M', 'score': np.float32(0.99966824), 'index': 2, 'word': '脸', 'start': 1, 'end': 2}, {'entity': 'M', 'score': np.float32(0.9951781), 'index': 3, 'word': '针', 'start': 2, 'end': 3}, {'entity': 'M', 'score': np.float32(0.9897831), 'index': 5, 'word': '水', 'start': 4, 'end': 5}, {'entity': 'M', 'score': np.float32(0.99957424), 'index': 6, 'word': '光', 'start': 5, 'end': 6}, {'entity': 'M', 'score': np.float32(0.9946596), 'index': 7, 'word': '针', 'start': 6, 'end': 7}, {'entity': 'M', 'score': np.float32(0.98533744), 'index': 9, 'word': '玻', 'start': 8, 'end': 9}, {'entity': 'M', 'score': np.float32(0.99989486), 'index': 10, 'word': '尿', 'start': 9, 'end': 10}, {'entity': 'M', 'score': np.float32(0.9892781), 'index': 11, 'word': '酸', 'start': 10, 'end': 11}]


In [1]:
# https://huggingface.co/lixin12345/chinese-medical-ner
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

class NER:
    """
    实体命名实体识别
    """
    def __init__(self,model_path) -> None:
        """
        Args:
            model_path:模型地址
        """

        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)

    def ner(self,sentence:str) -> list:
        """
        命名实体识别
        Args:
            sentence:要识别的句子
        Return:
            实体列表:[{'type':'LOC','tokens':[...]},...]
        """
        ans = []
        for i in range(0,len(sentence),500):
            ans = ans + self._ner(sentence[i:i+500])
        return ans
    
    def _ner(self,sentence:str) -> list:
        if len(sentence) == 0: return []
        inputs = self.tokenizer(
            sentence, add_special_tokens=True, return_tensors="pt"
        )
        
        if torch.cuda.is_available():
            self.model = self.model.to(torch.device('cuda:0'))
            for key in inputs:
                inputs[key] = inputs[key].to(torch.device('cuda:0'))
            
        with torch.no_grad():
            logits = self.model(**inputs).logits
        predicted_token_class_ids = logits.argmax(-1)
        predicted_tokens_classes = [self.model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        entities = []
        entity = {}
        for idx, token in enumerate(self.tokenizer.tokenize(sentence,add_special_tokens=True)):
            if 'B-' in predicted_tokens_classes[idx] or 'S-' in predicted_tokens_classes[idx]:
                if len(entity) != 0:
                    entities.append(entity)
                entity = {}
                entity['type'] = predicted_tokens_classes[idx].replace('B-','').replace('S-','')
                entity['tokens'] = [token]
            elif 'I-' in predicted_tokens_classes[idx] or 'E-' in predicted_tokens_classes[idx] or 'M-' in predicted_tokens_classes[idx]:
                if len(entity) == 0:
                    entity['type'] = predicted_tokens_classes[idx].replace('I-','').replace('E-','').replace('M-','')
                    entity['tokens'] = []
                entity['tokens'].append(token)
            else:
                if len(entity) != 0:
                    entities.append(entity)
                    entity = {}
        if len(entity) > 0:
            entities.append(entity)
        return entities

ner_model = NER('lixin12345/chinese-medical-ner')
text = """
患者既往慢阻肺多年;冠心病史6年，平素规律服用心可舒、保心丸等控制可;双下肢静脉血栓3年，保守治疗效果可;左侧腹股沟斜疝无张力修补术后2年。否认"高血压、糖尿病"等慢性病病史，否认"肝炎、结核"等传染病病史及其密切接触史，否认其他手术、重大外伤、输血史，否认"食物、药物、其他"等过敏史，预防接种史随社会。
"""
ans = ner_model.ner(text)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import pandas as pd
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\en-entities.parquet")
df

Unnamed: 0,entity_group,score,word,start,end
0,Diagnostic_procedure,0.665001,body mass,73,83
1,Sign_symptom,0.841736,loss of muscle mass,15,35
2,Lab_value,0.456666,58,38,41
3,Diagnostic_procedure,0.208239,blood pressure,12,27
4,Clinical_event,0.731938,discharge,39,49
...,...,...,...,...,...
237137,Lab_value,0.765173,6 %,185,189
237138,Detailed_description,0.278184,AC,25,28
237139,Detailed_description,0.173570,CORD,28,32
237140,Detailed_description,0.522833,multiple combinations of,108,133


In [35]:
grouped = df[(df.word != '') & (df.word != '-') & df.entity_group != "Lab_value"][["word", "score"]].groupby(by=['word']).max().reset_index()

In [36]:
grouped

Unnamed: 0,word,score
0,,0.991196
1,$,0.604560
2,$ 7000,0.216629
3,%,0.978267
4,% -40 %,0.929531
...,...,...
45064,≥ 85th percentile,0.848104
45065,≥ 93 %,0.887354
45066,≥ 94 %,0.653164
45067,≥ 95th percentile,0.942542


In [30]:
grouped.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\en-entities-filtered.parquet")

In [23]:
grouped['word'].iloc[0]

''

In [14]:
import pandas as pd
data = pd.read_parquet("/Users/linaa/Github/2024-fall-assignment-linaaron88/project/biomedical_translation/dataset/processed/en-entities.parquet")
data.groupby(["entity_group", "word"]).count().reset_index().groupby('entity_group').count().sort_values("word", ascending=False)[['word']]

Unnamed: 0_level_0,word
entity_group,Unnamed: 1_level_1
Detailed_description,14676
Diagnostic_procedure,9421
Lab_value,5498
Sign_symptom,4932
Medication,4268
Date,3496
Biological_structure,3050
Therapeutic_procedure,2497
Disease_disorder,2058
History,1895


In [9]:
len(df.word.unique())

45069

In [5]:
df.score.describe()

count    237142.000000
mean          0.605836
std           0.268186
min           0.033175
25%           0.385944
50%           0.623573
75%           0.855455
max           0.995914
Name: score, dtype: float64

In [6]:
df[df.score < 0.385944]

Unnamed: 0,entity_group,score,word,start,end
3,Diagnostic_procedure,0.208239,blood pressure,12,27
5,Biological_structure,0.126530,older,52,58
6,Age,0.096632,adults,58,65
12,Sign_symptom,0.370513,BP,25,28
15,Coreference,0.101418,groups,92,99
...,...,...,...,...,...
237113,Medication,0.320418,insulin,168,176
237114,Medication,0.377873,-,149,150
237126,Diagnostic_procedure,0.314685,infarction,286,297
237138,Detailed_description,0.278184,AC,25,28


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
texts = df.chinese.tolist()
# Extract entities
chinese_entities = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = ner_model.ner(text)
    for entity in entities:
        # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
        chinese_entities.append(entity)
df_entities = pd.DataFrame(chinese_entities)
df_entities.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")

100%|██████████| 62127/62127 [12:27<00:00, 83.14it/s]


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet")
texts = df.chinese.tolist()
# Extract entities
chinese_entities = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = ner_model.ner(text)
    for entity in entities:
        # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
        chinese_entities.append(entity)
df_entities = pd.DataFrame(chinese_entities)
df_entities.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-test-entities.parquet")

100%|██████████| 2102/2102 [00:26<00:00, 78.68it/s]


In [30]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
texts = df.chinese.tolist()
# Extract entities
chinese_entities_per_sentence = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = ner_model.ner(text)
    chinese_entities_per_sentence.append(entities)

df_entities_sentence_zh = pd.DataFrame(chinese_entities_per_sentence)
df_entities_sentence_zh.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities-per-sentence.parquet")

100%|██████████| 62127/62127 [12:22<00:00, 83.64it/s]  


In [2]:
import pandas as pd
pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")

Unnamed: 0,type,tokens
0,TreatmentOrPreventionProcedures,"[激, 素, 疗, 法]"
1,TreatmentOrPreventionProcedures,"[去, 脂, 体, 重]"
2,DiseaseNameOrComprehensiveCertificate,"[衰, 老]"
3,Symptom,"[肌, 肉, 萎, 缩]"
4,Symptom,"[活, 动, 能, 力, 下, 降]"
...,...,...
206568,Symptom,"[死, 亡]"
206569,MedicalTestingItems,"[糖, 化, 血, 红, 蛋, 白]"
206570,InspectionProcedure,"[ac, ##cord]"
206571,TreatmentOrPreventionProcedures,"[降, 糖, 方, 法]"


In [26]:
import pandas as pd
df_entities_sentence_zh = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities-per-sentence.parquet")

In [3]:
df_entities_sentence_zh

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,"{'tokens': ['激', '素', '疗', '法'], 'type': 'Trea...","{'tokens': ['去', '脂', '体', '重'], 'type': 'Trea...",,,,,,,,,...,,,,,,,,,,
1,"{'tokens': ['衰', '老'], 'type': 'DiseaseNameOrC...","{'tokens': ['肌', '肉', '萎', '缩'], 'type': 'Symp...","{'tokens': ['活', '动', '能', '力', '下', '降'], 'ty...",,,,,,,,...,,,,,,,,,,
2,"{'tokens': ['去', '脂', '体', '重'], 'type': 'Medi...","{'tokens': ['绝', '经', '期', '激', '素', '疗', '法']...",,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,"{'tokens': ['院'], 'type': 'MedicalProcedures'}","{'tokens': ['血', '压'], 'type': 'MedicalTesting...","{'tokens': ['再', '入', '院'], 'type': 'MedicalPr...","{'tokens': ['严', '重', '不', '良', '事', '件'], 'ty...",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62122,"{'tokens': ['糖', '尿', '病'], 'type': 'DiseaseNa...","{'tokens': ['强', '化', '降', '糖', '治', '疗'], 'ty...",,,,,,,,,...,,,,,,,,,,
62123,"{'tokens': ['2', '型', '糖', '尿', '病'], 'type': ...","{'tokens': ['正', '常', '血', '糖', '水'], 'type': ...","{'tokens': ['糖', '化', '血', '红', '蛋', '白', '水',...","{'tokens': ['数'], 'type': 'MedicalTestingItems'}","{'tokens': ['心', '血', '管'], 'type': 'BodyParts'}","{'tokens': ['心', '梗'], 'type': 'DiseaseNameOrC...","{'tokens': ['死', '亡', '率'], 'type': 'Symptom'}",,,,...,,,,,,,,,,
62124,"{'tokens': ['心', '血', '管', '疾', '病'], 'type': ...","{'tokens': ['糖', '尿', '病'], 'type': 'DiseaseNa...","{'tokens': ['血', '压'], 'type': 'MedicalTesting...","{'tokens': ['血', '脂'], 'type': 'MedicalTesting...","{'tokens': ['降', '糖', '药', '物'], 'type': 'Drug'}","{'tokens': ['糖', '化', '血', '红', '蛋', '白'], 'ty...","{'tokens': ['糖', '化', '血', '红', '蛋', '白'], 'ty...","{'tokens': ['死', '亡'], 'type': 'Symptom'}",,,...,,,,,,,,,,
62125,"{'tokens': ['强', '化', '治', '疗'], 'type': 'Trea...","{'tokens': ['死', '亡'], 'type': 'Symptom'}","{'tokens': ['心', '血', '管'], 'type': 'BodyParts'}","{'tokens': ['死', '亡'], 'type': 'Symptom'}","{'tokens': ['糖', '化', '血', '红', '蛋', '白'], 'ty...",,,,,,...,,,,,,,,,,


In [32]:
df_entities_sentence_zh.to_dict(orient="records")

[{0: {'type': 'TreatmentOrPreventionProcedures',
   'tokens': ['激', '素', '疗', '法']},
  1: {'type': 'TreatmentOrPreventionProcedures',
   'tokens': ['去', '脂', '体', '重']},
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None,
  9: None,
  10: None,
  11: None,
  12: None,
  13: None,
  14: None,
  15: None,
  16: None,
  17: None,
  18: None,
  19: None,
  20: None,
  21: None,
  22: None,
  23: None,
  24: None,
  25: None,
  26: None,
  27: None,
  28: None,
  29: None,
  30: None,
  31: None,
  32: None,
  33: None,
  34: None,
  35: None},
 {0: {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['衰', '老']},
  1: {'type': 'Symptom', 'tokens': ['肌', '肉', '萎', '缩']},
  2: {'type': 'Symptom', 'tokens': ['活', '动', '能', '力', '下', '降']},
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None,
  9: None,
  10: None,
  11: None,
  12: None,
  13: None,
  14: None,
  15: None,
  16: None,
  17: None,
  18: None,
  19: None,
  20: None,
  21: None,
  22

In [27]:
import tqdm
def get_entities(df):
    row_list = []
    for row in tqdm.tqdm(df.to_dict(orient="records")):
        entity_list = []
        for k, v in row.items():
            if v != None:
                if v.get("tokens") is not None:
                    entity = "".join(v["tokens"])
                    entity_list.append(entity)
        row_list.append([*entity_list])
    new_df = pd.DataFrame()
    new_df["entities"] = row_list
    return new_df


In [None]:
train_entities_by_row = get_entities(df_entities_sentence_zh)

100%|██████████| 62127/62127 [00:00<00:00, 114599.74it/s]


In [30]:
df_train = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
df_train.head()

Unnamed: 0,chinese,english
0,也许 不能 : 分析 结果 提示 激素 疗法 在 维持 去 脂 体重 方面 作用 很小 .,probably not : analysis suggests minimal effec...
1,与 衰老 相关 的 肌肉 萎缩 是 活动 能力 下降 的 重要 预测 因素 .,age @-@ related loss of muscle mass ( sarcopen...
2,研究者 确定 了 12 项 评估 去 脂 体重 变化 情况 的 绝经期 激素 疗法 随机 试验 .,investigators identified 12 randomized trials ...
3,"4 , 474 名 参与者 的 平均年龄 为 58 岁 , 平均 随访 时间 为 2 年 .","among 4474 participants , mean age was 58 and ..."
4,强化 老年人 出院 时 的 血压 控制 方案 与 再 入院 和 严重 不良 事件 增加 相关 .,Intensifying blood pressure regimens at discha...


In [31]:
df_train["entities"] = train_entities_by_row["entities"]
df_train

Unnamed: 0,chinese,english,entities
0,也许 不能 : 分析 结果 提示 激素 疗法 在 维持 去 脂 体重 方面 作用 很小 .,probably not : analysis suggests minimal effec...,"[激素疗法, 去脂体重]"
1,与 衰老 相关 的 肌肉 萎缩 是 活动 能力 下降 的 重要 预测 因素 .,age @-@ related loss of muscle mass ( sarcopen...,"[衰老, 肌肉萎缩, 活动能力下降]"
2,研究者 确定 了 12 项 评估 去 脂 体重 变化 情况 的 绝经期 激素 疗法 随机 试验 .,investigators identified 12 randomized trials ...,"[去脂体重, 绝经期激素疗法]"
3,"4 , 474 名 参与者 的 平均年龄 为 58 岁 , 平均 随访 时间 为 2 年 .","among 4474 participants , mean age was 58 and ...",[]
4,强化 老年人 出院 时 的 血压 控制 方案 与 再 入院 和 严重 不良 事件 增加 相关 .,Intensifying blood pressure regimens at discha...,"[院, 血压, 再入院, 严重不良事件]"
...,...,...,...
62122,"最后 , 新 诊断 的 糖尿病 患者 对 强化 降糖 治疗 的 反应 不同 .","finally , people with newly diagnosed diabetes...","[糖尿病, 强化降糖治疗]"
62123,"一项 涉及 新 诊断 的 2 型 糖尿病 人群 的 大型 研究 中 , 目标 为 正常 血糖...",a large trial involving people with newly diag...,"[2型糖尿病, 正常血糖水, 糖化血红蛋白水平中, 数, 心血管, 心梗, 死亡率]"
62124,"总之 , ACCORD 研究 结果显示 对于 心血管 疾病 高 危且 控制 不 理想 , 长...","in summary , the results of the ACCORD trial s...","[心血管疾病, 糖尿病, 血压, 血脂, 降糖药物, 糖化血红蛋白, 糖化血红蛋白, 死亡]"
62125,"在 强化 治疗 组中 , 任何 原因 导致 的 死亡 和 心血管 导致 的 死亡 的 风险 ...",the higher risk of death from any cause and fr...,"[强化治疗, 死亡, 心血管, 死亡, 糖化血红蛋白]"


In [32]:
df_train.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train_entities.parquet")

In [29]:
train_entities_by_row

Unnamed: 0,entities
0,"[激素疗法, 去脂体重]"
1,"[衰老, 肌肉萎缩, 活动能力下降]"
2,"[去脂体重, 绝经期激素疗法]"
3,[]
4,"[院, 血压, 再入院, 严重不良事件]"
...,...
62122,"[糖尿病, 强化降糖治疗]"
62123,"[2型糖尿病, 正常血糖水, 糖化血红蛋白水平中, 数, 心血管, 心梗, 死亡率]"
62124,"[心血管疾病, 糖尿病, 血压, 血脂, 降糖药物, 糖化血红蛋白, 糖化血红蛋白, 死亡]"
62125,"[强化治疗, 死亡, 心血管, 死亡, 糖化血红蛋白]"


In [35]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet")
texts = df.chinese.tolist()
# Extract entities
chinese_entities_per_sentence = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = ner_model.ner(text)
    chinese_entities_per_sentence.append(entities)

df_entities_sentence_zh_test = pd.DataFrame(chinese_entities_per_sentence)
df_entities_sentence_zh_test.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\test-zh-entities-per-sentence.parquet")

100%|██████████| 2102/2102 [00:34<00:00, 61.78it/s]


In [36]:
df_entities_sentence_zh_test = pd.DataFrame(chinese_entities_per_sentence)
df_entities_sentence_zh_test.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\test-zh-entities-per-sentence.parquet")

In [37]:
df_entities_sentence_zh_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,"{'type': 'Drug', 'tokens': ['as', '##ci', '##m...","{'type': 'MedicalTestingItems', 'tokens': ['bc...","{'type': 'MedicalTestingItems', 'tokens': ['豆'...","{'type': 'Drug', 'tokens': ['抑', '制', '剂']}","{'type': 'Drug', 'tokens': ['ab', '##l', '激', ...","{'type': 'OrganOrCellDamage', 'tokens': ['bc',...",,,,,...,,,,,,,,,,
1,"{'type': 'Microbiology', 'tokens': ['as', '##c...","{'type': 'OrganOrCellDamage', 'tokens': ['bc',...","{'type': 'OrganOrCellDamage', 'tokens': ['看', ...","{'type': 'OrganOrCellDamage', 'tokens': ['ga',...","{'type': 'Microbiology', 'tokens': ['t3', '##1...",,,,,,...,,,,,,,,,,
2,"{'type': 'Drug', 'tokens': ['as', '##ci', '##m...",{'type': 'DiseaseNameOrComprehensiveCertificat...,"{'type': 'TreatmentOrPreventionProcedures', 't...",,,,,,,,...,,,,,,,,,,
3,{'type': 'DiseaseNameOrComprehensiveCertificat...,{'type': 'DiseaseNameOrComprehensiveCertificat...,"{'type': 'Drug', 'tokens': ['atp', '竞', '争', '...",,,,,,,,...,,,,,,,,,,
4,"{'type': 'Drug', 'tokens': ['as', '##ci', '##m...",,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2097,"{'type': 'BodyParts', 'tokens': ['骨', '髓']}","{'type': 'InspectionProcedure', 'tokens': ['基'...",,,,,,,,,...,,,,,,,,,,
2098,{'type': 'DiseaseNameOrComprehensiveCertificat...,,,,,,,,,,...,,,,,,,,,,
2099,"{'type': 'Symptom', 'tokens': ['t', '##n', '##...","{'type': 'Symptom', 'tokens': ['能', '障', '碍']}",{'type': 'DiseaseNameOrComprehensiveCertificat...,"{'type': 'MedicalProcedures', 'tokens': ['体', ...","{'type': 'BodyParts', 'tokens': ['t', '细', '胞']}","{'type': 'MedicalTestingItems', 'tokens': ['γ'...","{'type': 'MedicalTestingItems', 'tokens': ['肿'...",,,,...,,,,,,,,,,
2100,"{'type': 'InspectionProcedure', 'tokens': ['[C...","{'type': 'InspectionProcedure', 'tokens': ['解'...",,,,,,,,,...,,,,,,,,,,


In [13]:
chinese_entities[0]

{'type': 'TreatmentOrPreventionProcedures', 'tokens': ['激', '素', '疗', '法']}

In [38]:
test_entities_by_row = get_entities(df_entities_sentence_zh_test)
test_entities_by_row.head()

100%|██████████| 2102/2102 [00:00<00:00, 300225.67it/s]


Unnamed: 0,entities
0,"[as##ci##mini##b, bc##r-ab##l##1蛋白, 豆蔻酰位点, 抑制剂..."
1,"[as##ci##mini##b, bc##r-ab##l##1, 看门基因, ga##te..."
2,"[as##ci##mini##b, 费城染色体阳性白血病, 抗白血病]"
3,"[慢, 加速期慢性髓系白血病(cm##l), atp竞争性酪氨酸激酶抑制剂(t##ki)]"
4,[as##ci##mini##b]


In [39]:
df_test = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet")
df_test.head()

Unnamed: 0,chinese,english
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ..."
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...


In [40]:
df_test["entities"] = test_entities_by_row["entities"]
df_test.head()

Unnamed: 0,chinese,english,entities
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...,"[as##ci##mini##b, bc##r-ab##l##1蛋白, 豆蔻酰位点, 抑制剂..."
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...,"[as##ci##mini##b, bc##r-ab##l##1, 看门基因, ga##te..."
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...,"[as##ci##mini##b, 费城染色体阳性白血病, 抗白血病]"
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ...","[慢, 加速期慢性髓系白血病(cm##l), atp竞争性酪氨酸激酶抑制剂(t##ki)]"
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...,[as##ci##mini##b]


In [41]:
df_test.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test_entities.parquet")

In [15]:
export_entities = ["".join(x['tokens']) for x in chinese_entities]

In [16]:
export_entities[0]

'激素疗法'

### Get English Triples

In [None]:
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
texts = df.english.tolist()
# Extract entities
english_entities = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = pipe(text)
    for entity in entities:
        # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
        english_entities.append(entity)
df_entities_en = pd.DataFrame(english_entities)

In [18]:
import requests

def get_entity_triples(entity_name):
    # Construct the query URL
    url = f"http://shuyantech.com/api/cndbpedia/avpair?q={entity_name}"
    
    # Send GET request to the API
    response = requests.get(url)
    
    # Check if request was successful
    if response.status_code == 200:
        data = response.json()
        
        if data['status'] == 'ok':
            return data['ret']  # Return the list of attribute-value pairs
        else:
            print(f"Error: {data['status']}")
            return []
    else:
        print(f"Request failed with status code {response.status_code}")
        return []

# Example usage
entity = "复旦大学"
triples = get_entity_triples(entity)
print(triples)


[['中文名', '复旦大学'], ['外文名称', 'Fudan University'], ['简称', '复旦·FUDAN'], ['创办时间', '1905年06月29日'], ['创办人', '马相伯'], ['办学性质', '公办大学'], ['学校类别', '综合类'], ['学校特色', '双一流(2017年、2022年)'], ['学校特色', '985工程(1999年)'], ['学校特色', '211工程(1994年)'], ['学校特色', '111计划(2006年)'], ['学校特色', '卓越医生教育培养计划(2012年)'], ['学校特色', '卓越法律人才培养计划(2012年)'], ['学校特色', '环太平洋大学联盟(1997年)'], ['学校特色', '全国重点大学(1959年)'], ['学校特色', '中俄综合性大学联盟(2016年)'], ['学校特色', '全球大学高研院联盟'], ['学校特色', '九校联盟（C9）(2018年)'], ['学校特色', '中国旅游教育合作联盟'], ['学校特色', '亚洲校园'], ['学校特色', '医学“双一流”建设联盟(2018年)'], ['主管部门', '中华人民共和国教育部'], ['现任领导', '裘新(党委书记)、金力(校长)'], ['专职院士数', '中国科学院院士21人'], ['专职院士数', '中国工程院院士5人'], ['本科专业', '80个'], ['硕士点', '学术学位授权一级学科43个'], ['硕士点', '专业学位授权类别30个'], ['博士点', '学术学位授权一级学科40个'], ['博士点', '专业学位授权类别5个'], ['博士后', '科研流动站37个'], ['国家重点学科', '一级学科11个'], ['国家重点学科', '二级学科19个、培育学科3个'], ['院系设置', '35个'], ['校训', '博学而笃志'], ['校训', '切问而近思'], ['校歌', '复旦大学校歌'], ['校庆日', '5月27日'], ['院训', '博学而笃志'], ['院训', '切问而近思'], ['地址', '邯郸校区：上海市杨浦区邯郸路220号'], ['地址', '枫林校区：上海市徐汇区东安路130号'], [

In [23]:
import pandas as pd

# Create an empty DataFrame with columns for entity, attribute, and value
columns = ['Entity', 'Attribute', 'Value']
triples_array = []
df_triples = pd.DataFrame(columns=columns)


for entity in tqdm.tqdm(export_entities):
    triples = get_entity_triples(entity)
    for i in triples:
        triple = [entity, *i]
        triples_array.append(triple)

  0%|          | 16/206573 [00:07<26:02:13,  2.20it/s]


KeyboardInterrupt: 

In [24]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# API base URL for cndbpedia
BASE_URL = "http://shuyantech.com/api/cndbpedia/avpair"

# Function to call the API and retrieve triples for an entity
failed_entities = []
def fetch_triples(entity):
    global failed_entities
    url = f"{BASE_URL}?q={entity}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()  # Assuming the response is in JSON format
        if data['status'] == 'ok':
            triples = data['ret']
            return entity, triples
        else:
            return entity, []
    except requests.exceptions.RequestException as e:
        failed_entities.append(entity)
        return entity, []

# Function to store the triples in a DataFrame
def store_triples_in_df(entities):
    df = pd.DataFrame(columns=['Entity', 'Attribute', 'Value'])
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit tasks to be run concurrently
        futures = {executor.submit(fetch_triples, entity): entity for entity in entities}

        # Collect results as they complete
        for future in tqdm.tqdm(as_completed(futures)):
            entity, triples = future.result()
            for triple in triples:
                # Each triple is expected to be in the format [attribute, value]
                df = df.append({'Entity': entity, 'Attribute': triple[0], 'Value': triple[1]}, ignore_index=True)
    
    return df

# List of entities to fetch data for
entities = export_entities

# Get the triples and store in DataFrame
df = store_triples_in_df(entities)


1it [00:00,  3.46it/s]


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-triples.parquet")

In [17]:
ans

[{'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['慢', '阻', '肺']},
 {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['冠', '心', '病']},
 {'type': 'Drug', 'tokens': ['心', '可', '舒']},
 {'type': 'Drug', 'tokens': ['保', '心', '丸']},
 {'type': 'DiseaseNameOrComprehensiveCertificate',
  'tokens': ['双', '下', '肢', '静', '脉', '血', '栓']},
 {'type': 'DiseaseNameOrComprehensiveCertificate',
  'tokens': ['左', '侧', '腹', '股', '沟', '斜', '疝', '无', '张', '力', '修', '补', '术']},
 {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['高', '血', '压']},
 {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['糖', '尿', '病']},
 {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['肝', '炎']},
 {'type': 'DiseaseNameOrComprehensiveCertificate', 'tokens': ['结', '核']}]

# Code for extracting the train/test data for NEJM

In [7]:
import pandas as pd

def combine_sentence_pairs(chinese_file, english_file):
    # Open and read the files
    with open(chinese_file, 'r', encoding='utf-8') as ch_file:
        chinese_sentences = [line.strip() for line in ch_file]
        
    with open(english_file, 'r', encoding='utf-8') as en_file:
        english_sentences = [line.strip() for line in en_file]
    
    # Ensure both files have the same number of lines
    if len(chinese_sentences) != len(english_sentences):
        raise ValueError("The number of sentences in the Chinese and English files do not match.")
    
    # Create a DataFrame with the paired sentences
    df = pd.DataFrame({
        'chinese': chinese_sentences,
        'english': english_sentences
    })
    
    return df

# Example: Replace these file paths with your actual files
chinese_file = './nejm/processed_data/open_access/open_access/nejm.train.zh'
english_file = './nejm/processed_data/open_access/open_access/nejm.train.en'

df_train = combine_sentence_pairs(chinese_file, english_file)

# Show the first few rows of the DataFrame
df_train.head()
df_train.to_parquet('nejm/nejm_train.parquet')


In [1]:
tokenized_dataset = load_dataset("parquet",data_files="nejm/tokenized/zh-en-tokenized-train-working-model.parquet")["train"]
tokenized_input = tokenized_dataset['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(tokenized_input)
tokens[5:10]

NameError: name 'load_dataset' is not defined

In [8]:
df_train.head()

Unnamed: 0,chinese,english
0,也许 不能 : 分析 结果 提示 激素 疗法 在 维持 去 脂 体重 方面 作用 很小 .,probably not : analysis suggests minimal effec...
1,与 衰老 相关 的 肌肉 萎缩 是 活动 能力 下降 的 重要 预测 因素 .,age @-@ related loss of muscle mass ( sarcopen...
2,研究者 确定 了 12 项 评估 去 脂 体重 变化 情况 的 绝经期 激素 疗法 随机 试验 .,investigators identified 12 randomized trials ...
3,"4 , 474 名 参与者 的 平均年龄 为 58 岁 , 平均 随访 时间 为 2 年 .","among 4474 participants , mean age was 58 and ..."
4,强化 老年人 出院 时 的 血压 控制 方案 与 再 入院 和 严重 不良 事件 增加 相关 .,Intensifying blood pressure regimens at discha...


In [10]:
# Example: Replace these file paths with your actual files
chinese_file = './nejm/processed_data/open_access/open_access/nejm.test.zh'
english_file = './nejm/processed_data/open_access/open_access/nejm.test.en'

df_test = combine_sentence_pairs(chinese_file, english_file)

# Show the first few rows of the DataFrame
df_test.head()
df_test.to_parquet('nejm/nejm_test.parquet')

In [11]:
df_test.head()

Unnamed: 0,chinese,english
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ..."
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...


# Below is code for unused data sourced from various WMT challenges in previous years

In [1]:
import pandas as pd
wmt22_df = pd.read_parquet("wmt22_dataset.parquet")
wmt22_df.head()

Unnamed: 0,english,chinese
0,To analyze the imaging characteristics of vert...,分析胸腰椎骨折后路复位术后椎体“空壳”影像学特征，探讨椎体“空壳”与骨折愈合间的关系。. 回...
1,We investigated the effects of lumination on h...,"为探究光照对虎斑乌贼受精卵孵化的影响,确定其胚胎发育的最佳光照条件,本研究采用单因子试验方法..."
2,To assess the effectiveness of percutaneous pe...,探讨微创经皮椎弓根植钉并同切口通道下减压治疗 A3 型（AO 分型）胸腰椎骨折的临床疗效。....
3,To investigate the factors affecting counting ...,肿瘤细胞生物治疗中单个核细胞采集的影响因素分析. 探讨在肿瘤细胞生物治疗单个核细胞采集中，影...
4,To explore the perception of normality in life...,目的: 探讨慢性心力衰竭患者对生活常态的认知。 设计: 进行诠释现象学研究。 方法: 201...


In [1]:
import gzip
import pandas as pd

def read_wmt_data(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # Read the data into a list of tuples (source, target, domain)
        data = []
        for line in f:
            src_line, tgt_line, domain = line.strip().split('\t')
            data.append((src_line, tgt_line, domain))
        
    # Convert the list of tuples into a DataFrame for easier manipulation
    df = pd.DataFrame(data, columns=["source", "target", "domain"])
    return df

# Example: Replace 'path_to_your_file.gz' with your actual file path
file_path = '/Users/linaa/Github/corpora/wmt18/corpus.gz'
df = read_wmt_data(file_path)

# Optionally filter by domain (if needed)
filtered_df = df[df['domain'] == 'Medical']  # Replace 'domain-indic' with your desired domain

# Save the DataFrame as Parquet
filtered_df.to_parquet('filtered_data.parquet', index=False)


In [7]:
df[df['domain']=='NEU'].head()

Unnamed: 0,source,target,domain
6994864,心如 在 ， 梦 就 在 ！,"where there is a heart , there is a dream !",NEU
6994865,结果 提高 护理人员 的 基本素质 ； 做好 患者 的 心理 护理 和 乙酰唑胺 负荷 脑 ...,results The key nursing methods were to promot...,NEU
6994866,基因组 和 转录 组 分析 无疑 将 补充 现有 的 蛋白质 组和 遗传学 知识 。,genome and transcriptome analyses will complem...,NEU
6994867,如果 有 任何 需要 的 那种 、 请 不要 犹豫 与 我们 联系 、 我们 一直 在 您 ...,"in any kind of need , please do not hesitate t...",NEU
6994868,花瓣 匙 形微 缺 ， 约 1 毫米 。,"petals spatulate @-@ emarginate , ca . 1 mm .",NEU


In [2]:
df.domain.unique()

array(['CASIA2015', 'CASICTA', 'CASICTB', 'CASICT2015', 'DATA2011',
       'BOOKNOSPLIT', 'BOOKSPLIT', 'NEU', 'NEWSCOMM', 'UN'], dtype=object)

In [8]:
import pandas as pd
import re
import stanza

# Initialize Stanza for Chinese (CoreNLP)
stanza_nlp = stanza.Pipeline('zh', processors='tokenize')

# Example dataframe with English and Chinese abstracts
data = {
    "English": [
        "To analyze the imaging characteristics of vertebral 'shell' phenomenon of thoracolumbar fractures after posterior reduction and to explore the relationship between vertebral 'shell' and fracture healing. "
        "Between January 2013 and December 2015, the clinical data of 116 patients with thoracolumbar fractures treated with posterior pedicle screw-rod system reduction and internal fixation were analyzed retrospectively. "
        "There were 72 males and 44 females, aged 22-66 years (mean, 43 years). Injury causes were traffic accident in 24 cases, falling from height in 54 cases, bruise in 38 cases. Fracture segment located at T <sub>11</sub> in 5 cases, T <sub>12</sub> in 38 cases, L <sub>1</sub> in 52 cases, L <sub>2</sub> in 21 cases. "
        "There were 51 cases of compressive fracture and 65 cases of burst fracture. The sagittal Cobb angle ranged from 8 to 27°, with an average of 15°. Degree of preoperative spinal compression ranged from 20% to 75%, with an average of 44%. "
        "Bone density measurement showed that normal bone mass in 30 cases, bone loss in 40 cases, osteoporosis in 41 cases, and severe osteoporosis in 5 cases. The number, pathological characteristics, and imaging regularity of the vertebral 'shell' phenomenon were observed and analyzed by logistic regression. "
        "All patients were followed up 11-18 months with an average of 13 months. A total of 72 cases of vertebral 'shell' phenomenon mainly located in the vertebral anterior column and the end plate near the weak area (54/72, 75.0%). "
        "Most of them were in the irregular shape (50/72, 69.5%). The vertebral fracture line was related to the shape of the vertebral body and the displacement of the vertebral body after reduction. The outcome of the 'shell' can be divided into disappeared type, reduced type, and collapse type, the volume of vertebral 'shell' and its outcome were the risk factors for vertebral fracture healing. "
        "The incidence of vertebral 'shell' and nonuion of thoracolumbar fractures after posterior reduction are high. The main influencing factors are vertebral 'shell' outcome and size."
    ],
    "Chinese": [
        "分析胸腰椎骨折后路复位术后椎体“空壳”影像学特征，探讨椎体“空壳”与骨折愈合间的关系。. "
        "回顾分析 2013 年 1 月—2015 年 12 月，采用经后路椎弓根钉棒系统复位内固定术治疗的 116 例胸腰椎骨折患者临床资料。男 72 例，女 44 例；年龄 22～66 岁，平均 43 岁。致伤原因：交通事故伤 24 例，高处坠落伤 54 例，重物砸伤 38 例。"
        "骨折节段：T <sub>11</sub> 5 例，T <sub>12</sub> 38 例，L <sub>1</sub> 52 例，L <sub>2</sub> 21 例。压缩性骨折 51 例，爆裂性骨折 65 例。矢状面 Cobb 角 8～27°，平均 15°；伤椎前缘压缩程度 20%～75%，平均 44%。"
        "骨密度测量显示：骨量正常 30 例，骨量减少 40 例，骨质疏松 41 例，严重骨质疏松 5 例。观察术后椎体“空壳”现象发生例数、病理特点及影像学规律，并进行多因素 logistic 回归分析。."
        "术后患者均获随访，随访时间 11～18 个月，平均 13 个月。共 72 例出现椎体“空壳”现象，主要集中于椎体前柱及上终板薄弱区附近（54/72，75.0%），以不规则形为主（50/72，69.5%）。"
        "椎体骨折线走行与椎体“空壳”形态和复位后椎体骨折块移位有关；“空壳”形态转归可分为消失型、缩小型和塌陷型，椎体“空壳”体积和转归类型是影响椎体骨折愈合的危险因素。."
        "胸腰椎骨折后路复位术后椎体“空壳”发生率及骨折不愈合率均较高，椎体“空壳”转归类型及体积是其主要影响因素。."
    ]
}

df = pd.DataFrame(data)

# Function to split English sentences based on punctuation (handling decimals correctly)
def split_english_sentences(text):
    # Regex to match sentence-ending punctuation, avoiding numbers and abbreviations
    sentence_endings = r'(?<!\d)\.(?!\d)(?<=\.|\!|\?)\s*'
    
    # Split based on sentence-ending punctuation
    sentences = [sentence.strip() for sentence in re.split(sentence_endings, text.strip()) if sentence]
    return sentences

# Function to split Chinese sentences using Stanza (CoreNLP)
def split_chinese_sentences(text):
    try:
        doc = stanza_nlp(text)
        return [sentence.text.strip() for sentence in doc.sentences]
    except Exception as e:
        print(f"Error processing Chinese text: {e}")
        return []

# Function to align the sentences of both English and Chinese texts
def align_sentences(english_text, chinese_text):
    english_sentences = split_english_sentences(english_text)
    chinese_sentences = split_chinese_sentences(chinese_text)

    print("English sentences:", english_sentences)  # Debugging
    print("Chinese sentences:", chinese_sentences)  # Debugging

    # Ensure both lists are of the same length by padding with None if necessary
    max_length = max(len(english_sentences), len(chinese_sentences))
    english_sentences.extend([None] * (max_length - len(english_sentences)))
    chinese_sentences.extend([None] * (max_length - len(chinese_sentences)))

    return pd.DataFrame({
        "English": english_sentences,
        "Chinese": chinese_sentences
    })

# Process each abstract and split sentences, aligning the English and Chinese sentences
aligned_sentences = pd.concat(
    [align_sentences(row["English"], row["Chinese"]) for _, row in df.iterrows()],
    ignore_index=True
)

aligned_sentences.head()

2024-11-30 14:27:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 21.3MB/s]                    
2024-11-30 14:27:48 INFO: Downloaded file to /Users/linaa/stanza_resources/resources.json
2024-11-30 14:27:48 INFO: "zh" is an alias for "zh-hans"
2024-11-30 14:27:48 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsdsimp |

2024-11-30 14:27:48 INFO: Using device: cpu
2024-11-30 14:27:48 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-30 14:27:48 INFO: Done loading processors!


English sentences: ["To analyze the imaging characteristics of vertebral 'shell' phenomenon of thoracolumbar fractures after posterior reduction and to explore the relationship between vertebral 'shell' and fracture healing", 'Between January 2013 and December 2015, the clinical data of 116 patients with thoracolumbar fractures treated with posterior pedicle screw-rod system reduction and internal fixation were analyzed retrospectively', 'There were 72 males and 44 females, aged 22-66 years (mean, 43 years)', 'Injury causes were traffic accident in 24 cases, falling from height in 54 cases, bruise in 38 cases', 'Fracture segment located at T <sub>11</sub> in 5 cases, T <sub>12</sub> in 38 cases, L <sub>1</sub> in 52 cases, L <sub>2</sub> in 21 cases', 'There were 51 cases of compressive fracture and 65 cases of burst fracture', 'The sagittal Cobb angle ranged from 8 to 27°, with an average of 15°', 'Degree of preoperative spinal compression ranged from 20% to 75%, with an average of 44

Unnamed: 0,English,Chinese
0,To analyze the imaging characteristics of vert...,分析胸腰椎骨折后路复位术后椎体“空壳”影像学特征，探讨椎体“空壳”与骨折愈合间的关系。
1,"Between January 2013 and December 2015, the cl...",. 回顾分析 2013 年 1 月—2015 年 12 月，采用经后路椎弓根钉棒系统复位内固...
2,"There were 72 males and 44 females, aged 22-66...",男 72 例，女 44 例；年龄 22～66 岁，平均 43 岁。
3,Injury causes were traffic accident in 24 case...,致伤原因：交通事故伤 24 例，高处坠落伤 54 例，重物砸伤 38 例。
4,Fracture segment located at T <sub>11</sub> in...,骨折节段：T <sub>11</sub> 5 例，T <sub>12</sub> 38 例，...


In [14]:
import pandas as pd
pd.read_csv("test_data/gold_test_files/medline_en2zh_zh.txt", sep="\t", header=None, names=["DocID", "SentenceID", "TargetSentence"])

Unnamed: 0,DocID,SentenceID,TargetSentence
0,doc1,1,"磷虾是南大洋生态系统的基本组成部分,它们在南极海洋食物网中扮演着关键角色。"
1,doc1,2,"磷虾可能是人类未开发的最大动物蛋白质来源,人类对磷虾资源开发的兴趣日趋增加,但目前关于这些种..."
2,doc1,3,"本文对已有相关研究进行梳理,揭示了不同种磷虾卵巢发育经历相似的生理步骤;磷虾卵母细胞大小存在..."
3,doc1,4,"在此基础上,本文对今后的研究重点提出展望:1)加强南极大磷虾之外磷虾生殖特性的研究,探索磷虾..."
4,doc2,1,鼻蝇蛆病是一种较为罕见的寄生虫病，蝇蛆在鼻腔内生长对鼻腔、鼻窦造成损伤，一旦移行，会导致眼、...
...,...,...,...
357,doc49,1,"近年来,中国近地面臭氧污染问题不断恶化。"
358,doc49,2,"本研究将考虑的主要气象参数包括地表温度、太阳辐射和风速,分为两个方面:(1)光化学反应条件(..."
359,doc49,3,"通过建立臭氧日最大8小时第90百分位浓度与MSI的线性响应关系,发现气象因素对近地面臭氧的贡..."
360,doc49,4,其中有利的光化学反应条件对近地面臭氧污染更为重要。


In [2]:
string = 'Previous research suggested reduced well-being and quality of life in couples with an unfulfilled desire to have a child. However, changes in psychological variables in infertile couples after successful in-vitro fertilization (IVF) have been scarcely investigated. This prospective study explored changes in life satisfaction, stress burden and habitual worry related to\xa0the birth of a child in couples undergoing IVF, and in those experiencing natural pregnancy. In total, 77 couples with successful IVF and 50 couples with natural pregnancy completed the Life Satisfaction Questionnaire, Perceived Stress Questionnaire and Penn State Worry Questionnaire; data were recorded before pregnancy (baseline) and 6 and 12\xa0months after childbirth. Multi-level models were applied for data analysis. Couples with IVF reported lower life satisfaction, and higher stress burden and worry, than those with natural pregnancy at baseline. Moreover, they showed a steep increase in life satisfaction at 6 and 12 months after childbirth, and decreased stress and worry. In couples with natural pregnancy, life satisfaction scores decreased, and those of stress and worry increased, at month 6 after childbirth and returned to initial state at month 12. The group difference at baseline underlines the psychosocial burden of infertility. However, the increase in life satisfaction and decreases in stress and worry suggest that the burden is lessened after the birth of a child. The changes in couples with natural pregnancy reflect the impact of the typical challenges posed by childbirth and successful readjustment during the first year of the child´s life.'
split = string.split(".")
split

['Previous research suggested reduced well-being and quality of life in couples with an unfulfilled desire to have a child',
 ' However, changes in psychological variables in infertile couples after successful in-vitro fertilization (IVF) have been scarcely investigated',
 ' This prospective study explored changes in life satisfaction, stress burden and habitual worry related to\xa0the birth of a child in couples undergoing IVF, and in those experiencing natural pregnancy',
 ' In total, 77 couples with successful IVF and 50 couples with natural pregnancy completed the Life Satisfaction Questionnaire, Perceived Stress Questionnaire and Penn State Worry Questionnaire; data were recorded before pregnancy (baseline) and 6 and 12\xa0months after childbirth',
 ' Multi-level models were applied for data analysis',
 ' Couples with IVF reported lower life satisfaction, and higher stress burden and worry, than those with natural pregnancy at baseline',
 ' Moreover, they showed a steep increase i

In [16]:
test = pd.read_parquet("en2zh_test.parquet")
test.head()

Unnamed: 0,Source,Target,PMID,DocID,SentenceID
0,Euphausiids (krill) is a basic component of th...,"磷虾是南大洋生态系统的基本组成部分,它们在南极海洋食物网中扮演着关键角色。",34898132,doc1,1
1,Krill may be the largest unexploited animal pr...,"磷虾可能是人类未开发的最大动物蛋白质来源,人类对磷虾资源开发的兴趣日趋增加,但目前关于这些种...",34898132,doc1,2
2,There is a great deal of interest in exploitin...,"本文对已有相关研究进行梳理,揭示了不同种磷虾卵巢发育经历相似的生理步骤;磷虾卵母细胞大小存在...",34898132,doc1,3
3,Our understanding is poor about the biology of...,"在此基础上,本文对今后的研究重点提出展望:1)加强南极大磷虾之外磷虾生殖特性的研究,探索磷虾...",34898132,doc1,4
4,Nasal myiasis is a rare parasitic disease,鼻蝇蛆病是一种较为罕见的寄生虫病，蝇蛆在鼻腔内生长对鼻腔、鼻窦造成损伤，一旦移行，会导致眼、...,35232917,doc2,1


# Extract entities from NEJM train set

In [None]:
import spacy
import pandas as pd

# Load a pre-trained NER model
nlp_source = spacy.load("zh_core_web_sm")  # Chinese NER
nlp_target = spacy.load("en_core_web_sm")  # English NER


def extract_entities(sentence, language="source"):
    nlp = nlp_source if language == "source" else nlp_target
    doc = nlp(sentence)
    entities = [ent.text for ent in doc.ents]  # Extract entities
    return entities

df = pd.read_parquet("./nejm/nejm_train.parquet")
en_entities = []
cn_entities = []
for index, row in df.iterrows():
  chinese = row["chinese"]
  cn_entities.append(extract_entities(chinese, language="source"))
  english = row["english"]
  en_entities.append(extract_entities(english, language="target"))

entities_df = pd.DataFrame({"chinese": cn_entities, "english": en_entities})

In [3]:
entities_df.to_csv("./nejm/entities.parquet")

In [4]:
entities_df.head()

Unnamed: 0,chinese,english
0,[],[]
1,[],[]
2,[12],[12]
3,"[4, 474, 58 岁, 2 年]","[4474, 58, 2 years]"
4,[],[]


In [9]:
chinese_entities[2]

{'entity_group': 'LABEL_0',
 'score': np.float32(0.5059147),
 'word': '分 析',
 'start': 8,
 'end': 10}

# Code for running baseline + Generate BLEU Score (Note that SacreBleu is 100x standard Bleu score from 0-1)

In [1]:
import pandas as pd
import tensorflow as tf

# Load Parquet file into Pandas
parquet_file = "nejm/nejm_test.parquet"
df = pd.read_parquet(parquet_file)

# Convert DataFrame to TensorFlow Dataset
def pandas_to_tf_dataset(df):
    return tf.data.Dataset.from_tensor_slices((df["english"].values, df["chinese"].values))

dataset = pandas_to_tf_dataset(df)

# Preview the dataset
for en_text, zh_text in dataset.take(5):
    print(f"English: {en_text.numpy().decode('utf-8')}")
    print(f"Chinese: {zh_text.numpy().decode('utf-8')}")


English: asciminib is an allosteric inhibitor that binds a myristoyl site of the BCR @-@ ABL1 protein , locking BCR @-@ ABL1 into an inactive conformation through a mechanism distinct from those for all other ABL kinase inhibitors .
Chinese: asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构抑制 剂 , 它 可 通过 不同于 所有 其他 ABL 激酶 抑制剂 的 机制 将 BCR - ABL1 锁定 在 非 活性 构象 .
English: asciminib targets both native and mutated BCR @-@ ABL1 , including the gatekeeper T315I mutant .
Chinese: asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括 看门 基因 ( gatekeeper ) T315I 突变体 .
English: the safety and antileukemic activity of asciminib in patients with Philadelphia chromosome @-@ positive leukemia are unknown .
Chinese: asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性 尚未 明确 .
English: in this phase 1 , dose @-@ escalation study , we enrolled 141 patients with chronic @-@ phase and 9 with accelerated @-@ phase chronic myeloid leukemia ( CML ) who had resistance to or unacceptable side effects from at least two previous AT

2024-11-30 22:51:38.948783: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [2]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
from sacrebleu import corpus_bleu
import pandas as pd
import nltk

# Download NLTK's Punkt tokenizer
nltk.download("punkt")

# Step 1: Load a pretrained translation model and tokenizer
def load_translation_model_tf(model_name="Helsinki-NLP/opus-mt-en-zh"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
    return model, tokenizer

# Step 2: Translate source sentences
def translate_sentences_tf(model, tokenizer, sentences, max_length=128):
    inputs = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True, max_length=max_length)
    outputs = model.generate(**inputs, max_length=max_length)
    translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return translations

# Step 3: Calculate BLEU score
def calculate_bleu_score(predictions, references):
    bleu = corpus_bleu(predictions, [references])
    print(f"BLEU score: {bleu.score}")
    return bleu

# Step 4: Process gold standard file and perform evaluation
def evaluate_translations_tf(model, tokenizer, gold_standard_file, source_lang, target_lang, output_file=None):
    # Load the gold standard translations
    df = pd.read_parquet(gold_standard_file)
    source_sentences = df[source_lang].tolist()
    reference_sentences = df[target_lang].tolist()

    # Generate translations
    predicted_translations = translate_sentences_tf(model, tokenizer, source_sentences)

    # Save translations to output file (optional)
    if output_file:
        output_df = pd.DataFrame({
            "source": source_sentences,
            "reference": reference_sentences,
            "prediction": predicted_translations
        })
        output_df.to_parquet(output_file, index=False)
        print(f"Saved predictions to {output_file}")

    # Evaluate BLEU
    bleu_score = calculate_bleu_score(predicted_translations, reference_sentences)
    return bleu_score, predicted_translations, reference_sentences


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/linaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


: 

In [None]:
# Step 1: Load the model and tokenizer
# Load the correct model for Chinese to English translation
# All model names use the following format: Helsinki-NLP/opus-mt-{src}-{tgt}:
model_name = "Helsinki-NLP/opus-mt-zh-en"  # Chinese to English Model as baseline value
model, tokenizer = load_translation_model_tf(model_name)

# Step 2: Set up file paths and column names
parquet_file = "nejm/nejm_test.parquet"  # Path to your gold standard file
source_lang = "english"  # Column for source sentences
target_lang = "chinese"  # Column for target sentences
output_file = "nejm/predictions/en2zh_baseline.parquet"  # Optional output file for predictions


# Step 3: Evaluate translations and calculate BLEU
bleu_score, predicted_translations, reference_translations = evaluate_translations_tf(
    model,
    tokenizer,
    parquet_file,
    source_lang,
    target_lang,
    output_file
)

# Step 4: Inspect translations
print("Sample Translations:")
for i in range(3):  # Display first 3 samples
    print(f"Source: {predicted_translations[i]}")
    print(f"Prediction: {predicted_translations[i]}")
    print(f"Reference: {reference_translations[i]}")
    print("-" * 50)

# Step 5: BLEU Score
print(f"Final BLEU Score: {bleu_score.score}")


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-zh.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


# Chinese -> English

In [1]:
from transformers import MarianMTModel, MarianTokenizer
import sacrebleu
import pandas as pd

def load_marian_model(model_name):
    # Load the MarianMT model and tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

def translate_sentences(tokenizer, model, sentences, batch_size=32):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs)
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)
    return translations

def calculate_bleu(predictions, references):
    # Use sacrebleu to calculate the BLEU score
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

# Replace these file paths with your actual text files
parquet_file = "nejm/nejm_test.parquet"
df = pd.read_parquet(parquet_file)

# Load MarianNMT pretrained model for zh-en translation
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer, model = load_marian_model(model_name)

# Translate Chinese sentences to English
df["predicted_english"] = translate_sentences(tokenizer, model, df["chinese"].tolist())

# Calculate BLEU score
bleu_score = calculate_bleu(df["predicted_english"].tolist(), df["english"].tolist())

print(f"BLEU Score: {bleu_score}")


BLEU Score: 18.213657715087557


In [3]:
df.head()

Unnamed: 0,chinese,english,predicted_english
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...,Asciminib is an alternative inhibitor that com...
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...,"At the same time, asciminib's target is used a..."
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...,The safety and anti-leukemia activity of ascim...
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ...","In this first dose increment study, we have in..."
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...,The primary purpose of this test is to determi...


In [12]:
parquet_file_train = "/Users/linaa/Github/corpora/nejm/nejm_train.parquet"
df_train = pd.read_parquet(parquet_file)
df_train.head()

Unnamed: 0,chinese,english
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ..."
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...
