# Fine-tuned models with tokenized entities

In [None]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset

# Define custom Entity-Aware Marian Model class
class EntityAwareMarianModel(MarianMTModel):
    def __init__(self, config, max_entities=100):
        super().__init__(config)
        # Entity embedding layer
        self.entity_embedding = nn.Embedding(max_entities, config.d_model)
        
        # Projection layer to integrate entity information
        self.entity_projection = nn.Linear(config.d_model, config.d_model)
        
    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
    # Standard translation model forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
        
        if entity_ids is not None:
            # Embed entities
            entity_embeddings = self.entity_embedding(entity_ids)  # Shape: [batch_size, max_entities, hidden_dim]
            
            # Pad or truncate the entity embeddings to match the input sequence length
            entity_embeddings_padded = entity_embeddings.unsqueeze(1).repeat(1, 512, 1)  # Repeat across sequence length
            
            # Project entity embeddings
            entity_features = self.entity_projection(entity_embeddings_padded)
            
            # Add entity features to encoder hidden states
            if hasattr(outputs, 'encoder_last_hidden_state'):
                outputs.encoder_last_hidden_state += entity_features

        return outputs


# Load and prepare the dataset function
def prepare_dataset(parquet_file, tokenizer, src_lang="chinese", tgt_lang="english", max_entity_length=10):
    # Load the Parquet file into a DataFrame
    df = pd.read_parquet(parquet_file)
    
    # Create an entity mapping
    unique_entities = set()
    for entities in df["entities"]:
        unique_entities.update(entities)
    
    # Create entity to ID mapping
    entity_to_id = {entity: idx for idx, entity in enumerate(unique_entities)}
    
    def preprocess_function(examples):
        processed_inputs = []
        processed_entity_ids = []
        
        for sentence, entities in zip(examples[src_lang], examples["entities"]):
            # Preprocess sentence (remove spaces)
            sentence = str(sentence).replace(" ", "")
            processed_inputs.append(sentence)
            
            # Map entities to their IDs, padding or truncating as needed
            sent_entity_ids = [
                entity_to_id.get(entity, 0)  # 0 as default/unknown entity
                for entity in entities
            ]
            # Pad or truncate to a fixed length (e.g., 10)
            sent_entity_ids = (sent_entity_ids + [0] * max_entity_length)[:max_entity_length]
            processed_entity_ids.append(sent_entity_ids)
        
        # Tokenize inputs
        model_inputs = tokenizer(
            processed_inputs,
            max_length=512,
            truncation=True,
            padding="max_length"
        )
        
        # Tokenize labels
        labels = tokenizer(
            examples[tgt_lang],
            max_length=512,
            truncation=True,
            padding="max_length"
        )
        
        # Add entity IDs and labels to the model inputs
        model_inputs["entity_ids"] = processed_entity_ids
        model_inputs["labels"] = labels["input_ids"]
        
        return model_inputs
    
    # Convert to Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df)
    tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)
    
    # Remove the "entities" column as it is no longer needed
    tokenized_dataset = tokenized_dataset.remove_columns(["entities"])
    tokenized_dataset = tokenized_dataset.remove_columns([src_lang, tgt_lang])
    
    return tokenized_dataset, entity_to_id

# Fine-tuning the custom model
def fine_tune_custom_model(model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)
    
    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        predict_with_generate=True,
        push_to_hub=False,
    )
    
    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )
    
    # Train
    trainer.train()

# Load entity-aware model function
def load_entity_aware_model(model_name, unique_entities, max_entities=100):
    # Load original model
    model = MarianMTModel.from_pretrained(model_name)
    
    # Create custom model with entity awareness
    config = model.config
    entity_aware_model = EntityAwareMarianModel(config, max_entities)
    
    # Copy weights from original model
    entity_aware_model.load_state_dict(model.state_dict(), strict=False)
    
    # Load tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    
    return tokenizer, entity_aware_model

# Main execution
model_name = "Helsinki-NLP/opus-mt-zh-en"
parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train_entities.parquet"
output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-1"
path_to_entities = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet"

# Load entities first
named_entities_df = pd.read_parquet(path_to_entities)
unique_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]

# Load model with entity mapping
tokenizer, model = load_entity_aware_model(model_name, unique_entities)

# Prepare dataset using entity mapping
tokenized_dataset, entity_to_id = prepare_dataset(parquet_file, tokenizer)

In [None]:
tokenized_dataset

In [None]:
import torch
from torch import nn
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd
from datasets import Dataset, load_dataset
import torch.nn.functional as F
import os
import json
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

def load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens):
    # Load tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({
        'additional_special_tokens': list(set(special_tokens))
    })

    # Load base model
    model = MarianMTModel.from_pretrained(model_name)
    
    # Create custom model, CustomMarianMTModel will create a BiomedicalEncoder object in init()
    custom_model = CustomMarianMTModel(
        config=model.config,
        hidden_size=hidden_size,
        special_token_size=len(special_tokens), 
    )


    # Resize token embeddings
    custom_model.resize_token_embeddings(len(tokenizer))
    
    return tokenizer, custom_model

class BiomedicalEncoder(nn.Module):
    def __init__(self, hidden_size, special_token_size):
        super(BiomedicalEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size
        
        # Adjust the linear layer to match input dimensions
        self.linear = nn.Linear(special_token_size, hidden_size)
        self.activation = nn.ReLU()

    def forward(self, entity_embeddings):
        # Reshape entity embeddings if necessary
        original_shape = entity_embeddings.shape
        
        # Flatten the tensor if it has more than 2 dimensions
        if len(original_shape) > 2:
            entity_embeddings = entity_embeddings.view(-1, original_shape[-1])
        
        # Ensure the input matches the expected dimension
        if entity_embeddings.size(1) != self.special_token_size:
            # If the input doesn't match, pad or truncate
            if entity_embeddings.size(1) < self.special_token_size:
                # Pad with zeros
                padding = torch.zeros(
                    entity_embeddings.size(0), 
                    self.special_token_size - entity_embeddings.size(1),
                    device=entity_embeddings.device
                )
                entity_embeddings = torch.cat([entity_embeddings, padding], dim=1)
            else:
                # Truncate
                entity_embeddings = entity_embeddings[:, :self.special_token_size]
        
        # Apply linear transformation and activation
        encoded = self.linear(entity_embeddings)
        return self.activation(encoded)

class CustomMarianMTModel(MarianMTModel):
    def __init__(self, config, hidden_size=512, special_token_size=206573, biomedicalEncoder=None):
        super().__init__(config)
        self.hidden_size = hidden_size
        self.special_token_size = special_token_size

        # Initialize biomedical encoder within the model
        if biomedicalEncoder == None:
            self.biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
        else:
            self.biomedical_encoder = biomedicalEncoder

        # Entity embedding for special tokens
        self.entity_embedding = nn.Embedding(special_token_size + 1, hidden_size)  # +1 for padding token

        # Projection layer to match vocabulary size
        self.entity_projection = nn.Linear(hidden_size, config.vocab_size)

    def save_custom(self, save_directory):
        # Create save directory if it doesn't exist
        os.makedirs(save_directory, exist_ok=True)

        model_save_path = os.path.join(save_directory, "model")
        print(model_save_path)
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        os.makedirs(model_save_path, exist_ok=True)
        os.makedirs(tokenizer_save_path, exist_ok=True)

        # Save the model and its configuration
        self.save_pretrained(model_save_path)

        # Save the biomedical encoder's state_dict
        torch.save(self.biomedical_encoder.state_dict(), os.path.join(model_save_path, "biomedical_encoder.pth"))

        # Save custom attributes in a JSON file
        custom_config = {
            "hidden_size": self.hidden_size,
            "special_token_size": self.special_token_size,
        }
        with open(os.path.join(model_save_path, "custom_config.json"), "w") as f:
            json.dump(custom_config, f)

        if tokenizer is not None:
            tokenizer.save_pretrained(tokenizer_save_path)

    @classmethod
    def from_custom(cls, save_directory):
        model_save_path = os.path.join(save_directory, "model")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load custom attributes from JSON
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Load base model configuration
        model = MarianMTModel.from_pretrained(model_save_path)

        # Create a new CustomMarianMTModel with the loaded configuration
        new_model = cls(
            config=model.config, 
            hidden_size=custom_config["hidden_size"],
            special_token_size=custom_config["special_token_size"]
        )

        # Load the biomedical encoder state dict
        biomedical_encoder_path = os.path.join(model_save_path, "biomedical_encoder.pth")
        biomedical_encoder_state_dict = torch.load(biomedical_encoder_path)
        new_model.biomedical_encoder.load_state_dict(biomedical_encoder_state_dict)

        # Load the main model weights
        state_dict = model.state_dict()
        new_model_state_dict = new_model.state_dict()
        
        # Update the state dictionary, keeping the biomedical encoder weights
        for key, value in state_dict.items():
            if key in new_model_state_dict:
                new_model_state_dict[key] = value
        
        new_model.load_state_dict(new_model_state_dict, strict=False)

        # Load tokenizer
        tokenizer = MarianTokenizer.from_pretrained(tokenizer_save_path)

        return new_model, tokenizer

    def forward(self, input_ids=None, attention_mask=None, labels=None, entity_ids=None, **kwargs):
        # Perform base MarianMT forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

        # Process entity information if provided
        if entity_ids is not None:
            try:
                # Ensure entity_ids is a tensor with 2 dimensions
                if len(entity_ids.shape) == 1:
                    entity_ids = entity_ids.unsqueeze(0)

                # Get batch size, sequence length, and vocab size from outputs
                batch_size = outputs.logits.size(0)
                sequence_length = outputs.logits.size(1)
                vocab_size = outputs.logits.size(2)

                # Ensure entity_ids is on the same device as outputs.logits
                entity_ids = entity_ids.to(outputs.logits.device)

                # Limit entity_ids to current batch size
                entity_ids = entity_ids[:batch_size]

                if torch.any(entity_ids >= self.entity_embedding.num_embeddings):
                    print(f"Invalid entity IDs detected: {entity_ids}")
                    raise ValueError("Entity IDs are out of bounds for the embedding layer")


                # Get embeddings for entity special tokens
                entity_embeddings = self.entity_embedding(entity_ids)

                # Ensure embeddings are on the correct device
                entity_embeddings = entity_embeddings.to(outputs.logits.device)

                # Process through biomedical encoder
                original_shape = entity_embeddings.shape
                entity_features = self.biomedical_encoder(entity_embeddings.view(-1, original_shape[-1]))

                # Reshape back to original batch and entity dimension
                entity_features = entity_features.view(original_shape[0], original_shape[1], -1)

                # Project entity features to match logits dimensionality
                entity_logits = self.entity_projection(entity_features)

                # Ensure logits are on the correct device
                entity_logits = entity_logits.to(outputs.logits.device)

                # Create a tensor of zeros with the same shape as outputs.logits
                expanded_entity_logits = torch.zeros_like(outputs.logits)

                # Adjust logits shape to match the entity length
                min_entities = min(entity_logits.size(1), expanded_entity_logits.size(1))
                min_vocab = min(entity_logits.size(2), expanded_entity_logits.size(2))

                expanded_entity_logits[:, :min_entities, :min_vocab] = entity_logits[:, :min_entities, :min_vocab]

                # Add entity-based logits to original logits
                outputs.logits = outputs.logits + expanded_entity_logits

            except Exception as e:
                print(f"Error in forward method: {e}")
                raise
        torch.cuda.synchronize()
        return outputs



def prepare_dataset(dataset, tokenizer, src_lang="chinese", tgt_lang="english", max_entities=5):
    def preprocess_function(examples):
        # Ensure inputs are lists
        src_sentences = examples[src_lang]
        tgt_sentences = examples[tgt_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])

        processed_src = []
        processed_entities = []
        
        for sentence, entities in zip(src_sentences, entities_list):
            # Ensure sentence is a string and remove existing spaces
            sentence = str(sentence).replace(" ", "")
            
            # Add special tokens for entities
            for entity in entities:
                sentence = sentence.replace(entity, f"<<{entity}>>")
            
            processed_src.append(sentence)
            
            # Convert entities to token IDs
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>") 
                for entity in entities
            ]
            
            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))
            
            # Debugging: Log entity ids and padding
            # print(f"Entity IDs (after padding/truncation): {entity_ids}")

            processed_entities.append(entity_ids)

        # Tokenize source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Tokenize target sentences
        labels = tokenizer(
            tgt_sentences,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Add labels to model inputs
        model_inputs["labels"] = labels["input_ids"]
        
        # Convert entity_ids to tensor
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)
        
        return model_inputs

    # Apply preprocessing to the dataset
    processed_dataset = dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=dataset.column_names
    )

    return processed_dataset

def fine_tune_custom_model(custom_model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        save_safetensors=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
        fp16=False  # Disable mixed precision
    )

    # Create trainer
    trainer = Seq2SeqTrainer(
        model=custom_model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configuration
model_name = "Helsinki-NLP/opus-mt-zh-en"
output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2"
hidden_size = 512

# Load named entities
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]
special_token_size = len(special_tokens)

# Load tokenizer and custom model
tokenizer, custom_model = load_marian_with_biomedical_layer(
    model_name, 
    hidden_size, 
    special_tokens
)

# Load dataset
dataset = load_dataset("parquet", data_files="nejm/nejm_train_entities.parquet")["train"]

# Prepare dataset
tokenized_dataset = prepare_dataset(
    dataset, 
    tokenizer, 
    src_lang="chinese", 
    tgt_lang="english"
)

dataset.to_parquet("nejm/512_tokenized_nejm_train_entities.parquet")

Map: 100%|██████████| 62127/62127 [00:21<00:00, 2886.20 examples/s]
Creating parquet from Arrow format: 100%|██████████| 63/63 [00:00<00:00, 520.05ba/s]


25052770

: 

In [None]:

# Configuration
model_name = "Helsinki-NLP/opus-mt-zh-en"
output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2"
hidden_size = 512

# Load named entities
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]
special_token_size = len(special_tokens)

# Load tokenizer and custom model
tokenizer, custom_model = load_marian_with_biomedical_layer(
    model_name, 
    hidden_size, 
    special_tokens
)

# Load dataset
dataset = load_dataset("parquet", data_files="nejm/nejm_train_entities.parquet")["train"]

# Prepare dataset
tokenized_dataset = prepare_dataset(
    dataset, 
    tokenizer, 
    src_lang="chinese", 
    tgt_lang="english"
)

# Before training
# Move all components to GPU before training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)
# tokenizer = tokenizer.to(device)

# Add explicit error checking
torch.cuda.empty_cache()  # Clear GPU memory before training

# Fine-tune the model
fine_tune_custom_model(
    custom_model, 
    tokenizer, 
    tokenized_dataset, 
    output_dir
)

In [2]:
# Configuration
model_name = "Helsinki-NLP/opus-mt-zh-en"
output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2"
hidden_size = 512

# Load named entities
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]
special_token_size = len(special_tokens)

# Load tokenizer and custom model
tokenizer, custom_model = load_marian_with_biomedical_layer(
    model_name, 
    hidden_size, 
    special_tokens
)

tokenized_dataset = load_dataset("parquet", data_files="C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\tokenized\\zh-en-tokenized-train-working-model.parquet")["train"]

# Before training
# Move all components to GPU before training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)
# tokenizer = tokenizer.to(device)

# Add explicit error checking
torch.cuda.empty_cache()  # Clear GPU memory before training

# Fine-tune the model
fine_tune_custom_model(
    custom_model, 
    tokenizer, 
    tokenized_dataset, 
    output_dir
)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maalin[0m ([33maalin-uc-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin


  1%|          | 100/10485 [04:29<6:29:10,  2.25s/it]

{'loss': 8.9977, 'grad_norm': 10.107197761535645, 'learning_rate': 4.9523128278493086e-05, 'epoch': 0.03}


  2%|▏         | 200/10485 [10:01<5:33:17,  1.94s/it] 

{'loss': 5.7475, 'grad_norm': 8.768858909606934, 'learning_rate': 4.904625655698618e-05, 'epoch': 0.06}


  3%|▎         | 300/10485 [13:08<5:18:11,  1.87s/it]

{'loss': 4.0411, 'grad_norm': 5.347640514373779, 'learning_rate': 4.856938483547926e-05, 'epoch': 0.09}


  4%|▍         | 400/10485 [16:11<5:03:02,  1.80s/it]

{'loss': 2.9677, 'grad_norm': 1.1317561864852905, 'learning_rate': 4.8092513113972345e-05, 'epoch': 0.11}


  5%|▍         | 500/10485 [19:17<5:00:08,  1.80s/it]

{'loss': 2.4141, 'grad_norm': 0.8909512758255005, 'learning_rate': 4.761564139246543e-05, 'epoch': 0.14}


  6%|▌         | 600/10485 [22:27<5:02:38,  1.84s/it]

{'loss': 2.2436, 'grad_norm': 0.7377718687057495, 'learning_rate': 4.713876967095851e-05, 'epoch': 0.17}


  7%|▋         | 700/10485 [25:38<5:09:50,  1.90s/it]

{'loss': 2.1422, 'grad_norm': 0.7214361429214478, 'learning_rate': 4.66618979494516e-05, 'epoch': 0.2}


  8%|▊         | 800/10485 [28:48<5:07:24,  1.90s/it]

{'loss': 2.0997, 'grad_norm': 0.6726440787315369, 'learning_rate': 4.618502622794468e-05, 'epoch': 0.23}


  9%|▊         | 900/10485 [31:59<5:03:05,  1.90s/it]

{'loss': 1.9765, 'grad_norm': 0.6875271201133728, 'learning_rate': 4.570815450643777e-05, 'epoch': 0.26}


 10%|▉         | 1000/10485 [35:10<5:00:57,  1.90s/it]

{'loss': 1.9157, 'grad_norm': 0.732824444770813, 'learning_rate': 4.5231282784930856e-05, 'epoch': 0.29}


 10%|█         | 1100/10485 [38:17<4:41:00,  1.80s/it]

{'loss': 1.9045, 'grad_norm': 1.0072081089019775, 'learning_rate': 4.475441106342394e-05, 'epoch': 0.31}


 11%|█▏        | 1200/10485 [41:22<4:42:25,  1.82s/it]

{'loss': 1.8615, 'grad_norm': 0.8328938484191895, 'learning_rate': 4.4277539341917024e-05, 'epoch': 0.34}


 12%|█▏        | 1300/10485 [44:24<4:43:56,  1.85s/it]

{'loss': 1.8697, 'grad_norm': 0.889186680316925, 'learning_rate': 4.3800667620410114e-05, 'epoch': 0.37}


 13%|█▎        | 1400/10485 [47:28<4:31:58,  1.80s/it]

{'loss': 1.7539, 'grad_norm': 0.7987344861030579, 'learning_rate': 4.33237958989032e-05, 'epoch': 0.4}


 14%|█▍        | 1500/10485 [50:40<4:49:51,  1.94s/it]

{'loss': 1.7864, 'grad_norm': 0.8306804895401001, 'learning_rate': 4.284692417739628e-05, 'epoch': 0.43}


 15%|█▌        | 1600/10485 [53:44<4:28:11,  1.81s/it]

{'loss': 1.6972, 'grad_norm': 0.7653444409370422, 'learning_rate': 4.237005245588937e-05, 'epoch': 0.46}


 16%|█▌        | 1700/10485 [56:45<4:29:15,  1.84s/it]

{'loss': 1.6908, 'grad_norm': 0.8932445645332336, 'learning_rate': 4.189318073438245e-05, 'epoch': 0.49}


 17%|█▋        | 1800/10485 [59:57<4:39:40,  1.93s/it]

{'loss': 1.6665, 'grad_norm': 0.9426655769348145, 'learning_rate': 4.1416309012875534e-05, 'epoch': 0.52}


 18%|█▊        | 1900/10485 [1:03:09<4:35:12,  1.92s/it]

{'loss': 1.66, 'grad_norm': 0.8182628750801086, 'learning_rate': 4.0939437291368625e-05, 'epoch': 0.54}


 19%|█▉        | 2000/10485 [1:06:22<4:37:18,  1.96s/it]

{'loss': 1.6528, 'grad_norm': 0.7854791283607483, 'learning_rate': 4.046256556986171e-05, 'epoch': 0.57}


 20%|██        | 2100/10485 [1:09:35<4:28:59,  1.92s/it]

{'loss': 1.6229, 'grad_norm': 0.9765541553497314, 'learning_rate': 3.998569384835479e-05, 'epoch': 0.6}


 21%|██        | 2200/10485 [1:12:46<4:01:17,  1.75s/it]

{'loss': 1.5948, 'grad_norm': 0.8807556629180908, 'learning_rate': 3.950882212684788e-05, 'epoch': 0.63}


 22%|██▏       | 2300/10485 [1:15:45<4:13:32,  1.86s/it]

{'loss': 1.5437, 'grad_norm': 0.9429752230644226, 'learning_rate': 3.903195040534097e-05, 'epoch': 0.66}


 23%|██▎       | 2400/10485 [1:18:47<4:06:02,  1.83s/it]

{'loss': 1.5176, 'grad_norm': 0.9150956273078918, 'learning_rate': 3.855507868383405e-05, 'epoch': 0.69}


 24%|██▍       | 2500/10485 [1:21:50<4:03:22,  1.83s/it]

{'loss': 1.532, 'grad_norm': 0.8026453256607056, 'learning_rate': 3.8078206962327136e-05, 'epoch': 0.72}


 25%|██▍       | 2600/10485 [1:24:53<4:06:03,  1.87s/it]

{'loss': 1.4984, 'grad_norm': 0.7272871136665344, 'learning_rate': 3.7601335240820226e-05, 'epoch': 0.74}


 26%|██▌       | 2700/10485 [1:28:00<4:02:52,  1.87s/it]

{'loss': 1.5027, 'grad_norm': 0.8717427253723145, 'learning_rate': 3.712446351931331e-05, 'epoch': 0.77}


 27%|██▋       | 2800/10485 [1:31:06<3:38:27,  1.71s/it]

{'loss': 1.493, 'grad_norm': 0.9254312515258789, 'learning_rate': 3.6647591797806394e-05, 'epoch': 0.8}


 28%|██▊       | 2900/10485 [1:34:13<3:56:50,  1.87s/it]

{'loss': 1.4484, 'grad_norm': 0.8104346990585327, 'learning_rate': 3.617072007629947e-05, 'epoch': 0.83}


 29%|██▊       | 3000/10485 [1:37:17<3:47:39,  1.82s/it]

{'loss': 1.4717, 'grad_norm': 0.8786535859107971, 'learning_rate': 3.569384835479256e-05, 'epoch': 0.86}


 30%|██▉       | 3100/10485 [1:40:19<3:43:58,  1.82s/it]

{'loss': 1.4213, 'grad_norm': 0.9241299033164978, 'learning_rate': 3.5216976633285646e-05, 'epoch': 0.89}


 31%|███       | 3200/10485 [1:43:25<3:46:51,  1.87s/it]

{'loss': 1.4814, 'grad_norm': 1.0273315906524658, 'learning_rate': 3.474010491177873e-05, 'epoch': 0.92}


 31%|███▏      | 3300/10485 [1:46:33<3:45:28,  1.88s/it]

{'loss': 1.4167, 'grad_norm': 0.8897446990013123, 'learning_rate': 3.426323319027182e-05, 'epoch': 0.94}


 32%|███▏      | 3400/10485 [1:49:42<3:42:31,  1.88s/it]

{'loss': 1.4294, 'grad_norm': 0.934633731842041, 'learning_rate': 3.3786361468764905e-05, 'epoch': 0.97}


                                                        


{'eval_loss': 1.3367047309875488, 'eval_runtime': 170.7677, 'eval_samples_per_second': 36.383, 'eval_steps_per_second': 2.278, 'epoch': 1.0}


 33%|███▎      | 3500/10485 [1:55:32<27:58:21, 14.42s/it] 

{'loss': 1.4109, 'grad_norm': 0.8515102863311768, 'learning_rate': 3.330948974725799e-05, 'epoch': 1.0}


 34%|███▍      | 3600/10485 [1:58:39<3:23:04,  1.77s/it] 

{'loss': 1.3737, 'grad_norm': 1.0019389390945435, 'learning_rate': 3.283261802575107e-05, 'epoch': 1.03}


 35%|███▌      | 3700/10485 [2:01:50<3:39:17,  1.94s/it]

{'loss': 1.3692, 'grad_norm': 0.9518450498580933, 'learning_rate': 3.2355746304244164e-05, 'epoch': 1.06}


 36%|███▌      | 3800/10485 [2:05:03<3:36:55,  1.95s/it]

{'loss': 1.3468, 'grad_norm': 1.0818392038345337, 'learning_rate': 3.187887458273725e-05, 'epoch': 1.09}


 37%|███▋      | 3900/10485 [2:08:16<3:26:17,  1.88s/it]

{'loss': 1.3488, 'grad_norm': 0.8986314535140991, 'learning_rate': 3.140200286123033e-05, 'epoch': 1.12}


 38%|███▊      | 4000/10485 [2:11:21<3:18:11,  1.83s/it]

{'loss': 1.3325, 'grad_norm': 0.8874548077583313, 'learning_rate': 3.0925131139723415e-05, 'epoch': 1.14}


 39%|███▉      | 4100/10485 [2:14:25<3:11:03,  1.80s/it]

{'loss': 1.324, 'grad_norm': 0.9662432074546814, 'learning_rate': 3.0448259418216503e-05, 'epoch': 1.17}


 40%|████      | 4200/10485 [2:17:22<3:15:48,  1.87s/it]

{'loss': 1.3186, 'grad_norm': 1.0675803422927856, 'learning_rate': 2.9971387696709587e-05, 'epoch': 1.2}


 41%|████      | 4300/10485 [2:20:28<3:09:04,  1.83s/it]

{'loss': 1.3183, 'grad_norm': 0.9741929173469543, 'learning_rate': 2.9494515975202674e-05, 'epoch': 1.23}


 42%|████▏     | 4400/10485 [2:23:31<3:07:05,  1.84s/it]

{'loss': 1.283, 'grad_norm': 0.8321027159690857, 'learning_rate': 2.9017644253695758e-05, 'epoch': 1.26}


 43%|████▎     | 4500/10485 [2:26:37<3:12:29,  1.93s/it]

{'loss': 1.2938, 'grad_norm': 1.0482348203659058, 'learning_rate': 2.8540772532188842e-05, 'epoch': 1.29}


 44%|████▍     | 4600/10485 [2:29:43<2:58:20,  1.82s/it]

{'loss': 1.2951, 'grad_norm': 1.1143715381622314, 'learning_rate': 2.8063900810681926e-05, 'epoch': 1.32}


 45%|████▍     | 4700/10485 [2:32:45<2:55:24,  1.82s/it]

{'loss': 1.2522, 'grad_norm': 1.0040342807769775, 'learning_rate': 2.7587029089175013e-05, 'epoch': 1.34}


 46%|████▌     | 4800/10485 [2:35:51<3:01:35,  1.92s/it]

{'loss': 1.266, 'grad_norm': 0.9950077533721924, 'learning_rate': 2.7110157367668097e-05, 'epoch': 1.37}


 47%|████▋     | 4900/10485 [2:39:06<2:52:22,  1.85s/it]

{'loss': 1.2771, 'grad_norm': 1.0714131593704224, 'learning_rate': 2.663328564616118e-05, 'epoch': 1.4}


 48%|████▊     | 5000/10485 [2:42:17<2:55:36,  1.92s/it]

{'loss': 1.2503, 'grad_norm': 0.9988529682159424, 'learning_rate': 2.6156413924654272e-05, 'epoch': 1.43}


 49%|████▊     | 5100/10485 [2:45:29<2:52:10,  1.92s/it]

{'loss': 1.244, 'grad_norm': 1.1736501455307007, 'learning_rate': 2.5679542203147356e-05, 'epoch': 1.46}


 50%|████▉     | 5200/10485 [2:48:23<2:36:44,  1.78s/it]

{'loss': 1.272, 'grad_norm': 1.1071336269378662, 'learning_rate': 2.520267048164044e-05, 'epoch': 1.49}


 51%|█████     | 5300/10485 [2:51:23<2:35:10,  1.80s/it]

{'loss': 1.2405, 'grad_norm': 1.0398554801940918, 'learning_rate': 2.4725798760133524e-05, 'epoch': 1.52}


 52%|█████▏    | 5400/10485 [2:54:18<2:27:51,  1.74s/it]

{'loss': 1.2181, 'grad_norm': 1.097158670425415, 'learning_rate': 2.4248927038626608e-05, 'epoch': 1.55}


 52%|█████▏    | 5500/10485 [2:57:17<2:28:35,  1.79s/it]

{'loss': 1.2454, 'grad_norm': 0.9179044365882874, 'learning_rate': 2.3772055317119695e-05, 'epoch': 1.57}


 53%|█████▎    | 5600/10485 [3:00:10<2:19:03,  1.71s/it]

{'loss': 1.1903, 'grad_norm': 0.9687941670417786, 'learning_rate': 2.3295183595612783e-05, 'epoch': 1.6}


 54%|█████▍    | 5700/10485 [3:03:02<2:17:45,  1.73s/it]

{'loss': 1.2328, 'grad_norm': 1.1316378116607666, 'learning_rate': 2.2818311874105867e-05, 'epoch': 1.63}


 55%|█████▌    | 5800/10485 [3:05:56<2:12:29,  1.70s/it]

{'loss': 1.1988, 'grad_norm': 1.1029574871063232, 'learning_rate': 2.2341440152598954e-05, 'epoch': 1.66}


 56%|█████▋    | 5900/10485 [3:08:49<2:14:29,  1.76s/it]

{'loss': 1.2245, 'grad_norm': 0.9594400525093079, 'learning_rate': 2.1864568431092035e-05, 'epoch': 1.69}


 57%|█████▋    | 6000/10485 [3:11:52<2:14:07,  1.79s/it]

{'loss': 1.2186, 'grad_norm': 0.8487704396247864, 'learning_rate': 2.1387696709585122e-05, 'epoch': 1.72}


 58%|█████▊    | 6100/10485 [3:14:51<2:09:13,  1.77s/it]

{'loss': 1.2073, 'grad_norm': 1.195090651512146, 'learning_rate': 2.091082498807821e-05, 'epoch': 1.75}


 59%|█████▉    | 6200/10485 [3:18:41<2:44:35,  2.30s/it]

{'loss': 1.2201, 'grad_norm': 1.038196325302124, 'learning_rate': 2.0433953266571293e-05, 'epoch': 1.77}


 60%|██████    | 6300/10485 [3:24:33<6:47:27,  5.84s/it]

{'loss': 1.2099, 'grad_norm': 1.0752381086349487, 'learning_rate': 1.995708154506438e-05, 'epoch': 1.8}


 61%|██████    | 6400/10485 [3:32:01<5:17:57,  4.67s/it]

{'loss': 1.2126, 'grad_norm': 1.008446455001831, 'learning_rate': 1.9480209823557465e-05, 'epoch': 1.83}


 62%|██████▏   | 6500/10485 [3:39:32<4:32:11,  4.10s/it]

{'loss': 1.1781, 'grad_norm': 1.0756773948669434, 'learning_rate': 1.900333810205055e-05, 'epoch': 1.86}


 63%|██████▎   | 6600/10485 [3:48:24<7:49:45,  7.26s/it]

{'loss': 1.179, 'grad_norm': 0.9816827178001404, 'learning_rate': 1.8526466380543633e-05, 'epoch': 1.89}


 63%|██████▎   | 6641/10485 [3:52:25<6:16:10,  5.87s/it]

KeyboardInterrupt: 

In [None]:
# Saving custom model
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\custom_data_saver-2"
custom_model.save_custom(model_save_path)

# Loading custom model
custom_model_2 = CustomMarianMTModel.from_custom(model_save_path, config=custom_model.config, biomedical_encoder=custom_model.biomedical_encoder, hidden_size=512, special_token_size=special_token_size)


In [5]:
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\before-training"

custom_model, tokenizer = CustomMarianMTModel.from_custom(model_save_path)

tokenized_dataset = load_dataset("parquet", data_files="C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\tokenized\\zh-en-tokenized-train-working-model.parquet")["train"]

# Before training
# Move all components to GPU before training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)
# tokenizer = tokenizer.to(device)

# Add explicit error checking
torch.cuda.empty_cache()  # Clear GPU memory before training

# Fine-tune the model
fine_tune_custom_model(
    custom_model, 
    tokenizer, 
    tokenized_dataset, 
    output_dir
)

  biomedicalEncoder = BiomedicalEncoder(custom_config["hidden_size"], custom_config["special_token_size"]).load_state_dict(torch.load(biomedical_encoder_path))
  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maalin[0m ([33maalin-uc-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/10485 [00:00<?, ?it/s]

Error in forward method: '_IncompatibleKeys' object is not callable


TypeError: '_IncompatibleKeys' object is not callable

In [19]:
dataset = load_dataset("parquet", data_files="C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\tokenized\\zh-en-tokenized-train-working-model.parquet")["train"]
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'entity_ids'],
    num_rows: 62127
})

In [3]:
# Saving custom model
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\before-training"
custom_model.save_custom(model_save_path)

# Loading custom model
custom_model_base, tokenizer_base = CustomMarianMTModel.from_custom(model_save_path)


C:\Users\Gaming\Documents\GitHub\MIE2\2024-fall-assignment-linaaron88\project\NER-model\before-training\model


  biomedicalEncoder = BiomedicalEncoder(custom_config["hidden_size"], custom_config["special_token_size"]).load_state_dict(torch.load(biomedical_encoder_path))


In [None]:
# Saving custom model
model_save_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\latest-trained"
custom_model.save_custom(model_save_path)

# Loading custom model
custom_model_2, tokenizer_2 = CustomMarianMTModel.from_custom(model_save_path, config=custom_model.config, biomedical_encoder=custom_model.biomedical_encoder, hidden_size=512, special_token_size=special_token_size)

In [2]:
tokenized_dataset = load_dataset("parquet",data_files="nejm/tokenized/zh-en-tokenized-train-working-model.parquet")["train"]
# Before training
# Move all components to GPU before training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)
# tokenizer = tokenizer.to(device)

# Add explicit error checking
torch.cuda.empty_cache()  # Clear GPU memory before training

# Fine-tune the model
fine_tune_custom_model(
    custom_model, 
    tokenizer, 
    tokenized_dataset, 
    output_dir
)

NameError: name 'custom_model' is not defined

In [None]:
tokenized_dataset.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\tokenized\\zh-en-tokenized-train-working-model.parquet")

In [None]:
# Save the tokenizer with special tokens
tokenizer.save_pretrained("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\pre-training-tokenizer")

# Save the model (with added modifications like entity embeddings)
custom_model.save_pretrained("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\NER-model\\pre-training-model")


In [None]:
tokenized_input = tokenized_dataset['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(tokenized_input)

In [None]:
tokens[5:10]

In [None]:
import optuna
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd

# fine_tuned_model_path = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\fine_tuned_marian_model-zh-en/checkpoint-10485"
# fine_tuned_model_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\custom_fine_tuned_marianmt\\first_run\\checkpoint-10485"

# tokenizer = MarianTokenizer.from_pretrained(fine_tuned_model_path)
# model = MarianMTModel.from_pretrained(fine_tuned_model_path)

tokenizer = None
model = None

# Hyperparameter tuning objective function
def objective(trial):
    global model, tokenizer
    # Suggest hyperparameters to tune
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)
    batch_size = trial.suggest_int('batch_size', 8, 32, step=8)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)

    # Load preprocessed entities
    named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
    named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
    special_tokens = [f"<{entity}>" for entity in named_entities]

    # Load tokenizer and custom model
    hidden_size = 512
    
    # Initialize model and tokenizer
    model_name = "Helsinki-NLP/opus-mt-zh-en"  # Change as needed
    tokenizer, model = load_marian_with_biomedical_layer(model_name, hidden_size, special_tokens)

    # Prepare dataset
    tokenized_dataset = prepare_dataset("your_dataset.parquet", tokenizer)

    # Split dataset
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    output_dir = f"C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\lr_{learning_rate}_batch-size_{batch_size}_epochs_{num_train_epochs}"


    # Training arguments with suggested hyperparameters
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        save_strategy="epoch",
        save_safetensors = False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
    )

    # Trainer setup
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation set (using BLEU as an example)
    eval_results = trainer.evaluate()
    bleu_score = eval_results["eval_bleu"]

    # Return the negative BLEU score because Optuna minimizes the objective
    return -bleu_score

# Run Optuna optimization
study = optuna.create_study(direction='minimize')  # We want to minimize the negative BLEU score
study.optimize(objective, n_trials=10)

# Print best trial
print(f"Best trial: {study.best_trial.params}")


In [None]:
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")

# Evaluate Performance

In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer

def load_trained_model_and_tokenizer(model_dir, hidden_size, special_tokens):
    """
    Load the trained model and tokenizer.

    :param model_dir: Directory where the model checkpoint is stored.
    :param hidden_size: The hidden size used for the biomedical encoder.
    :param special_tokens: A list of special tokens used during training.
    :return: The tokenizer and custom trained model.
    """
    # Load the tokenizer and add special tokens
    tokenizer = MarianTokenizer.from_pretrained(model_dir)
    # tokenizer.add_special_tokens({
    #     'additional_special_tokens': list(set(special_tokens))
    # })

    # Load the model architecture and the trained weights
    model = MarianMTModel.from_pretrained(model_dir)
    
    # Create biomedical encoder (same as during training)
    special_token_size = len(special_tokens)
    biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
    
    # Create custom model
    custom_model = CustomMarianMTModel(
        model.config, 
        hidden_size, 
        special_token_size, 
        biomedical_encoder
    )

    # Resize token embeddings based on the tokenizer size
    custom_model.resize_token_embeddings(len(tokenizer))

    # Load the trained weights into the model
    checkpoint_path = os.path.join(model_dir, "pytorch_model.bin")
    checkpoint = torch.load(checkpoint_path)

    # Load state dict with strict=False in case there are any mismatches
    custom_model.load_state_dict(checkpoint, strict=False)
    
    # Return the tokenizer and the trained model
    return tokenizer, custom_model

# Example usage
model_directory = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2\\checkpoint-10485"
hidden_size = 512  # Use the hidden size from training
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]

tokenizer, custom_model = load_trained_model_and_tokenizer(model_directory, hidden_size, special_tokens)

# Optionally, move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model.to(device)


In [None]:
hidden_size = 512
special_token_size = len(special_tokens)
biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)
model = MarianMTModel.from_pretrained("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2\\checkpoint-10485")
model = CustomMarianMTModel(config=model.config, hidden_size=512, special_token_size=special_token_size, biomedical_encoder=biomedical_encoder).from_pretrained(model_directory,config=model.config, hidden_size=512, special_token_size=special_token_size, biomedical_encoder=biomedical_encoder)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MarianConfig, MarianMTModel
import pandas as pd

# Recreate the model architecture
hidden_size = 512  # Adjust this to match the original hidden size
named_entities_df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")
named_entities = ["".join(x) for x in named_entities_df["tokens"].tolist()]
special_tokens = [f"<<{entity}>>" for entity in named_entities]
special_token_size = len(special_tokens)  # Ensure you have the special tokens list
biomedical_encoder = BiomedicalEncoder(hidden_size, special_token_size)

base_model_name = "Helsinki-NLP/opus-mt-zh-en"
config = MarianConfig.from_pretrained(base_model_name)

custom_model = CustomMarianMTModel(
    config,  # Ensure you have access to the original config
    hidden_size, 
    special_token_size, 
    biomedical_encoder
)
custom_model.resize_token_embeddings(118379)

# Load the state dict
model_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\custom_fine_tuned_marianmt\\embeddings-2\\checkpoint-10485"
checkpoint = torch.load(model_path + "\\pytorch_model.bin")
custom_model.load_state_dict(checkpoint)

# Move to GPU if needed
custom_model = custom_model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from evaluate import load
import torch

def add_special_tokens(tokenizer, entities):
    """
    Adds new entity tokens to the tokenizer if they are not already present.
    
    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to update.
        entities (list of str): List of entity names to add as special tokens.
        
    Returns:
        None
    """
    special_tokens = [f"<<{entity}>>" for entity in entities]
    added_tokens = [token for token in special_tokens if token not in tokenizer.get_vocab()]
    if added_tokens:
        tokenizer.add_special_tokens({'additional_special_tokens': added_tokens})
        print(f"Added new special tokens: {added_tokens}")


def translate_tokenized_dataset(model, tokenizer, tokenized_dataset, batch_size=32):
    translations = []
    
    model.eval()

    for i in range(0, len(tokenized_dataset), batch_size):
        # Extract batch data
        input_ids = tokenized_dataset["input_ids"][i:i + batch_size]
        attention_mask = tokenized_dataset["attention_mask"][i:i + batch_size]
        entity_ids = tokenized_dataset["entity_ids"][i:i + batch_size]

        # Convert to tensors with explicit type and device handling
        input_ids = torch.tensor(input_ids, dtype=torch.long).to(model.device)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(model.device)
        entity_ids = torch.tensor(entity_ids, dtype=torch.long).to(model.device)

        # Debug print statements
        print(f"Batch {i//batch_size + 1}:")
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention Mask shape: {attention_mask.shape}")
        print(f"Entity IDs shape: {entity_ids.shape}")
        print(f"Entity IDs min: {entity_ids.min()}, max: {entity_ids.max()}")
        print(f"Model entity embedding size: {model.entity_embedding.num_embeddings}")

        # Validate entity_ids before generation
        try:
            # Check if all entity IDs are within the valid range
            assert torch.all(entity_ids >= 0), "Negative entity IDs found"
            assert torch.all(entity_ids < model.entity_embedding.num_embeddings), "Out-of-bound entity IDs"
        except AssertionError as e:
            print(f"Entity ID validation error: {e}")
            # Skip this batch or handle the error as needed
            continue

        # Generate translations
        try:
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    entity_ids=entity_ids
                )
        except Exception as e:
            print(f"Generation error in batch {i//batch_size + 1}: {e}")
            continue

        # Decode translations
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)

    return translations


# Define evaluation metrics
def evaluate_model(predictions, references):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    # The expected format for BLEU, ROUGE, and BERTScore is a list of lists of strings
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)
    
    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)
    
    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")
    
    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)
    
    return {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_result,
        "TER": ter_result,
    }


In [None]:
def preprocess_test_data(test_dataset, tokenizer, src_lang="chinese", max_entities=5):
    """
    Preprocess test data to tokenize inputs and add entity_ids for entity-based embeddings.
    """
    def preprocess_function(examples):
        src_sentences = examples[src_lang]
        entities_list = examples.get("entities", [[] for _ in src_sentences])
        
        processed_src = []
        processed_entities = []
        
        for sentence, entities in zip(src_sentences, entities_list):
            # Process source sentence (add markers for entities in vocabulary)
            for entity in entities:
                if f"<<{entity}>>" in tokenizer.get_vocab():
                    sentence = sentence.replace(entity, f"<<{entity}>>")
            processed_src.append(sentence)
            
            # Convert entities to token IDs (if in vocab)
            entity_ids = [
                tokenizer.convert_tokens_to_ids(f"<<{entity}>>") 
                if f"<<{entity}>>" in tokenizer.get_vocab() else 0
                for entity in entities
            ]
            
            # Pad or truncate entity_ids
            entity_ids = entity_ids[:max_entities]
            entity_ids += [0] * (max_entities - len(entity_ids))  # Pad with zeros
            processed_entities.append(entity_ids)
        
        # Tokenize the processed source sentences
        model_inputs = tokenizer(
            processed_src,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        
        # Add entity_ids as a tensor to the inputs
        model_inputs["entity_ids"] = torch.tensor(processed_entities, dtype=torch.long)
        
        return model_inputs
    
    # Apply the preprocessing function to the test dataset
    processed_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
    )
    
    return processed_test_dataset


def evaluate_model(model, tokenized_test_dataset, tokenizer, batch_size=16):
    """
    Evaluate the model on the tokenized test dataset.
    """
    # Prepare DataLoader for test data
    test_loader = torch.utils.data.DataLoader(
        tokenized_test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: tokenizer.pad(batch, return_tensors="pt")
    )
    
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            # Move inputs to GPU if available
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}
            
            # Generate predictions
            outputs = model.generate(**inputs)
            predictions.extend(outputs)
    
    return predictions


In [None]:
# Load your test dataset
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_test_entities.parquet"})["test"]

# Preprocess and tokenize test data
tokenized_test_dataset = preprocess_test_data(test_dataset, tokenizer)

# Evaluate model on test data
predictions = evaluate_model(custom_model, tokenized_test_dataset, tokenizer)
predictions[0]

Map: 100%|██████████| 2102/2102 [00:00<00:00, 7016.02 examples/s]


RuntimeError: CUDA error: unknown error
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Decode predictions to text
decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
print("Predictions:", decoded_predictions[0:5])

In [None]:
evaluate_model(predictions, test_dataset["test"]["english"])

In [32]:
tokenized_test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2102
})

In [34]:
# Decode predictions safely
decoded_predictions = [
    tokenizer.decode(pred.cpu().numpy(), skip_special_tokens=True)
    for pred in predictions
]
# print("Predictions:", decoded_predictions)


RuntimeError: CUDA error: unknown error
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
outputs = pd.DataFrame(data={"english": test_dataset["test"]["english"], "predicted_english": predictions})
outputs.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions\\zh-en-test-working-model.parquet")

In [None]:
# Sample sentences for inference
test_sentences = [
    "ATP is a molecule important for energy transfer.",
    "The enzyme DNA polymerase synthesizes DNA molecules."
]

# Tokenize the test data
tokenized_test_data = tokenize_data(tokenizer, test_sentences, named_entities)

# Run inference
with torch.no_grad():
    outputs = model.generate(special_token_dataset)

# Decode predictions
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded_outputs)


In [None]:
tokenized_dataset

In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, load_dataset
import pandas as pd
import torch
# fine_tuned_model_path = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\fine_tuned_marian_model-zh-en/checkpoint-10485"
# fine_tuned_model_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\custom_fine_tuned_marianmt\\first_run\\checkpoint-10485"
# custom_model_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\custom_fine_tuned_marianmt\\fixed_entities\\checkpoint-10485"

# tokenizer = MarianTokenizer.from_pretrained(custom_model_path)
# custom_model = MarianMTModel.from_pretrained(custom_model_path)

test_entities = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-test-entities.parquet")
test_named_entities = ["".join(x) for x in test_entities["tokens"].tolist()]
special_token_size = len(test_named_entities)

add_special_tokens(tokenizer=tokenizer_2, entities=test_named_entities)

# Load dataset
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_test_entities.parquet"})["test"]


In [None]:
# Translate Chinese sentences to English
# tokenizer, custom_model
special_token_dataset = prepare_dataset(test_dataset, tokenizer_2, src_lang="chinese", tgt_lang="english", max_entities=5)

In [None]:
predicted_english = translate_tokenized_dataset(model.to("cuda"), tokenizer_2, special_token_dataset)

predictions = pd.DataFrame(data={"english": test_dataset["test"]["english"], "predicted_english": predicted_english})
predictions.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions\\zh-en-test-working-model.parquet")

In [None]:
# Replace these file paths with your actual text files
# parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_test.parquet"
test_entities_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-test-entities.parquet"
test_entities = pd.read_parquet(test_entities_file)

add_special_tokens(tokenizer=None)

# Load dataset
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_train_entities.parquet"})["test"]

# Translate Chinese sentences to English
df["predicted_english"] = translate_sentences(tokenizer, custom_model.to("cuda"), df["chinese"].tolist())
# df.to_parquet("/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\predictions/fine-tuned.parquet")
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions/fine-tuned_zh-en.parquet")

In [None]:
# Call the evaluation function
evaluation_results = evaluate_model(df["predicted_english"], df["english"])

# Print evaluation results
print("Evaluation results:", evaluation_results)

# Fine-tuned models

## zh-en

In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd
import torch
# Load the pretrained MarianNMT model and tokenizer
def load_marian_model_and_tokenizer(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Prepare dataset
def prepare_dataset(parquet_file, tokenizer, src_lang="chinese", tgt_lang="english"):
    # Load dataset from Parquet
    df = pd.read_parquet(parquet_file)

    # Tokenization function
    def preprocess_function(examples):
        model_inputs = tokenizer(
            examples[src_lang],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        labels = tokenizer(
            examples[tgt_lang],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Convert pandas DataFrame to Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df)
    tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)
    return tokenized_dataset

# Train the model
def fine_tune_model(model, tokenizer, tokenized_dataset, output_dir):
    # Split dataset into train and validation
    dataset = tokenized_dataset.train_test_split(test_size=0.1)

    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        predict_with_generate=True,
        push_to_hub=False,
    )

    

    # Trainer setup
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()


In [None]:
# Main execution
if __name__ == "__main__":
    # Define model and file paths
    model_name = "Helsinki-NLP/opus-mt-zh-en"  # Adjust for en-zh if needed
    # parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_train.parquet"
    # output_dir = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\fine_tuned_marian_model-zh-en"
    parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet"
    output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\fine_tuned_marian_model-zh-en"

    # Load the model and tokenizer
    tokenizer, model = load_marian_model_and_tokenizer(model_name)

    # Prepare the dataset
    tokenized_dataset = prepare_dataset(parquet_file, tokenizer)

    # Fine-tune the model
    fine_tune_model(model, tokenizer, tokenized_dataset, output_dir)


In [None]:
pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")

## en-zh

In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# Main execution
if __name__ == "__main__":
    # Define model and file paths
    model_name = "Helsinki-NLP/opus-mt-en-zh"  # Adjust for en-zh if needed
    # parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_train.parquet"
    # output_dir = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\fine_tuned_marian_model-en-zh"
    parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet"
    output_dir = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\fine_tuned_marian_model-en-zh"

    # Load the model and tokenizer
    tokenizer, model = load_marian_model_and_tokenizer(model_name)

    # Prepare the dataset
    tokenized_dataset = prepare_dataset(parquet_file, tokenizer, src_lang="english", tgt_lang="chinese")

    # Fine-tune the model
    fine_tune_model(model, tokenizer, tokenized_dataset, output_dir)


# Evaluate BLEU Score on fine-tuned model

## zh-en

In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd
import torch
# fine_tuned_model_path = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\fine_tuned_marian_model-zh-en/checkpoint-10485"
fine_tuned_model_path = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\fine_tuned_marian_model-zh-en/checkpoint-10485"

tokenizer = MarianTokenizer.from_pretrained(fine_tuned_model_path)
model = MarianMTModel.from_pretrained(fine_tuned_model_path)


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import sacrebleu
import pandas as pd

def load_marian_model(model_name):
    # Load the MarianMT model and tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

def add_special_tokens(tokenizer, entities):
    """
    Adds new entity tokens to the tokenizer if they are not already present.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to update.
        entities (list of str): List of entity names to add as special tokens.

    Returns:
        None
    """
    special_tokens = [f"<<{entity}>>" for entity in entities]
    added_tokens = [token for token in special_tokens if token not in tokenizer.get_vocab()]
    if added_tokens:
        tokenizer.add_special_tokens({'additional_special_tokens': added_tokens})
        print(f"Added new special tokens: {added_tokens}")


def translate_sentences(tokenizer, model, sentences, batch_size=32):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
        outputs = model.generate(**inputs)
        translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
        translations.extend(translated_batch)
    return translations

def calculate_bleu(predictions, references):
    # Use sacrebleu to calculate the BLEU score
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

In [None]:
# Replace these file paths with your actual text files
# parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_test.parquet"
parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet"

df = pd.read_parquet(parquet_file)

# Translate Chinese sentences to English
df["predicted_english"] = translate_sentences(tokenizer, model.to("cuda"), df["chinese"].tolist())
# df.to_parquet("/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\predictions/fine-tuned.parquet")
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions/fine-tuned_zh-en.parquet")

In [None]:
# Calculate BLEU score
bleu_score = calculate_bleu(df["predicted_english"].tolist(), df["english"].tolist())

print(f"BLEU Score: {bleu_score}")

In [None]:
test_dataset = load_dataset("parquet", data_files={"test": "nejm/nejm_train_entities.parquet"})

In [None]:
test_dataset

## en-zh

In [None]:
# Replace these file paths with your actual text files
# parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_test.parquet"
parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet"

df = pd.read_parquet(
    parquet_file)

# Translate Chinese sentences to English
df["predicted_chinese"] = translate_sentences(tokenizer, model.to("cuda"), df["english"].tolist())
# df.to_parquet("/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\predictions/fine-tuned.parquet")
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions/fine-tuned_en-zh.parquet")

In [None]:
# Calculate BLEU score
bleu_score = calculate_bleu(df["predicted_chinese"].tolist(), df["chinese"].tolist())

print(f"BLEU Score: {bleu_score}")

In [None]:
df.head()

# Evaluate BLEU Score on Baseline Models

## zh-en

In [None]:
# Replace these file paths with your actual text files
# parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_test.parquet"
parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet"

df = pd.read_parquet(parquet_file)

# Load MarianNMT pretrained model for zh-en translation
baseline_model_name = "Helsinki-NLP/opus-mt-zh-en"
baseline_tokenizer, baseline_model = load_marian_model(baseline_model_name)

# Translate Chinese sentences to English
df["predicted_english"] = translate_sentences(baseline_tokenizer, baseline_model.to("cuda"), df["chinese"].tolist())
# df.to_parquet("/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\predictions/fine-tuned.parquet")
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions\\baseline_zh-en.parquet")

In [None]:
# Calculate BLEU score
bleu_score = calculate_bleu(df["predicted_english"].tolist(), df["english"].tolist())

print(f"BLEU Score: {bleu_score}")

In [None]:
df.head()

## en-zh

In [None]:
# Replace these file paths with your actual text files
# parquet_file = "/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\nejm_test.parquet"
parquet_file = "C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_test.parquet"

df = pd.read_parquet(parquet_file)

# Load MarianNMT pretrained model for en-zh translation
baseline_model_name = "Helsinki-NLP/opus-mt-en-zh"
baseline_tokenizer, baseline_model = load_marian_model(baseline_model_name)

# Translate Chinese sentences to English
df["predicted_chinese"] = translate_sentences(baseline_tokenizer, baseline_model.to("cuda"), df["english"].tolist())
# df.to_parquet("/content/drive/MyDrive/Colab Notebooks/corpora/nejm\\predictions/fine-tuned.parquet")
df.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\predictions/baseline_en-zh.parquet")

In [None]:
# Calculate BLEU score
bleu_score = calculate_bleu(df["predicted_chinese"].tolist(), df["chinese"].tolist())

print(f"BLEU Score: {bleu_score}")

In [None]:
df.head()

# NER using pretrained models

In [None]:
# https://huggingface.co/lixin12345/chinese-medical-ner
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

class NER:
    """
    实体命名实体识别
    """
    def __init__(self,model_path) -> None:
        """
        Args:
            model_path:模型地址
        """

        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)

    def ner(self,sentence:str) -> list:
        """
        命名实体识别
        Args:
            sentence:要识别的句子
        Return:
            实体列表:[{'type':'LOC','tokens':[...]},...]
        """
        ans = []
        for i in range(0,len(sentence),500):
            ans = ans + self._ner(sentence[i:i+500])
        return ans
    
    def _ner(self,sentence:str) -> list:
        if len(sentence) == 0: return []
        inputs = self.tokenizer(
            sentence, add_special_tokens=True, return_tensors="pt"
        )
        
        if torch.cuda.is_available():
            self.model = self.model.to(torch.device('cuda:0'))
            for key in inputs:
                inputs[key] = inputs[key].to(torch.device('cuda:0'))
            
        with torch.no_grad():
            logits = self.model(**inputs).logits
        predicted_token_class_ids = logits.argmax(-1)
        predicted_tokens_classes = [self.model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        entities = []
        entity = {}
        for idx, token in enumerate(self.tokenizer.tokenize(sentence,add_special_tokens=True)):
            if 'B-' in predicted_tokens_classes[idx] or 'S-' in predicted_tokens_classes[idx]:
                if len(entity) != 0:
                    entities.append(entity)
                entity = {}
                entity['type'] = predicted_tokens_classes[idx].replace('B-','').replace('S-','')
                entity['tokens'] = [token]
            elif 'I-' in predicted_tokens_classes[idx] or 'E-' in predicted_tokens_classes[idx] or 'M-' in predicted_tokens_classes[idx]:
                if len(entity) == 0:
                    entity['type'] = predicted_tokens_classes[idx].replace('I-','').replace('E-','').replace('M-','')
                    entity['tokens'] = []
                entity['tokens'].append(token)
            else:
                if len(entity) != 0:
                    entities.append(entity)
                    entity = {}
        if len(entity) > 0:
            entities.append(entity)
        return entities

ner_model = NER('lixin12345/chinese-medical-ner')
text = """
患者既往慢阻肺多年;冠心病史6年，平素规律服用心可舒、保心丸等控制可;双下肢静脉血栓3年，保守治疗效果可;左侧腹股沟斜疝无张力修补术后2年。否认"高血压、糖尿病"等慢性病病史，否认"肝炎、结核"等传染病病史及其密切接触史，否认其他手术、重大外伤、输血史，否认"食物、药物、其他"等过敏史，预防接种史随社会。
"""
ans = ner_model.ner(text)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
texts = df.chinese.tolist()
# Extract entities
chinese_entities = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = ner_model.ner(text)
    for entity in entities:
        # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
        chinese_entities.append(entity)
df_entities = pd.DataFrame(chinese_entities)

In [None]:
df_entities.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\zh-entities.parquet")

In [None]:
df_entities.head()

In [None]:
df_entities.type.unique()

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")


In [None]:
from transformers import pipeline

# Load the model
model_path = "venkatd/BIOMed_NER"
pipe = pipeline(
    task="token-classification",
    model=model_path,
    tokenizer=model_path,
    aggregation_strategy="simple"
)

# Test the pipeline
text = ("A 48-year-old female presented with vaginal bleeding and abnormal Pap smears. "
        "Upon diagnosis of invasive non-keratinizing SCC of the cervix, she underwent a radical "
        "hysterectomy with salpingo-oophorectomy which demonstrated positive spread to the pelvic "
        "lymph nodes and the parametrium.")
result = pipe(text)
print(result)


In [None]:
from transformers import pipeline
import pandas as pd
import tqdm
import os

# directory = os.path.join()

# Sample biomedical text in Chinese
df = pd.read_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\nejm_train.parquet")
texts = df.english.tolist()
# Extract entities
english_entities = []
for text in tqdm.tqdm(texts):
    # print(f"Text: {text}")
    entities = pipe(text)
    for entity in entities:
        # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
        english_entities.append(entity)
df_entities_en = pd.DataFrame(english_entities)

In [None]:
df_entities_en.head()

In [None]:
df_entities_en.to_parquet("C:\\Users\\Gaming\\Documents\\GitHub\\MIE2\\2024-fall-assignment-linaaron88\\project\\nejm\\en-entities.parquet")

In [None]:
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")

# Knowledge graphs (Unused)

In [None]:
def construct_translation_kg(entity_pairs):
    translation_kg = []
    for source_entity, target_entity in entity_pairs:
        triple = (source_entity, "<t>", target_entity)
        translation_kg.append(triple)
    return translation_kg


In [None]:
def unify_knowledge_graph(Ks, Kt, Kp):
    return {"Ks": Ks, "Kt": Kt, "Kp": Kp}


In [None]:
def match_sentence_to_kg(sentence, kg):
    # Extract local KG for a sentence
    local_kg = []
    for entity in extract_entities(sentence, language="source"):
        for triple in kg:
            if entity in triple:  # Match based on entity presence
                local_kg.append(triple)
    return local_kg

def extract_local_knowledge_graphs(sentence, unified_kg):
    Ks_local = match_sentence_to_kg(sentence, unified_kg["Ks"])
    Kp_local = match_sentence_to_kg(sentence, unified_kg["Kp"])
    
    # Match tail entities in Kp_local to Kt
    tail_entities = [triple[2] for triple in Kp_local]  # Extract tail entities
    Kt_local = [triple for triple in unified_kg["Kt"] if triple[0] in tail_entities]
    
    return {"Ks_local": Ks_local, "Kp_local": Kp_local, "Kt_local": Kt_local}


In [None]:
Ks = [...]  # Triples from source KG (e.g., YAGO)
Kt = [...]  # Triples from target KG (e.g., CN-DBpedia)
Kp = construct_translation_kg(entity_pairs)  # Entity pairs (from IBM model)

unified_kg = unify_knowledge_graph(Ks, Kt, Kp)


In [None]:
sentence = "咖啡因可能导致心悸和减重"
local_kg = extract_local_knowledge_graphs(sentence, unified_kg)

print(local_kg["Ks_local"])  # Local Source KG
print(local_kg["Kp_local"])  # Local Translation KG
print(local_kg["Kt_local"])  # Local Target KG


## Generate translation knowledge graph Kp

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Load the Parquet file with the parallel corpus directly into Pandas
df_pandas = pd.read_parquet('path_to_your_parquet_file')

# Example of your dataframe: columns "source" (Chinese) and "target" (English)
# Let's inspect the first few rows to see the data format
print(df_pandas.head())

# Tokenize the sentences in the source (Chinese) and target (English)
nltk.download('punkt')  # If you don't have the Punkt tokenizer models already
df_pandas['source_tokens'] = df_pandas['source'].apply(lambda x: word_tokenize(x))
df_pandas['target_tokens'] = df_pandas['target'].apply(lambda x: word_tokenize(x))

# Create Translation Knowledge Graph (Kp) from word-level alignments
def create_translation_triples(row):
    source_tokens = row['source_tokens']
    target_tokens = row['target_tokens']

    translation_triples = []
    for s, t in zip(source_tokens, target_tokens):
        # Assuming word alignment: (source_word, "<t>", target_word)
        translation_triples.append((s, "<t>", t))

    return translation_triples

# Apply the function to create translation triples for each sentence pair
df_pandas['translation_triples'] = df_pandas.apply(create_translation_triples, axis=1)

# Flatten the translation triples and convert to a list of (source, relation, target)
all_triples = []
for triples in df_pandas['translation_triples']:
    all_triples.extend(triples)

# Convert the list of triples into a pandas DataFrame for easier storage/analysis
triples_df = pd.DataFrame(all_triples, columns=['source_word', 'relation', 'target_word'])

# Example: show the first few translation triples
print(triples_df.head())

# Now you have the translation knowledge graph Kp
# You can save this knowledge graph as a CSV or other formats for use in your model
triples_df.to_csv('translation_knowledge_graph.csv', index=False)

# Or, you can store it in a more structured format like a dictionary or a custom KG
# Example: saving the knowledge graph as a list of dictionaries
kp_dict = triples_df.to_dict(orient='records')
print(kp_dict[:5])  # Print first 5 entries of the dictionary format

# The Kp can now be passed into the model for further processing


# Knowledge Guided Transformer for NMT

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

class KnowledgeEncoder(nn.Module):
    """
    This custom class encodes knowledge from KGs to be passed into a transformer model alongside
    """
    def __init__(self, hidden_size, num_layers):
        super(KnowledgeEncoder, self).__init__()
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=8), num_layers=num_layers
        )

    def forward(self, knowledge_triples):
        """
        knowledge_triples: The encoded knowledge triples (Ksel) passed as input
        to the knowledge encoder.
        """
        return self.transformer_encoder(knowledge_triples)

class KnowledgeAttention(nn.Module):
    def __init__(self, hidden_size):
        super(KnowledgeAttention, self).__init__()
        self.query_linear = nn.Linear(hidden_size, hidden_size)
        self.key_linear = nn.Linear(hidden_size, hidden_size)
        self.value_linear = nn.Linear(hidden_size, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value):
        """
        query, key, value: hidden states from the source encoder and knowledge encoder.
        """
        q = self.query_linear(query)
        k = self.key_linear(key)
        v = self.value_linear(value)

        attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (key.size(-1) ** 0.5)
        attention_weights = self.softmax(attention_scores)
        return torch.matmul(attention_weights, v)

class KnowledgeGuidedMTModel(nn.Module):
    def __init__(self, pretrained_model_name, hidden_size=512, num_layers=6):
        super(KnowledgeGuidedMTModel, self).__init__()
        self.transformer = MarianMTModel.from_pretrained(pretrained_model_name)
        self.tokenizer = MarianTokenizer.from_pretrained(pretrained_model_name)

        # Knowledge Encoder
        self.knowledge_encoder = KnowledgeEncoder(hidden_size=hidden_size, num_layers=num_layers)

        # Knowledge Attention mechanism
        self.knowledge_attention = KnowledgeAttention(hidden_size=hidden_size)

    def forward(self, input_ids, knowledge_triples, decoder_input_ids=None, attention_mask=None):
        """
        input_ids: Tokenized source sentence.
        knowledge_triples: Knowledge triples (Ksel) in the form of token embeddings.
        decoder_input_ids: Target sentence (used in training with teacher forcing).
        attention_mask: Attention mask for the encoder-decoder.
        """
        # Source sentence encoding using the pretrained MarianMT model
        source_encoder_output = self.transformer.encoder(input_ids=input_ids, attention_mask=attention_mask)
        source_hidden_states = source_encoder_output.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Knowledge encoding
        knowledge_hidden_states = self.knowledge_encoder(knowledge_triples)  # (batch_size, num_triples, hidden_size)

        # Decoder input - Knowledge attention mechanism
        if decoder_input_ids is not None:
            decoder_input = self.transformer.decoder(
                input_ids=decoder_input_ids,
                encoder_hidden_states=source_hidden_states,
                encoder_attention_mask=attention_mask
            )
            decoder_hidden_states = decoder_input[0]
            query = decoder_hidden_states[-1]  # Last token's hidden state as the query

            # Apply knowledge attention layer
            knowledge_attended = self.knowledge_attention(query, knowledge_hidden_states, knowledge_hidden_states)
            final_hidden_state = decoder_hidden_states + knowledge_attended
        else:
            final_hidden_state = source_hidden_states

        # Generate translation using the final hidden state
        output = self.transformer.lm_head(final_hidden_state)
        return output

    def generate(self, input_text, knowledge_triples):
        input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids
        knowledge_triples = torch.tensor(knowledge_triples)  # Assuming this is tokenized
        generated_ids = self.generate_from_input(input_ids, knowledge_triples)
        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    def generate_from_input(self, input_ids, knowledge_triples):
        # Generate output for the given input, integrating the knowledge graph
        decoder_input_ids = torch.tensor([self.tokenizer.bos_token_id]).unsqueeze(0)  # Start token for the decoder
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

        output = self(input_ids, knowledge_triples, decoder_input_ids=decoder_input_ids, attention_mask=attention_mask)
        logits = output.logits
        predicted_ids = torch.argmax(logits, dim=-1)
        return predicted_ids

# Example usage:
pretrained_model_name = 'Helsinki-NLP/opus-mt-en-de'  # Example pretrained MarianMT model
model = KnowledgeGuidedMTModel(pretrained_model_name)

# Example knowledge triples (source and target language tokens)
knowledge_triples = torch.tensor([[1, 2, 3], [4, 5, 6]])  # These should be tokenized triples (example)

# Input sentence
input_sentence = "This is a test sentence."

# Generate translation
translated = model.generate(input_sentence, knowledge_triples)
print(f"Translated: {translated}")


# Unused WMT code

In [None]:
"""ONLY RUN ONCE: LOAD DATASET AND SAVE TO PARQUET"""
# import os
# import pandas as pd
# import glob

# # Function to load and convert text files into a DataFrame
# def convert_to_parquet(data_dir, suffix_en, suffix_zh, output_file):
#     all_files = glob.glob(os.path.join(data_dir, "*.txt"))
#     data = []

#     for file in all_files:
#         if file.endswith(suffix_en):
#             base_name = file[:-len(suffix_en)]
#             zh_file = base_name + suffix_zh
#             if zh_file in all_files:
#                 # Read file contents
#                 with open(file, 'r', encoding='utf-8') as en_f, open(zh_file, 'r', encoding='utf-8') as zh_f:
#                     en_text = en_f.read().strip()
#                     zh_text = zh_f.read().strip()
#                     data.append({"english": en_text, "chinese": zh_text})

#     # Convert to DataFrame
#     df = pd.DataFrame(data)

#     # Save as Parquet file
#     df.to_parquet(output_file, engine="pyarrow", compression="snappy")
#     print(f"Data saved to {output_file}")

# # Parameters
# data_dir = "data/"
# suffix_en = "_en.txt"
# suffix_zh = "_zh-cn.txt"
# output_file = os.path.join(data_dir, "wmt22_dataset.parquet")

# convert_to_parquet(data_dir, suffix_en, suffix_zh, output_file)


In [None]:
import pandas as pd
import tensorflow as tf

# Load Parquet file into Pandas
parquet_file = os.path.join(data_dir, "wmt22_dataset.parquet")
df = pd.read_parquet(parquet_file)

# Convert DataFrame to TensorFlow Dataset
def pandas_to_tf_dataset(df):
    return tf.data.Dataset.from_tensor_slices((df["english"].values, df["chinese"].values))

dataset = pandas_to_tf_dataset(df)

# Preview the dataset
for en_text, zh_text in dataset.take(5):
    print(f"English: {en_text.numpy().decode('utf-8')}")
    print(f"Chinese: {zh_text.numpy().decode('utf-8')}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModelForSeq2SeqLM.from_pretrained("bert-base-multilingual-cased")


In [None]:
# # Split the data into train (80%), val (10%), test (10%)
# splits = dataset["train"].train_test_split(test_size=0.2, seed=42)
# train_val_split = splits["train"].train_test_split(test_size=0.125, seed=42)  # 10% of original for validation

# # Assign splits
# train_data = train_val_split["train"]
# val_data = train_val_split["test"]
# test_data = splits["test"]

# print(f"Training size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")