In [1]:
# pip install torch --index-url https://download.pytorch.org/whl/cu121

In [2]:
# pip install -r requirements.txt

In [3]:
# Import packages
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from transformers.models.llama.modeling_llama import LlamaModel
from peft import LoraConfig, get_peft_model
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from transformers.trainer_utils import EvalPrediction

In [4]:
def read_conll_file(file_path):
    # Open the file located at 'file_path' in read mode
    with open(file_path, "r") as f:
        # Read the entire file content and remove any leading/trailing whitespace
        content = f.read().strip() 
        # Split the content into sentences, where each sentence is separated by a blank line ("\n\n")
        sentences = content.split("\n\n")   
        # Initialize an empty list to store the parsed data for each sentence
        data = []   
        # Iterate over each sentence in the list of sentences
        for sentence in sentences:
            # Split each sentence into individual tokens (each token is on a new line)
            tokens = sentence.split("\n")    
            # Initialize a list to store the data for each token in the current sentence
            token_data = [] 
            # Iterate over each token in the sentence
            for token in tokens:
                # Split the token into its components (e.g., word, POS tag, etc.)
                token_data.append(token.split())     
            # Append the list of token data for the current sentence to the overall data list
            data.append(token_data) 
    # Return the parsed data as a list of sentences, where each sentence is a list of tokens,
    # and each token is represented as a list of its components (e.g., word, POS tag, etc.)
    return data

In [5]:
# Read data
train_data = read_conll_file("eng.train")
validation_data = read_conll_file("eng.testa")
test_data = read_conll_file("eng.testb")

In [6]:
# Show example
print(train_data[:2])

[[['-DOCSTART-', '-X-', '-X-', 'O']], [['EU', 'NNP', 'B-NP', 'B-ORG'], ['rejects', 'VBZ', 'B-VP', 'O'], ['German', 'JJ', 'B-NP', 'B-MISC'], ['call', 'NN', 'I-NP', 'O'], ['to', 'TO', 'B-VP', 'O'], ['boycott', 'VB', 'I-VP', 'O'], ['British', 'JJ', 'B-NP', 'B-MISC'], ['lamb', 'NN', 'I-NP', 'O'], ['.', '.', 'O', 'O']]]


In [7]:
def convert_to_dataset(data, label_map):
    # Initialize a dictionary to store formatted tokens and NER tags
    formatted_data = {"tokens": [], "ner_tags": []}   
    # Iterate over each sentence in the input data
    for sentence in data:
        # Extract the tokens (first element of each token_data tuple) from the sentence
        tokens = [token_data[0] for token_data in sentence]
        # Convert NER tags (fourth element of each token_data tuple) using label_map
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        # Append the extracted tokens and corresponding NER tags to the formatted_data dictionary
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    
    # Convert the formatted data into a Dataset object and return it
    return Dataset.from_dict(formatted_data)

In [8]:
# Extract unique labels from the training data and sort them alphabetically
label_list = sorted(
    list(set([token_data[3] for sentence in train_data for token_data in sentence]))
)

# Create a mapping from each label to a unique integer index
# This dictionary will map each label (from label_list) to its corresponding index
label_map = {label: i for i, label in enumerate(label_list)}

In [9]:
# Print the list of labels
print(label_list)

# Print the mapping of labels to their corresponding indices or values
print(label_map)

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [10]:
# Create a dictionary `id2label` by reversing the key-value pairs in `label_map`
id2label = {value: key for key, value in label_map.items()}
print(id2label)

{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}


In [11]:
# Convert the training data into a dataset format using a label map for mapping labels
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [12]:
# Create a DatasetDict object to hold multiple datasets
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
        "test": test_dataset,
    }
)

In [13]:
# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Set random seeds for reproducibility
def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior
    torch.backends.cudnn.benchmark = False     # Disables auto-tuning for convolutional layers

In [15]:
set_seed(42)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Create custom LlamaModel with bidirectional attention
# class LlamaBidirectionalModel(LlamaModel):
#     def _update_causal_mask(self, attention_mask):
#         # Create bidirectional attention mask (all ones)
#         bsz, seq_len = attention_mask.shape
#         mask = torch.ones((bsz, 1, seq_len, seq_len), device=attention_mask.device)
#         return mask

# Initialize model with bidirectional attention
model = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for model weights
    device_map="auto",  # Automatically map to available devices (e.g., GPU)
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label_map
)

# Replace the base model with bidirectional version
# model.base_model = LlamaBidirectionalModel(model.config)
model.config.is_decoder = False # already bidirectional, but setting in any case

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Verify model device placement
print(f"Model is on device: {next(model.parameters()).device}")

# Optional: Enable CUDA optimizations
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

Model is on device: cuda:0


In [17]:
def compute_metrics(eval_prediction):
    # Unpack the predictions and labels from the evaluation prediction tuple
    predictions, labels = eval_prediction
    
    # Convert the predicted probabilities (or logits) to predicted class indices
    # by taking the argmax along axis 2 (assuming predictions are in a 3D array)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) from the labels and corresponding predictions.
    # This is typically used to ignore special tokens in sequence labeling tasks like token classification.
    
    # true_predictions will hold the predicted labels for tokens that are not ignored (-100)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    # true_labels will hold the actual labels for tokens that are not ignored (-100)
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Return a dictionary containing various evaluation metrics:
    # 1. Accuracy: Measures overall correctness of predictions.
    # 2. Precision: Measures how many selected items are relevant.
    # 3. Recall: Measures how many relevant items are selected.
    # 4. F1 Score: Harmonic mean of precision and recall.
    # 5. Classification Report: Detailed report showing precision, recall, f1-score per class.
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }

In [18]:
# Create mock data
mock_predictions = np.random.rand(2, 10, len(label_list))  # Batch size 2, sequence length 10
mock_labels = np.random.randint(0, len(label_list), size=(2, 10))

# Test with tuple input
test_metrics = compute_metrics((mock_predictions, mock_labels))
print("Test with tuple:", test_metrics)

# Test with EvalPrediction object
eval_pred = EvalPrediction(predictions=mock_predictions, label_ids=mock_labels)
test_metrics = compute_metrics(eval_pred)
print("Test with EvalPrediction:", test_metrics)

Test with tuple: {'accuracy': 0.1, 'precision': np.float64(0.07142857142857142), 'recall': np.float64(0.08333333333333333), 'f1': np.float64(0.07692307692307691)}
Test with EvalPrediction: {'accuracy': 0.1, 'precision': np.float64(0.07142857142857142), 'recall': np.float64(0.08333333333333333), 'f1': np.float64(0.07692307692307691)}


In [19]:
def tokenize_and_align_labels(examples):
    # Tokenize the input tokens with truncation and padding. 
    # 'is_split_into_words=True' ensures that words are tokenized individually.
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    
    # Initialize an empty list to store the aligned labels for each example.
    labels = []
    
    # Loop through each example in the dataset
    for i, label in enumerate(examples["ner_tags"]):
        # Get the word IDs for the current example after tokenization.
        # These IDs map tokens back to their corresponding word in the original input.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        previous_word_idx = None  # Variable to track the previous word index.
        label_ids = []  # List to store label IDs aligned to tokens.

        # Loop through each word index in the tokenized input.
        for word_idx in word_ids:
            if word_idx is None:
                # If the token does not correspond to a word (e.g., special tokens), assign -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # If the token corresponds to a new word (not a subword), assign its label.
                label_ids.append(label[word_idx])
            else:
                # If the token is part of the same word (subword), assign -100 to ignore it.
                label_ids.append(-100)
            
            # Update previous_word_idx to the current word index for comparison in the next iteration.
            previous_word_idx = word_idx
        
        # Append the aligned labels for this example to the labels list.
        labels.append(label_ids)
    
    # Add the aligned labels as a new key "labels" in the tokenized input dictionary.
    tokenized_inputs["labels"] = labels
    
    # Return the tokenized inputs along with their aligned labels.
    return tokenized_inputs

In [20]:
# # Apply the 'tokenize_and_align_labels' function to each example in the dataset using the map() method.
# # - batched=True: This enables processing multiple examples at once (in batches) instead of one by one.
# #   This speeds up the tokenization process, as tokenization libraries like Hugging Face's Tokenizers
# #   can parallelize operations more efficiently when working with batches.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4730 [00:00<?, ? examples/s]

Map:   0%|          | 0/1406 [00:00<?, ? examples/s]

Map:   0%|          | 0/1630 [00:00<?, ? examples/s]

In [21]:
datasets["test"][0]

{'tokens': ['-DOCSTART-'], 'ner_tags': [8]}

In [22]:
print(tokenized_datasets["test"][0])

{'tokens': ['-DOCSTART-'], 'ner_tags': [8], 'input_ids': [128000, 12, 32564, 23380, 12, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 

In [23]:
def data_collator(data):
    # Extract 'input_ids' from each item in the batch and convert them to tensors
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    
    # Extract 'attention_mask' from each item in the batch and convert them to tensors
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    
    # Extract 'labels' from each item in the batch and convert them to tensors
    labels = [torch.tensor(item["labels"]) for item in data]

    # Pad the 'input_ids' to ensure all sequences in the batch have the same length
    # Use tokenizer's pad token ID as the padding value
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    
    # Pad the 'attention_mask' to match the padded 'input_ids'
    # Use 0 as the padding value since 0 indicates no attention
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    
    # Pad the 'labels' so that all label sequences have the same length
    # Use -100 as the padding value, which is typically ignored in loss calculations (for masked tokens)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    # Return a dictionary containing the padded input_ids, attention_mask, and labels for the batch
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [24]:
# Extract the tokenized test data from the preprocessed dataset
train_dataset_tokenized = tokenized_datasets["train"]

In [25]:
# Get a small batch of tokenized examples from your dataset
batch = [train_dataset_tokenized[i] for i in range(2)]  # Take 2 examples from the train set

# Pass the batch through the data collator
collated_batch = data_collator(batch)

# Inspect the output
print(collated_batch)

{'input_ids': tensor([[128000,     12,  32564,  23380,     12, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001],
        [128000,  39907,  48096,     82,  33179,   6797,    998,    754,   3418,
           1751,  52961,     75,   3042,     13, 128001, 128001, 128001, 128001,
         1280

In [26]:
# Check number of unique NER tags
unique_labels = set([label for example in datasets['train']["ner_tags"] for label in example])
print(f"Unique NER tags: {unique_labels}")
print(f"Model's number of labels: {model.config.num_labels}")

Unique NER tags: {0, 1, 2, 3, 4, 5, 6, 7, 8}
Model's number of labels: 9


In [27]:
# Check if any label is outside the valid range
for example in datasets['train']["ner_tags"]:
    for label in example:
        if label < 0 or label >= model.config.num_labels:
            raise ValueError(f"Invalid label found: {label}. Expected range: [0, {model.config.num_labels - 1}]")

In [28]:
# Show max content length
print(tokenizer.model_max_length)

# # Set the context window explicitly to 8192 tokens
# ctx_len = 8192
# tokenizer.model_max_length = ctx_len
# model.config.rope_freq_base = (ctx_len / 131_072) * 500_000
# print(model.config.rope_freq_base)

131072


In [29]:
# Define LoRA configuration

lora_config = LoraConfig(
    task_type="TOKEN_CLS",      # Task type for token classification
    r=16,                        # Rank of the low-rank matrices
    lora_alpha=32,              # Scaling factor for LoRA updates
    target_modules=["q_proj", "v_proj"],   # Modules to apply LoRA (typically attention layers)
    lora_dropout=0.1            # Dropout rate for LoRA layers
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,722,377 || all params: 1,237,555,218 || trainable%: 0.1392


In [30]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    seed=42,
    metric_for_best_model="f1",
    weight_decay=0.01,
    bf16=True,
    remove_unused_columns=False
)

In [31]:
# Initialize the Trainer class for model training and evaluation
trainer = Trainer(
    model=model,  # The model to be trained
    train_dataset=tokenized_datasets["train"],  # The tokenized training dataset
    eval_dataset=tokenized_datasets["validation"],  # The tokenized validation dataset for evaluation during training
    data_collator=data_collator,  # A function or object that batches and pads the data
    processing_class=tokenizer,  # The tokenizer used for processing the input text
    compute_metrics=compute_metrics,  # A function to compute metrics during evaluation (e.g., accuracy, F1 score)
    args=training_args  # Training arguments like batch size, number of epochs, learning rate, etc.  
)

In [32]:
# Train the model
trainer.train()

  0%|          | 0/296 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.13980688154697418, 'eval_accuracy': 0.9582425906637629, 'eval_precision': 0.6509207365892714, 'eval_recall': 0.7185152452496686, 'eval_f1': 0.683049779458097, 'eval_runtime': 6.4022, 'eval_samples_per_second': 219.611, 'eval_steps_per_second': 13.745, 'epoch': 1.0}
{'train_runtime': 56.3817, 'train_samples_per_second': 83.893, 'train_steps_per_second': 5.25, 'train_loss': 0.28151617823420344, 'epoch': 1.0}


TrainOutput(global_step=296, training_loss=0.28151617823420344, metrics={'train_runtime': 56.3817, 'train_samples_per_second': 83.893, 'train_steps_per_second': 5.25, 'total_flos': 2516167568139120.0, 'train_loss': 0.28151617823420344, 'epoch': 1.0})

In [33]:
# Save the model
trainer.save_model('./bidirectional_llama32')

In [34]:
# Extract datasets
train_dataset_tokenized = tokenized_datasets["train"]
val_dataset_tokenized = tokenized_datasets["validation"]
test_dataset_tokenized = tokenized_datasets["test"]

In [35]:
# Use the trained model to perform predictions on the tokenized test dataset
# 'predictions' will contain the model's output, 'labels' will contain the true labels,
# and 'metrics' will store evaluation metrics (e.g., accuracy, precision, recall, etc.)
predictions_train, labels_train, metrics_train = trainer.predict(train_dataset_tokenized)
predictions_val, labels_val, metrics_val = trainer.predict(val_dataset_tokenized)
predictions_test, labels_test, metrics_test = trainer.predict(test_dataset_tokenized)

  0%|          | 0/296 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

In [39]:
# Display the performance metrics

print("Training Performance: ",  metrics_train)
print("Validation Performance: ",  metrics_val)
display("Test Performance: ",  metrics_test) 

Training Performance:  {'test_loss': 0.14612407982349396, 'test_accuracy': 0.9582167804223078, 'test_precision': 0.694992041141178, 'test_recall': 0.7393513091051191, 'test_f1': 0.7164857359252715, 'test_runtime': 20.8628, 'test_samples_per_second': 226.719, 'test_steps_per_second': 14.188}
Validation Performance:  {'test_loss': 0.13980688154697418, 'test_accuracy': 0.9582425906637629, 'test_precision': 0.6509207365892714, 'test_recall': 0.7185152452496686, 'test_f1': 0.683049779458097, 'test_runtime': 6.2764, 'test_samples_per_second': 224.015, 'test_steps_per_second': 14.021}


'Test Performance: '

{'test_loss': 0.19931994378566742,
 'test_accuracy': 0.9459214227384085,
 'test_precision': 0.6040452519712033,
 'test_recall': 0.6686907020872865,
 'test_f1': 0.6347262247838616,
 'test_runtime': 10.5956,
 'test_samples_per_second': 153.838,
 'test_steps_per_second': 9.627}

In [37]:
set_seed(42)

# Input sentence
sentence = "Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California."

# Tokenize without adding special tokens
tokenized_input = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).to(model.device)

# Get model outputs
outputs = model(**tokenized_input)

# Get predicted labels (argmax over logits)
predicted_labels = outputs.logits.argmax(-1)[0]

# Inverted label map (assuming label_map is defined elsewhere)
label_map_inverted = {v: k for k, v in label_map.items()}

# Initialize variables to store named entities
named_entities = []
current_entity_tokens = []
current_label = None

# Iterate over tokens and predicted labels
for token_id, label_id in zip(tokenized_input["input_ids"][0], predicted_labels):
    token = tokenizer.decode([token_id])
    label = label_map_inverted[label_id.item()]

    # Skip 'O' labels (non-entity tokens)
    if label == "O":
        if current_entity_tokens:
            # Append the current entity and its label to the list
            named_entities.append((" ".join(current_entity_tokens).strip(), current_label))
            current_entity_tokens = []
            current_label = None
        continue

    # Handle subword tokens (tokens starting with '##')
    if token.startswith("##"):
        current_entity_tokens[-1] += token[2:]  # Append subword to the last token
    else:
        # If it's a new entity or different from the current one, append the previous entity first
        if not current_entity_tokens or label.split("-")[0] == "B" or label != current_label:
            if current_entity_tokens:
                named_entities.append((" ".join(current_entity_tokens).strip(), current_label))
            current_entity_tokens = [token]  # Start a new entity
        else:
            current_entity_tokens.append(token)  # Continue appending to the current entity

        current_label = label

# Append any remaining entity at the end
if current_entity_tokens:
    named_entities.append((" ".join(current_entity_tokens).strip(), current_label))

# Print results
print("Example 1:", sentence)
print("####")
print("Named Entities:")
for entity, label in named_entities:
    print(f"{entity}: {label}")

Example 1: Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California.
####
Named Entities:
Steve: B-ORG
Apple: B-ORG
Inc: I-ORG
San: B-LOC
Francisco: B-LOC
California: B-LOC
