In [1]:
# pip install torch --index-url https://download.pytorch.org/whl/cu124

In [2]:
# pip install -r requirements.txt

In [3]:
# Import packages
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from transformers.models.llama.modeling_llama import LlamaModel
from peft import LoraConfig, get_peft_model
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from seqeval.scheme import IOB2

In [4]:
def read_conll_file(file_path):
    # Open the file located at 'file_path' in read mode
    with open(file_path, "r") as f:
        # Read the entire file content and remove any leading/trailing whitespace
        content = f.read().strip() 
        # Split the content into sentences, where each sentence is separated by a blank line ("\n\n")
        sentences = content.split("\n\n")   
        # Initialize an empty list to store the parsed data for each sentence
        data = []   
        # Iterate over each sentence in the list of sentences
        for sentence in sentences:
            # Split each sentence into individual tokens (each token is on a new line)
            tokens = sentence.split("\n")    
            # Initialize a list to store the data for each token in the current sentence
            token_data = [] 
            # Iterate over each token in the sentence
            for token in tokens:
                # Split the token into its components (e.g., word, POS tag, etc.)
                token_data.append(token.split())     
            # Append the list of token data for the current sentence to the overall data list
            data.append(token_data) 
    # Return the parsed data as a list of sentences, where each sentence is a list of tokens,
    # and each token is represented as a list of its components (e.g., word, POS tag, etc.)
    return data

In [5]:
# Read data
train_data = read_conll_file("eng.train")
validation_data = read_conll_file("eng.testa")
test_data = read_conll_file("eng.testb")

In [6]:
# Show example
print(train_data[:2])

[[['-DOCSTART-', '-X-', '-X-', 'O']], [['EU', 'NNP', 'B-NP', 'B-ORG'], ['rejects', 'VBZ', 'B-VP', 'O'], ['German', 'JJ', 'B-NP', 'B-MISC'], ['call', 'NN', 'I-NP', 'O'], ['to', 'TO', 'B-VP', 'O'], ['boycott', 'VB', 'I-VP', 'O'], ['British', 'JJ', 'B-NP', 'B-MISC'], ['lamb', 'NN', 'I-NP', 'O'], ['.', '.', 'O', 'O']]]


In [7]:
def convert_to_dataset(data, label_map):
    # Initialize a dictionary to store formatted tokens and NER tags
    formatted_data = {"tokens": [], "ner_tags": []}   
    # Iterate over each sentence in the input data
    for sentence in data:
        # Extract the tokens (first element of each token_data tuple) from the sentence
        tokens = [token_data[0] for token_data in sentence]
        # Convert NER tags (fourth element of each token_data tuple) using label_map
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        # Append the extracted tokens and corresponding NER tags to the formatted_data dictionary
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    
    # Convert the formatted data into a Dataset object and return it
    return Dataset.from_dict(formatted_data)

In [8]:
# Extract unique labels from the training data and sort them alphabetically
label_list = sorted(
    list(set([token_data[3] for sentence in train_data for token_data in sentence]))
)

# Create a mapping from each label to a unique integer index
# This dictionary will map each label (from label_list) to its corresponding index
label_map = {label: i for i, label in enumerate(label_list)}

In [9]:
# Print the list of labels
print(label_list)

# Print the mapping of labels to their corresponding indices or values
print(label_map)

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [10]:
# Create a dictionary `id2label` by reversing the key-value pairs in `label_map`
id2label = {value: key for key, value in label_map.items()}
print(id2label)

{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}


In [11]:
# Convert the training data into a dataset format using a label map for mapping labels
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [12]:
# Create a DatasetDict object to hold multiple datasets
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
        "test": test_dataset,
    }
)

In [13]:
# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Set random seeds for reproducibility
def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior
    torch.backends.cudnn.benchmark = False     # Disables auto-tuning for convolutional layers

In [15]:
set_seed(42)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Create custom LlamaModel with bidirectional attention
class LlamaBidirectionalModel(LlamaModel):
    def _update_causal_mask(self, attention_mask):
        # Create bidirectional attention mask (all ones)
        bsz, seq_len = attention_mask.shape
        mask = torch.ones((bsz, 1, seq_len, seq_len), dtype=torch.bool, device=attention_mask.device)
        return mask

# Initialize model with bidirectional attention
model = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    num_labels=len(label_list),
    id2label = id2label
)

# Replace the base model with bidirectional version
model.base_model = LlamaBidirectionalModel(model.config)
model.config.is_decoder = False # already bidirectional, but setting in any case

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Verify model device placement
print(f"Model is on device: {next(model.parameters()).device}")

# Optional: Enable CUDA optimizations
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

Model is on device: cuda:0


In [17]:
def calculate_metrics(pred):
    # Get predictions and labels
    predictions = pred.predictions
    labels = pred.label_ids
    
    # Get the predicted class indices (argmax along the last dimension)
    predictions = np.argmax(predictions, axis=-1)
    
    # Initialize lists to store true and predicted labels
    true_labels = []
    pred_labels = []
    
    # Convert numerical predictions and labels back to BIO tags
    for prediction, label in zip(predictions, labels):
        # Filter out padding (-100)
        pred_list = []
        true_list = []
        
        for p, l in zip(prediction, label):
            if l != -100:  # Ignore padding tokens
                pred_list.append(id2label[p])
                true_list.append(id2label[l])
        
        true_labels.append(true_list)
        pred_labels.append(pred_list)
    
    # Calculate metrics using seqeval
    results = {
        'precision': precision_score(true_labels, pred_labels, scheme=IOB2),
        'recall': recall_score(true_labels, pred_labels, scheme=IOB2),
        'f1': f1_score(true_labels, pred_labels, scheme=IOB2),
    }
    
    # Add detailed classification report to results
    report = classification_report(true_labels, pred_labels, scheme=IOB2, output_dict=True)
    
    # Add per-entity metrics
    for entity in report.keys():
        if entity not in ['macro avg', 'micro avg', 'weighted avg']:
            results[f'{entity}_f1'] = report[entity]['f1-score']
    
    return results

In [18]:
def tokenize_and_align_labels(examples):
    # Tokenize the input examples. The 'truncation' parameter ensures that sequences are truncated to fit the model's input size.
    # 'is_split_into_words=True' indicates that the input is already split into words, not sentences.
    # 'padding=True' ensures that all sequences in the batch have the same length.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)
    
    labels = []  # Initialize a list to store the aligned label IDs for each example.
    
    # Iterate over each set of labels in the examples
    for i, label in enumerate(examples["ner_tags"]):
        # Get the word IDs for the current example. This helps map tokens back to their original word indices.
        word_ids = tokenized_inputs.word_ids(batch_index=i)   
        previous_word_idx = None  # Track the previous word index to handle subword tokens.
        label_ids = []  # Initialize a list to store label IDs for the current example.  
        # Iterate over each word index in the tokenized input
        for word_idx in word_ids:
            if word_idx is None:
                # If the word index is None, it corresponds to a special token (e.g., [CLS], [SEP]).
                # These tokens are ignored during training by assigning a label of -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # If the current word index is different from the previous one, it's a new word.
                # Assign the corresponding label to this token.
                label_ids.append(label[word_idx])
            else:
                # If the current word index is the same as the previous one, it's a subword token.
                # Assign -100 to ignore it during training.
                label_ids.append(-100)            
            previous_word_idx = word_idx  # Update the previous word index for the next iteration.        
        labels.append(label_ids)  # Add the processed label IDs for this example to the labels list.    
    
    tokenized_inputs["labels"] = labels  # Add the aligned labels to the tokenized inputs.    
    
    return tokenized_inputs  # Return the tokenized inputs with aligned labels.

In [19]:
# # Apply the 'tokenize_and_align_labels' function to each example in the dataset using the map() method.
# # - batched=True: This enables processing multiple examples at once (in batches) instead of one by one.
# #   This speeds up the tokenization process, as tokenization libraries like Hugging Face's Tokenizers
# #   can parallelize operations more efficiently when working with batches.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

In [20]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [21]:
# Check number of unique NER tags
unique_labels = set([label for example in datasets['train']["ner_tags"] for label in example])
print(f"Unique NER tags: {unique_labels}")
print(f"Model's number of labels: {model.config.num_labels}")

Unique NER tags: {0, 1, 2, 3, 4, 5, 6, 7, 8}
Model's number of labels: 9


In [22]:
print(label_map)

{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [23]:
print(id2label)

{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}


In [24]:
# Check if any label is outside the valid range
for example in datasets['train']["ner_tags"]:
    for label in example:
        if label < 0 or label >= model.config.num_labels:
            raise ValueError(f"Invalid label found: {label}. Expected range: [0, {model.config.num_labels - 1}]")

In [25]:
# # Set the context window explicitly to 8192 tokens
# ctx_len = 8192
# tokenizer.model_max_length = ctx_len
# model.config.rope_freq_base = (ctx_len / 131_072) * 500_000
# print(model.config.rope_freq_base)

In [26]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=['q_proj', 'v_proj'],  # Only target attention layers
    bias="none",
    task_type="AutoModelForTokenClassification"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,703,936 || all params: 2,473,351,177 || trainable%: 0.0689


In [27]:
# Define training arguments for the model
training_args = TrainingArguments(
    output_dir="./results",  # Directory where the model checkpoints and logs will be saved
    eval_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-4,  # Learning rate for the optimizer
    per_device_train_batch_size=16,  # Batch size for training on each device (e.g., GPU)
    per_device_eval_batch_size=16,  # Batch size for evaluation on each device (e.g., GPU)
    num_train_epochs=5,  # Number of epochs to train the model
    lr_scheduler_type="cosine",  # Learning rate scheduler type (cosine annealing in this case)
    remove_unused_columns=False,  # Keep all columns in the dataset, even if they are not used by the model
    seed=42 # For reproducibility
)

In [28]:
# Initialize the Trainer class for model training and evaluation
trainer = Trainer(
    model=model,  # The model to be trained
    train_dataset=tokenized_datasets["train"],  # The tokenized training dataset
    eval_dataset=tokenized_datasets["validation"],  # The tokenized validation dataset for evaluation during training
    data_collator=data_collator,  # A function or object that batches and pads the data
    processing_class=tokenizer,  # The tokenizer used for processing the input text
    compute_metrics=calculate_metrics,  # A function to compute metrics during evaluation (e.g., accuracy, F1 score)
    args=training_args  # Training arguments like batch size, number of epochs, learning rate, etc.
)

In [29]:
# Train the model
trainer.train()

  0%|          | 0/4685 [00:00<?, ?it/s]

{'loss': 0.3612, 'grad_norm': 1.4068773984909058, 'learning_rate': 0.00019443175481643533, 'epoch': 0.53}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 38.2463, 'eval_samples_per_second': 90.623, 'eval_steps_per_second': 5.674, 'epoch': 1.0}
{'loss': 0.1466, 'grad_norm': 1.2720367908477783, 'learning_rate': 0.00017834712635422716, 'epoch': 1.07}
{'loss': 0.1244, 'grad_norm': 0.7884764671325684, 'learning_rate': 0.00015353737771265787, 'epoch': 1.6}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 38.9987, 'eval_samples_per_second': 88.875, 'eval_steps_per_second': 5.564, 'epoch': 2.0}
{'loss': 0.1114, 'grad_norm': 1.187119722366333, 'learning_rate': 0.00012276544415930476, 'epoch': 2.13}
{'loss': 0.097, 'grad_norm': 0.6015186309814453, 'learning_rate': 8.945823911011648e-05, 'epoch': 2.67}




  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 48.8256, 'eval_samples_per_second': 70.987, 'eval_steps_per_second': 4.444, 'epoch': 3.0}
{'loss': 0.088, 'grad_norm': 0.3702273964881897, 'learning_rate': 5.73250162469559e-05, 'epoch': 3.2}
{'loss': 0.0813, 'grad_norm': 0.819275975227356, 'learning_rate': 2.9944288838627054e-05, 'epoch': 3.74}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 41.4312, 'eval_samples_per_second': 83.657, 'eval_steps_per_second': 5.238, 'epoch': 4.0}
{'loss': 0.0761, 'grad_norm': 0.4420791566371918, 'learning_rate': 1.0365308955408459e-05, 'epoch': 4.27}
{'loss': 0.0742, 'grad_norm': 0.4820078909397125, 'learning_rate': 7.684878059769363e-07, 'epoch': 4.8}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 41.0916, 'eval_samples_per_second': 84.348, 'eval_steps_per_second': 5.281, 'epoch': 5.0}
{'train_runtime': 5396.6881, 'train_samples_per_second': 13.885, 'train_steps_per_second': 0.868, 'train_loss': 0.12663855578118224, 'epoch': 5.0}


TrainOutput(global_step=4685, training_loss=0.12663855578118224, metrics={'train_runtime': 5396.6881, 'train_samples_per_second': 13.885, 'train_steps_per_second': 0.868, 'total_flos': 1.2438176472158851e+17, 'train_loss': 0.12663855578118224, 'epoch': 5.0})

In [30]:
trainer.evaluate()

  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 41.894,
 'eval_samples_per_second': 82.733,
 'eval_steps_per_second': 5.18,
 'epoch': 5.0}

In [32]:
# Save the model
trainer.save_model('./bidirectional_llama32')

In [33]:
# Extract the tokenized test data from the preprocessed dataset
test_dataset_tokenized = tokenized_datasets["test"]

In [34]:
# Use the trained model to perform predictions on the tokenized test dataset
# 'predictions' will contain the model's output, 'labels' will contain the true labels,
# and 'metrics' will store evaluation metrics (e.g., accuracy, precision, recall, etc.)
predictions, labels, metrics = trainer.predict(test_dataset_tokenized)

  0%|          | 0/231 [00:00<?, ?it/s]

In [35]:
# Display the evaluation metrics to assess the model's performance on the test set
metrics

{'test_runtime': 70.6465,
 'test_samples_per_second': 52.147,
 'test_steps_per_second': 3.27}

In [36]:
set_seed(42)

# Input sentence
sentence = "Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California."

# Tokenize without adding special tokens
tokenized_input = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).to(model.device)

# Get model outputs
outputs = model(**tokenized_input)

# Get predicted labels (argmax over logits)
predicted_labels = outputs.logits.argmax(-1)[0]

# Inverted label map (assuming label_map is defined elsewhere)
label_map_inverted = {v: k for k, v in label_map.items()}

# Initialize variables to store named entities
named_entities = []
current_entity_tokens = []
current_label = None

# Iterate over tokens and predicted labels
for token_id, label_id in zip(tokenized_input["input_ids"][0], predicted_labels):
    token = tokenizer.decode([token_id])
    label = label_map_inverted[label_id.item()]

    # Skip 'O' labels (non-entity tokens)
    if label == "O":
        if current_entity_tokens:
            # Append the current entity and its label to the list
            named_entities.append((" ".join(current_entity_tokens).strip(), current_label))
            current_entity_tokens = []
            current_label = None
        continue

    # Handle subword tokens (tokens starting with '##')
    if token.startswith("##"):
        current_entity_tokens[-1] += token[2:]  # Append subword to the last token
    else:
        # If it's a new entity or different from the current one, append the previous entity first
        if not current_entity_tokens or label.split("-")[0] == "B" or label != current_label:
            if current_entity_tokens:
                named_entities.append((" ".join(current_entity_tokens).strip(), current_label))
            current_entity_tokens = [token]  # Start a new entity
        else:
            current_entity_tokens.append(token)  # Continue appending to the current entity

        current_label = label

# Append any remaining entity at the end
if current_entity_tokens:
    named_entities.append((" ".join(current_entity_tokens).strip(), current_label))

# Print results
print("Example 1:", sentence)
print("####")
print("Named Entities:")
for entity, label in named_entities:
    print(f"{entity}: {label}")

Example 1: Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California.
####
Named Entities:
Steve: B-ORG
Jobs: I-PER
Apple: B-ORG
Inc: I-ORG
San: B-LOC
Francisco: B-LOC
California: B-LOC
