In [1]:
# pip install torch --index-url https://download.pytorch.org/whl/cu124

In [2]:
# pip install transformers datasets seqeval

In [3]:
# pip install --upgrade jupyter

In [4]:
# pip install --upgrade ipywidgets

In [5]:
# pip install 'accelerate>=0.26.0'

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import numpy as np
from peft import LoraConfig, get_peft_model
from seqeval.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)

In [7]:
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

In [8]:
train_data = read_conll_file("eng.train")
validation_data = read_conll_file("eng.testa")
test_data = read_conll_file("eng.testb")

In [9]:
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)

In [10]:
label_list = sorted(
    list(set([token_data[3] for sentence in train_data for token_data in sentence]))
)
label_map = {label: i for i, label in enumerate(label_list)}

In [11]:
print(label_list)
print(label_map)

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [12]:
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [13]:
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
        "test": test_dataset,
    }
)

In [14]:
# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [15]:
# Set random seeds for reproducibility
def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior
    torch.backends.cudnn.benchmark = False     # Disables auto-tuning for convolutional layers

In [16]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Set padding token to be the same as EOS token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Initialize model with updated tokenizer and move to GPU
model = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    num_labels=len(label_list))

model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Disable causal mask for bidirectional attention
model.config.is_decoder = False

In [18]:
# Verify model device placement
print(f"Model is on device: {next(model.parameters()).device}")

# Optional: Enable CUDA optimizations
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

Model is on device: cuda:0


In [19]:
set_seed(42)

In [20]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)

    # Remove special tokens
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "accuracy": accuracy_score(true_predictions, true_labels),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }

In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens are ignored during training
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [22]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

In [23]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        labels, batch_first=True, padding_value=-100
    )

    return {
        "input_ids": input_ids,
        "labels": labels,
    }

In [24]:
# Check number of unique NER tags
unique_labels = set([label for example in datasets['train']["ner_tags"] for label in example])
print(f"Unique NER tags: {unique_labels}")
print(f"Model's num_labels: {model.config.num_labels}")

Unique NER tags: {0, 1, 2, 3, 4, 5, 6, 7, 8}
Model's num_labels: 9


In [25]:
# Check if any label is outside the valid range
for example in datasets['train']["ner_tags"]:
    for label in example:
        if label < 0 or label >= model.config.num_labels:
            raise ValueError(f"Invalid label found: {label}. Expected range: [0, {model.config.num_labels - 1}]")

In [26]:
# # Set the context window explicitly to 8192 tokens
# ctx_len = 8192
# tokenizer.model_max_length = ctx_len
# model.config.rope_freq_base = (ctx_len / 131_072) * 500_000
# print(model.config.rope_freq_base)

In [27]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=['q_proj', 'v_proj'],  # Only target attention layers
    bias="none",
    task_type="AutoModelForTokenClassification"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,703,936 || all params: 1,237,536,777 || trainable%: 0.1377


In [28]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    lr_scheduler_type="cosine",
    remove_unused_columns=False
)

In [29]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    args=training_args
)

In [30]:
trainer.train()

  0%|          | 0/9370 [00:00<?, ?it/s]

{'loss': 0.3602, 'grad_norm': 1.2666654586791992, 'learning_rate': 0.00019859811225790162, 'epoch': 0.53}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.4218, 'eval_samples_per_second': 188.147, 'eval_steps_per_second': 11.78, 'epoch': 1.0}
{'loss': 0.1508, 'grad_norm': 0.88332200050354, 'learning_rate': 0.00019443175481643533, 'epoch': 1.07}
{'loss': 0.1267, 'grad_norm': 0.6077597141265869, 'learning_rate': 0.00018761774298412903, 'epoch': 1.6}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.8502, 'eval_samples_per_second': 183.87, 'eval_steps_per_second': 11.512, 'epoch': 2.0}
{'loss': 0.1183, 'grad_norm': 0.815678596496582, 'learning_rate': 0.00017834712635422716, 'epoch': 2.13}
{'loss': 0.1023, 'grad_norm': 0.7046940326690674, 'learning_rate': 0.00016687983220303282, 'epoch': 2.67}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.7307, 'eval_samples_per_second': 185.044, 'eval_steps_per_second': 11.585, 'epoch': 3.0}
{'loss': 0.0945, 'grad_norm': 0.2917778789997101, 'learning_rate': 0.00015353737771265787, 'epoch': 3.2}
{'loss': 0.0878, 'grad_norm': 0.8045691251754761, 'learning_rate': 0.0001386938553510936, 'epoch': 3.74}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.7722, 'eval_samples_per_second': 184.634, 'eval_steps_per_second': 11.56, 'epoch': 4.0}
{'loss': 0.0802, 'grad_norm': 0.3310302793979645, 'learning_rate': 0.00012276544415930476, 'epoch': 4.27}
{'loss': 0.0751, 'grad_norm': 0.2692268490791321, 'learning_rate': 0.00010619874102530885, 'epoch': 4.8}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.8673, 'eval_samples_per_second': 183.704, 'eval_steps_per_second': 11.501, 'epoch': 5.0}
{'loss': 0.07, 'grad_norm': 0.542940080165863, 'learning_rate': 8.945823911011648e-05, 'epoch': 5.34}
{'loss': 0.0648, 'grad_norm': 0.36107969284057617, 'learning_rate': 7.301330450235733e-05, 'epoch': 5.87}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 19.4194, 'eval_samples_per_second': 178.482, 'eval_steps_per_second': 11.174, 'epoch': 6.0}
{'loss': 0.0586, 'grad_norm': 0.24991106986999512, 'learning_rate': 5.73250162469559e-05, 'epoch': 6.4}
{'loss': 0.057, 'grad_norm': 0.8931513428688049, 'learning_rate': 4.2833238723907275e-05, 'epoch': 6.94}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 19.3143, 'eval_samples_per_second': 179.453, 'eval_steps_per_second': 11.235, 'epoch': 7.0}
{'loss': 0.0523, 'grad_norm': 0.9182114005088806, 'learning_rate': 2.9944288838627054e-05, 'epoch': 7.47}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.9313, 'eval_samples_per_second': 183.083, 'eval_steps_per_second': 11.462, 'epoch': 8.0}
{'loss': 0.0513, 'grad_norm': 0.5373820066452026, 'learning_rate': 1.9019543808169115e-05, 'epoch': 8.0}
{'loss': 0.0477, 'grad_norm': 0.4995848834514618, 'learning_rate': 1.0365308955408459e-05, 'epoch': 8.54}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.8594, 'eval_samples_per_second': 183.781, 'eval_steps_per_second': 11.506, 'epoch': 9.0}
{'loss': 0.0486, 'grad_norm': 0.6393904685974121, 'learning_rate': 4.224229595491591e-06, 'epoch': 9.07}
{'loss': 0.0459, 'grad_norm': 0.801257848739624, 'learning_rate': 7.684878059769363e-07, 'epoch': 9.61}


  0%|          | 0/217 [00:00<?, ?it/s]

{'eval_runtime': 18.913, 'eval_samples_per_second': 183.26, 'eval_steps_per_second': 11.474, 'epoch': 10.0}
{'train_runtime': 2551.1713, 'train_samples_per_second': 58.746, 'train_steps_per_second': 3.673, 'train_loss': 0.09209455878910158, 'epoch': 10.0}


TrainOutput(global_step=9370, training_loss=0.09209455878910158, metrics={'train_runtime': 2551.1713, 'train_samples_per_second': 58.746, 'train_steps_per_second': 3.673, 'total_flos': 1.2430007246201301e+17, 'train_loss': 0.09209455878910158, 'epoch': 10.0})

In [33]:
set_seed(42)

sentence = "Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California."
tokenized_input = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).to(model.device)
outputs = model(**tokenized_input)
predicted_labels = outputs.logits.argmax(-1)[0]

label_map_inverted = {v: k for k, v in label_map.items()}

named_entities = [
    (tokenizer.decode([token]), label_map_inverted[label.item()])
    for token, label in zip(tokenized_input["input_ids"][0], predicted_labels)
    if label != 0 and label != label_map["O"]
]

print("Example 1: " + sentence)
print("####")
print("Named Entities:")
for entity, label in named_entities:
    print(f"{entity}: {label}")

Example 1: Steve Jobs, the co-founder of Apple Inc., was born in San Francisco, California.
####
Named Entities:
Steve: I-MISC
 Jobs: B-PER
 Apple: B-ORG
 Inc: I-ORG
 Francisco: I-LOC
