# Install Dependencies

In [35]:
%pip install torch transformers datasets seqeval scikit-learn pandas numpy protobuf sentencepiece accelerate -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import Libraries

In [36]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Config

In [37]:
DATA_PATH = "train_ner.csv"
MODEL_NAME = "roberta-large"
MAX_LENGTH = 256
RANDOM_SEED = 42
OUTPUT_DIR = "roberta_ner_model"
SAVED_MODEL_DIR = "../../model/NER/roberta_ner_saved"

In [38]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(RANDOM_SEED)

# Load and Prepare Data

In [39]:
df = pd.read_csv(DATA_PATH)
print(f"Total samples: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
df.head()

Total samples: 1094
Columns: ['id', 'tokens', 'ner_tags']


Unnamed: 0,id,tokens,ner_tags
0,neg_0,Story of a man who has unnatural feelings for ...,O O O O O O O O O O O O O O O O O O O O O O O ...
1,neg_1,Robert DeNiro plays the most unbelievably inte...,B-ACTOR I-ACTOR O O O O O O O O O O O O O O O ...
2,neg_2,"I saw the capsule comment said ""great acting.""...",O O O O O O O O O O O O O O O O O O O O O O O ...
3,neg_3,If I had not read Pat Barker's 'Union Street' ...,O O O O O O O B-MOVIE I-MOVIE O O O O O O O O ...
4,neg_4,This fanciful horror flick has Vincent Price p...,O O B-GENRE O O B-ACTOR I-ACTOR O O O O O O O ...


In [40]:
# Extract unique labels from all tags
all_tags = []
for tags in df['ner_tags']:
    all_tags.extend(tags.split())

unique_labels = sorted(list(set(all_tags)))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Number of unique labels: {len(unique_labels)}")
print(f"Labels: {unique_labels}")

Number of unique labels: 11
Labels: ['B-ACTOR', 'B-CHARACTER', 'B-DIRECTOR', 'B-GENRE', 'B-MOVIE', 'I-ACTOR', 'I-CHARACTER', 'I-DIRECTOR', 'I-GENRE', 'I-MOVIE', 'O']


In [41]:
# Convert data to list format
def prepare_data(df):
    data = []
    for _, row in df.iterrows():
        tokens = row['tokens'].split()
        tags = row['ner_tags'].split()
        data.append({
            'id': row['id'],
            'tokens': tokens,
            'ner_tags': [label2id[tag] for tag in tags]
        })
    return data

data = prepare_data(df)
print(f"Prepared {len(data)} samples")
print(f"\nExample:")
print(f"Tokens: {data[0]['tokens'][:10]}")
print(f"Tags: {data[0]['ner_tags'][:10]}")

Prepared 1094 samples

Example:
Tokens: ['Story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a']
Tags: [10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


In [42]:
# Split data into train and validation sets
train_data, val_data = train_test_split(
    data,
    test_size=0.15,
    random_state=RANDOM_SEED
)

print(f"Train samples: {len(train_data)}")
print(f"Val samples: {len(val_data)}")

Train samples: 929
Val samples: 165


In [43]:
# Create HuggingFace datasets
train_ds = Dataset.from_list(train_data)
val_ds = Dataset.from_list(val_data)

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 929
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 165
    })
})


# Tokenization

In [44]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
print(f"Tokenizer loaded: {MODEL_NAME}")

Tokenizer loaded: roberta-large


In [45]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100 (ignored in loss)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a word gets the label
                label_ids.append(label[word_idx])
            else:
                # Subsequent subwords get -100 (ignored in loss)
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [46]:
encoded_ds = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print(encoded_ds)

Map:   0%|          | 0/929 [00:00<?, ? examples/s]

Map: 100%|██████████| 929/929 [00:00<00:00, 1074.15 examples/s]
Map:   0%|          | 0/165 [00:00<?, ? examples/s]
Map: 100%|██████████| 165/165 [00:00<00:00, 1144.35 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 929
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 165
    })
})





# Model Setup

In [47]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

print(f"Model loaded: {MODEL_NAME}")
print(f"Number of labels: {len(unique_labels)}")
print(f"Model parameters: {model.num_parameters():,}")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: roberta-large
Number of labels: 11
Model parameters: 354,321,419


In [48]:
# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metrics

In [49]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert to label names and filter out -100
    true_labels = []
    true_predictions = []

    for prediction, label in zip(predictions, labels):
        true_label = []
        true_prediction = []
        for pred, lab in zip(prediction, label):
            if lab != -100:  # Ignore special tokens
                true_label.append(id2label[lab])
                true_prediction.append(id2label[pred])
        true_labels.append(true_label)
        true_predictions.append(true_prediction)

    # Calculate metrics using seqeval
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training Configuration

In [50]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,

    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,

    fp16=True,
    gradient_accumulation_steps=2,
    report_to="none",
    seed=RANDOM_SEED,
)

# Train Model

In [51]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting training...")
trainer.train()

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.7127,0.048581,0.710602,0.72198,0.716245
2,0.0423,0.031478,0.823022,0.832606,0.827786
3,0.0254,0.02845,0.837681,0.841339,0.839506
4,0.0172,0.029545,0.853767,0.841339,0.847507
5,0.0123,0.029171,0.844118,0.835517,0.839795


TrainOutput(global_step=295, training_loss=0.13889382825059407, metrics={'train_runtime': 166.2283, 'train_samples_per_second': 27.943, 'train_steps_per_second': 1.775, 'total_flos': 2156987807009280.0, 'train_loss': 0.13889382825059407, 'epoch': 5.0})

# Evaluation

In [52]:
print("===== Evaluation on validation set =====")
val_metrics = trainer.evaluate(encoded_ds["validation"])
for k, v in val_metrics.items():
    print(f"{k}: {v:.4f}")

===== Evaluation on validation set =====


eval_loss: 0.0295
eval_precision: 0.8538
eval_recall: 0.8413
eval_f1: 0.8475
eval_runtime: 1.0132
eval_samples_per_second: 162.8470
eval_steps_per_second: 10.8560
epoch: 5.0000


In [53]:
# Get detailed classification report
predictions, labels, _ = trainer.predict(encoded_ds["validation"])
predictions = np.argmax(predictions, axis=2)

# Convert to label names
true_labels = []
true_predictions = []

for prediction, label in zip(predictions, labels):
    true_label = []
    true_prediction = []
    for pred, lab in zip(prediction, label):
        if lab != -100:
            true_label.append(id2label[lab])
            true_prediction.append(id2label[pred])
    true_labels.append(true_label)
    true_predictions.append(true_prediction)

print("\n===== Detailed Classification Report =====")
print(classification_report(true_labels, true_predictions))


===== Detailed Classification Report =====
              precision    recall  f1-score   support

       ACTOR       0.91      0.90      0.91       259
   CHARACTER       0.81      0.82      0.82       150
    DIRECTOR       0.84      0.82      0.83        57
       GENRE       0.44      0.42      0.43        36
       MOVIE       0.88      0.86      0.87       185

   micro avg       0.85      0.84      0.85       687
   macro avg       0.78      0.77      0.77       687
weighted avg       0.85      0.84      0.85       687

              precision    recall  f1-score   support

       ACTOR       0.91      0.90      0.91       259
   CHARACTER       0.81      0.82      0.82       150
    DIRECTOR       0.84      0.82      0.83        57
       GENRE       0.44      0.42      0.43        36
       MOVIE       0.88      0.86      0.87       185

   micro avg       0.85      0.84      0.85       687
   macro avg       0.78      0.77      0.77       687
weighted avg       0.85      0.84

# Save Model

In [54]:
trainer.model.save_pretrained(SAVED_MODEL_DIR)
tokenizer.save_pretrained(SAVED_MODEL_DIR)
print(f"Model saved to: {SAVED_MODEL_DIR}")

Model saved to: ../../model/NER/roberta_ner_saved


# Test Predictions

In [55]:
# Load the saved model for inference
from transformers import pipeline

ner_pipeline = pipeline(
    "token-classification",
    model=SAVED_MODEL_DIR,
    tokenizer=SAVED_MODEL_DIR,
    aggregation_strategy="simple"
)

# Test examples
test_texts = [
    "Christopher Nolan directed Inception starring Leonardo DiCaprio.",
    "The Dark Knight is one of the best action movies ever made.",
    "Tom Hanks played the role of Forrest Gump brilliantly.",
    "Martin Scorsese's Goodfellas is a masterpiece of the crime genre.",
]

print("===== Test Predictions =====")
for text in test_texts:
    print(f"\nText: {text}")
    results = ner_pipeline(text)
    if results:
        print("Entities:")
        for result in results:
            print(f"  {result['word']:20s} -> {result['entity_group']:12s} (score: {result['score']:.3f})")
    else:
        print("  No entities detected")

Device set to use cuda:0


===== Test Predictions =====

Text: Christopher Nolan directed Inception starring Leonardo DiCaprio.
Entities:
   Christopher Nolan   -> DIRECTOR     (score: 0.914)
   Inception           -> MOVIE        (score: 0.992)
   Leonardo DiCaprio   -> ACTOR        (score: 0.985)

Text: The Dark Knight is one of the best action movies ever made.
Entities:
   The Dark Knight     -> MOVIE        (score: 0.975)

Text: Tom Hanks played the role of Forrest Gump brilliantly.
Entities:
   Tom Hanks           -> ACTOR        (score: 0.982)
   Forrest Gump        -> CHARACTER    (score: 0.918)

Text: Martin Scorsese's Goodfellas is a masterpiece of the crime genre.
Entities:
   Martin Scorsese     -> DIRECTOR     (score: 0.973)
   Goodfellas          -> MOVIE        (score: 0.995)
   crime               -> GENRE        (score: 0.593)


# Test Set Evaluation

In [56]:
# Load test data
TEST_DATA_PATH = "test_ner.csv"
test_df = pd.read_csv(TEST_DATA_PATH)
print(f"Total test samples: {len(test_df)}")
test_df.head()

Total test samples: 45


Unnamed: 0,id,tokens,ner_tags
0,pos_601,I was surprised at the low rating this film go...,O O O O O O O O O O O O O O O O O O O O O O O ...
1,pos_602,"I have never danced flamenco before, but someh...",O O O O O O O O O O O O O O O O O O O O O O O ...
2,pos_603,The music of Albeniz pervades this film. Once ...,O O O O O O O O O O O O O O O O O O O O O O O ...
3,pos_604,"Saturday June 3, 6:30pm The Neptune Monday Jun...",O O O O O O O O O O O O O O O O O O O O O O O ...
4,pos_605,If Saura hadn't done anything like this before...,O B-DIRECTOR O O O O O O B-MOVIE O O O O O O O...


In [57]:
# Prepare test data with the same function used for training
test_data = prepare_data(test_df)
print(f"Prepared {len(test_data)} test samples")

# Create test dataset
test_ds = Dataset.from_list(test_data)
print(test_ds)

Prepared 45 test samples
Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 45
})


In [58]:
# Tokenize and align labels for test data
encoded_test_ds = test_ds.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_ds.column_names
)

print(encoded_test_ds)

Map: 100%|██████████| 45/45 [00:00<00:00, 402.66 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 45
})





In [59]:
# Evaluate on test set
print("===== Evaluation on test set =====")
test_metrics = trainer.evaluate(encoded_test_ds)
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}")

===== Evaluation on test set =====


eval_loss: 0.0364
eval_precision: 0.8848
eval_recall: 0.8284
eval_f1: 0.8557
eval_runtime: 0.2929
eval_samples_per_second: 153.6170
eval_steps_per_second: 10.2410
epoch: 5.0000


In [60]:
# Get detailed classification report for test set
test_predictions, test_labels, _ = trainer.predict(encoded_test_ds)
test_predictions = np.argmax(test_predictions, axis=2)

# Convert to label names
test_true_labels = []
test_true_predictions = []

for prediction, label in zip(test_predictions, test_labels):
    true_label = []
    true_prediction = []
    for pred, lab in zip(prediction, label):
        if lab != -100:
            true_label.append(id2label[lab])
            true_prediction.append(id2label[pred])
    test_true_labels.append(true_label)
    test_true_predictions.append(true_prediction)

print("\n===== Detailed Classification Report (Test Set) =====")
print(classification_report(test_true_labels, test_true_predictions))


===== Detailed Classification Report (Test Set) =====
              precision    recall  f1-score   support

       ACTOR       0.98      1.00      0.99        61
   CHARACTER       0.81      0.79      0.80        38
    DIRECTOR       0.77      0.91      0.83        11
       GENRE       0.64      0.33      0.44        27
       MOVIE       0.91      0.88      0.89        67

   micro avg       0.88      0.83      0.86       204
   macro avg       0.82      0.78      0.79       204
weighted avg       0.87      0.83      0.84       204

