In [1]:
import ast
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)

from seqeval.metrics import classification_report, f1_score

In [2]:
df = pd.read_csv("ner.csv")
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [3]:
df["Tag"] = df["Tag"].apply(ast.literal_eval)
df["POS"] = df["POS"].apply(ast.literal_eval)

In [4]:
tokens_clean = []
tags_clean = []

for sentence, tags in zip(df["Sentence"], df["Tag"]):
    tokens = sentence.split()
    if len(tokens) == len(tags):
        tokens_clean.append(tokens)
        tags_clean.append(tags)

df = pd.DataFrame({
    "Tokens": tokens_clean,
    "Tags": tags_clean
})

In [5]:
unique_labels = sorted(set(label for seq in df["Tags"] for label in seq))

label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_labels)

In [6]:
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [7]:
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
def tokenize_and_align(tokens, labels):
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=64
    )

    word_ids = encoding.word_ids()
    aligned_labels = []

    prev_word = None
    for word_id in word_ids:
        if word_id is None:
            aligned_labels.append(-100)
        elif word_id != prev_word:
            aligned_labels.append(label2id[labels[word_id]])
        else:
            aligned_labels.append(-100)
        prev_word = word_id

    encoding["labels"] = aligned_labels
    return encoding

In [9]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc = tokenize_and_align(row["Tokens"], row["Tags"])

        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            "labels": torch.tensor(enc["labels"])
        }

In [10]:
train_dataset = NERDataset(train_df)
val_dataset = NERDataset(val_df)

In [11]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=2)

    true_preds, true_labels = [], []

    for p, l in zip(preds, labels):
        tp, tl = [], []
        for pi, li in zip(p, l):
            if li != -100:
                tp.append(id2label[pi])
                tl.append(id2label[li])
        true_preds.append(tp)
        true_labels.append(tl)

    return {"f1": f1_score(true_labels, true_preds)}

In [13]:
training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none"
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [15]:
trainer.train()



Step,Training Loss
50,0.9371
100,0.3134
150,0.2166
200,0.2056
250,0.1613
300,0.1702
350,0.17
400,0.1486
450,0.1479
500,0.1608


TrainOutput(global_step=4796, training_loss=0.1259820195811306, metrics={'train_runtime': 12759.8431, 'train_samples_per_second': 3.007, 'train_steps_per_second': 0.376, 'total_flos': 626716872239616.0, 'train_loss': 0.1259820195811306, 'epoch': 1.0})

In [16]:
preds, labels, _ = trainer.predict(val_dataset)
preds = np.argmax(preds, axis=2)

true_preds, true_labels = [], []

for p, l in zip(preds, labels):
    tp, tl = [], []
    for pi, li in zip(p, l):
        if li != -100:
            tp.append(id2label[pi])
            tl.append(id2label[li])
    true_preds.append(tp)
    true_labels.append(tl)

print(classification_report(true_labels, true_preds))



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        92
         eve       0.53      0.19      0.28        52
         geo       0.85      0.90      0.88      7547
         gpe       0.95      0.94      0.94      3187
         nat       0.50      0.09      0.15        44
         org       0.70      0.68      0.69      3888
         per       0.75      0.78      0.77      3345
         tim       0.87      0.87      0.87      4029

   micro avg       0.83      0.84      0.83     22184
   macro avg       0.64      0.56      0.57     22184
weighted avg       0.82      0.84      0.83     22184



In [17]:
def predict_sentence(sentence):
    tokens = sentence.split()
    enc = tokenize_and_align(tokens, ["O"] * len(tokens))
    input_ids = torch.tensor([enc["input_ids"]])

    with torch.no_grad():
        outputs = model(input_ids)

    preds = torch.argmax(outputs.logits, dim=2)[0].tolist()
    return list(zip(tokens, [id2label[p] for p in preds[:len(tokens)]]))

predict_sentence("John works at Google in New York")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[('John', 'O'),
 ('works', 'B-per'),
 ('at', 'O'),
 ('Google', 'O'),
 ('in', 'B-org'),
 ('New', 'O'),
 ('York', 'B-geo')]