In [1]:
pip install datasets transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import required libraries
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (BertTokenizerFast, AutoModelForTokenClassification, AutoTokenizer,
                          TrainingArguments, Trainer, pipeline, DataCollatorForTokenClassification)
import torch
import json
import pandas as pd
import os

2024-08-27 09:40:22.026875: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-27 09:40:22.026977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-27 09:40:22.151632: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Optional: Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# load ner data file. And split text and labels
def load_ner_datasets(file_path):
    """
    Load and parse the NER dataset from the specified file.
    """
    sentences = []
    sentence = {"text": [], 'tags': []}
    
    try:
        with open(file_path, 'r') as f:
            for line in f:
                if line.strip() == "":
                    if sentence['text']:
                        sentences.append(sentence)
                    sentence = {"text": [], 'tags': []}
                else:
                    token, label = line.split(":-")
                    sentence['text'].append(token)
                    sentence['tags'].append(label.upper())
        return sentences
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")
        return []
    
file_path = '/kaggle/input/ner-v7/ner_dataset.txt'
ner_sentences = load_ner_datasets(file_path)
print(f"Loaded {len(ner_sentences)} sentences from the dataset.")


Loaded 200000 sentences from the dataset.


In [5]:
ner_sentences[0]

{'text': ['07-Aug-24',
  '00:38:11',
  'credited to',
  'Allahabad Bank',
  '3352.85',
  '71647.15',
  'XX7317',
  'Rs',
  'on',
  'at',
  'Flipkart.',
  'Avl',
  'Lmt:',
  'Contact:',
  '+917088412672'],
 'tags': [' DATE\n',
  ' TIME\n',
  ' TRANSACTION_TYPE\n',
  ' BANK\n',
  ' MONEY\n',
  ' MONEY\n',
  ' ACCOUNT_NUMBER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n',
  ' OTHER\n']}

In [6]:
hf_dataset = Dataset.from_pandas(pd.DataFrame(ner_sentences))
print(f"Dataset contains {hf_dataset.num_rows} rows.")

# Split the dataset into training and test sets
train_test_split_result = hf_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split_result['train']
test_dataset = train_test_split_result['test']

print(f"Training set size: {train_dataset.num_rows}")
print(f"Test set size: {test_dataset.num_rows}")

Dataset contains 200000 rows.
Training set size: 180000
Test set size: 20000


In [7]:
tokenizer_name = 'bert-base-cased'
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)
print(f"Tokenizer {tokenizer_name} loaded.")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizer bert-base-cased loaded.


In [8]:
train_dataset['text'][0]

['18-Aug-24',
 '18:43:53',
 'Spent on',
 'Punjab & Sind Bank',
 '1576.77',
 '8423.23',
 'XX4622',
 '₹',
 'at',
 'MakeMyTrip.',
 'Avl',
 'Lmt:']

In [9]:
# Test tokenizer
inputs = tokenizer(train_dataset["text"][0],  truncation=True, is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '18',
 '-',
 'Aug',
 '-',
 '24',
 '18',
 ':',
 '43',
 ':',
 '53',
 'S',
 '##pent',
 'on',
 'Punjab',
 '&',
 'Sin',
 '##d',
 'Bank',
 '157',
 '##6',
 '.',
 '77',
 '84',
 '##23',
 '.',
 '23',
 'X',
 '##X',
 '##46',
 '##22',
 '₹',
 'at',
 'Make',
 '##M',
 '##y',
 '##T',
 '##rip',
 '.',
 'A',
 '##v',
 '##l',
 'L',
 '##m',
 '##t',
 ':',
 '[SEP]']

In [10]:
word_ids = inputs.word_ids()

print(word_ids)

[None, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, None]


In [11]:
unique_labels = set(label for sentence in hf_dataset["tags"] for label in sentence)
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"], 
#         padding=True, 
        truncation=True, 
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                # Append -100 for special tokens (like [CLS], [SEP]) or padding
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Assign the label corresponding to the word index
                label_ids.append(label2id[label[word_idx]])
            else:
                # For subword tokens, repeat the label of the first subword token
                label_ids.append(label2id[label[word_idx]])
                    
            previous_word_idx = word_idx
        
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=train_dataset.column_names)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [13]:
final_dataset = DatasetDict({
    'train': tokenized_train_dataset,
    'test': tokenized_test_dataset
})
final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 180000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [14]:
# for example in final_dataset["train"]:
#     assert len(example["input_ids"]) == len(example["labels"]), "Mismatch between input and label lengths"

In [15]:
final_dataset = final_dataset.select_columns(['input_ids', 'attention_mask', 'labels'])
final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 180000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [16]:
batch_pre_collation = [ final_dataset["train"][i] for i in range(1) ]
batch_pre_collation[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [17]:
for example in batch_pre_collation:
    print(example)
    print(f">>> Length: {len(example['input_ids'])} {len(example['labels'])}")
    for token, label in zip(tokenizer.convert_ids_to_tokens(example["input_ids"]),example["labels"]): 
        print(f"{token:~<20} {label}") 

{'input_ids': [101, 1407, 118, 16892, 118, 1572, 1407, 131, 3887, 131, 4389, 156, 22083, 1113, 8907, 111, 14009, 1181, 2950, 18611, 1545, 119, 5581, 5731, 22737, 119, 1695, 161, 3190, 23435, 20581, 838, 1120, 7102, 2107, 1183, 1942, 16669, 119, 138, 1964, 1233, 149, 1306, 1204, 131, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 6, 6, 6, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -100]}
>>> Length: 47 47
[CLS]~~~~~~~~~~~~~~~ -100
18~~~~~~~~~~~~~~~~~~ 2
-~~~~~~~~~~~~~~~~~~~ 2
Aug~~~~~~~~~~~~~~~~~ 2
-~~~~~~~~~~~~~~~~~~~ 2
24~~~~~~~~~~~~~~~~~~ 2
18~~~~~~~~~~~~~~~~~~ 5
:~~~~~~~~~~~~~~~~~~~ 5
43~~~~~~~~~~~~~~~~~~ 5
:~~~~~~~~~~~~~~~~~~~ 5
53~~~~~~~~~~~~~~~~~~ 5
S~~~~~~~~~~~~~~~~~~~ 6
##pent~~~~~~~~~~~~~~ 6
on~~~~~~~~~~~~~~~~~~ 6
Punjab~~~~~~~~~~~~~~ 1
&~~~~~~~~~~~~~~~~~~~ 1
S

In [18]:
list(unique_labels)

[' TIME\n',
 ' MONEY\n',
 ' BANK\n',
 ' ACCOUNT_NUMBER\n',
 ' DATE\n',
 ' TRANSACTION_TYPE\n',
 ' OTHER\n']

In [19]:
id2label

{0: ' ACCOUNT_NUMBER\n',
 1: ' BANK\n',
 2: ' DATE\n',
 3: ' MONEY\n',
 4: ' OTHER\n',
 5: ' TIME\n',
 6: ' TRANSACTION_TYPE\n'}

In [20]:
label2id

{' ACCOUNT_NUMBER\n': 0,
 ' BANK\n': 1,
 ' DATE\n': 2,
 ' MONEY\n': 3,
 ' OTHER\n': 4,
 ' TIME\n': 5,
 ' TRANSACTION_TYPE\n': 6}

In [21]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(tokenizer_name, num_labels=len(list(unique_labels))
                                                        , id2label=id2label, label2id=label2id)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING']="0"
os.environ['TORCH_USE_CUDA_DSA'] = "0"

In [23]:
os.environ['WANDB_DISABLED'] = 'true'

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset['train'],
    eval_dataset=final_dataset['test'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer)
)
print("Trainer initialized.")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer initialized.


In [26]:
print('start training model')
trainer.train()

start training model


Epoch,Training Loss,Validation Loss
1,0.0,5e-06
2,0.0,1e-06
3,0.0,0.0


TrainOutput(global_step=16875, training_loss=0.0009498797742536462, metrics={'train_runtime': 5200.2888, 'train_samples_per_second': 103.84, 'train_steps_per_second': 3.245, 'total_flos': 1.6915646868912e+16, 'train_loss': 0.0009498797742536462, 'epoch': 3.0})

In [27]:
# Save the model and tokenizer
output_dir = "/kaggle/working/model"

# Create the directory if it does not exist
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

('/kaggle/working/model/tokenizer_config.json',
 '/kaggle/working/model/special_tokens_map.json',
 '/kaggle/working/model/vocab.txt',
 '/kaggle/working/model/added_tokens.json',
 '/kaggle/working/model/tokenizer.json')

In [28]:
# Replace with the path to your fine-tuned model
model_checkpoint = "/kaggle/working/model"

# Load tokenizer and model
tuned_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tuned_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [29]:
from transformers import pipeline

ner_pipeline = pipeline(
    'ner',
    model=tuned_model,
    tokenizer=tuned_tokenizer,
    device=0 if torch.cuda.is_available() else -1 # use GPU if available
)


In [30]:
# Example text for prediction
text = "399.00 spent on ICICI Bank Card XX1000 on 16-Jun-22 at Amazon. Avl Lmt: INR 1,38,768.54."

In [31]:
predictions = ner_pipeline(text)

for prediction in predictions:
    print(prediction)
#     print(f"Entity: {prediction['entity']}, Label: {prediction['label']}, Score: {prediction['score']}")

{'entity': ' MONEY\n', 'score': 0.9999988, 'index': 1, 'word': '39', 'start': 0, 'end': 2}
{'entity': ' MONEY\n', 'score': 0.99999976, 'index': 2, 'word': '##9', 'start': 2, 'end': 3}
{'entity': ' MONEY\n', 'score': 0.99999976, 'index': 3, 'word': '.', 'start': 3, 'end': 4}
{'entity': ' MONEY\n', 'score': 0.9999987, 'index': 4, 'word': '00', 'start': 4, 'end': 6}
{'entity': ' TRANSACTION_TYPE\n', 'score': 0.99997497, 'index': 5, 'word': 'spent', 'start': 7, 'end': 12}
{'entity': ' TRANSACTION_TYPE\n', 'score': 0.99999726, 'index': 6, 'word': 'on', 'start': 13, 'end': 15}
{'entity': ' BANK\n', 'score': 0.99999917, 'index': 7, 'word': 'I', 'start': 16, 'end': 17}
{'entity': ' BANK\n', 'score': 0.9999993, 'index': 8, 'word': '##CI', 'start': 17, 'end': 19}
{'entity': ' BANK\n', 'score': 0.9999993, 'index': 9, 'word': '##CI', 'start': 19, 'end': 21}
{'entity': ' BANK\n', 'score': 0.9999993, 'index': 10, 'word': 'Bank', 'start': 22, 'end': 26}
{'entity': ' BANK\n', 'score': 0.9999988, 'inde

In [32]:
def predict(text: str):
    model = tuned_model
    model.to(device)
    # Tokenize the text
    tokens = tuned_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**tokens)
    logits = outputs.logits

    # Get the predicted labels
    predictions = torch.argmax(logits, dim=-1).squeeze().cpu().tolist()
    tokens = tokens["input_ids"].squeeze().cpu().tolist()

    # Convert predictions to label names
    labels = [model.config.id2label[pred] for pred in predictions]
#     print(labels)
    tokens = [tuned_tokenizer.decode([token]) for token in tokens]

    # Filter out special tokens
    result = [{"token": token, "label": label} for token, label in zip(tokens, labels) if token not in tokenizer.all_special_tokens]

    return {"tokens": result}


In [33]:
predict(text)['tokens']

[{'token': '39', 'label': ' MONEY\n'},
 {'token': '##9', 'label': ' MONEY\n'},
 {'token': '.', 'label': ' MONEY\n'},
 {'token': '00', 'label': ' MONEY\n'},
 {'token': 'spent', 'label': ' TRANSACTION_TYPE\n'},
 {'token': 'on', 'label': ' TRANSACTION_TYPE\n'},
 {'token': 'I', 'label': ' BANK\n'},
 {'token': '##CI', 'label': ' BANK\n'},
 {'token': '##CI', 'label': ' BANK\n'},
 {'token': 'Bank', 'label': ' BANK\n'},
 {'token': 'Card', 'label': ' BANK\n'},
 {'token': 'X', 'label': ' ACCOUNT_NUMBER\n'},
 {'token': '##X', 'label': ' ACCOUNT_NUMBER\n'},
 {'token': '##100', 'label': ' ACCOUNT_NUMBER\n'},
 {'token': '##0', 'label': ' ACCOUNT_NUMBER\n'},
 {'token': 'on', 'label': ' DATE\n'},
 {'token': '16', 'label': ' DATE\n'},
 {'token': '-', 'label': ' DATE\n'},
 {'token': 'Jun', 'label': ' DATE\n'},
 {'token': '-', 'label': ' DATE\n'},
 {'token': '22', 'label': ' DATE\n'},
 {'token': 'at', 'label': ' OTHER\n'},
 {'token': 'Amazon', 'label': ' OTHER\n'},
 {'token': '.', 'label': ' OTHER\n'},
 