In [None]:
# Step 1: Bring in the HuggingFace dataset loader
from datasets import load_dataset
# Step 2: Configuration names to request from the dataset repo (Indian language codes)
arr=['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']
# Step 3: Load each configuration of Naamapadam; k[n] holds the result for arr[n]
k = [load_dataset("ai4bharat/naamapadam", lang_code) for lang_code in arr]

In [None]:
# Step 4: Display the loaded datasets for all languages
# (one entry per code in arr, in the same order they were loaded).
k

[DatasetDict({
     train: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 10266
     })
     test: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 51
     })
     validation: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 52
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 961679
     })
     test: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 607
     })
     validation: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 4859
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 472845
     })
     test: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 1076
     })
     validation: Dataset({
         features: ['tokens', 'ner_tags'],
         num_rows: 2389
     })
 }),
 DatasetDict({
     train: Dataset({
         features:

In [None]:
# Step 5: Import the split container and the helper used to merge datasets
from datasets import DatasetDict, concatenate_datasets
# Steps 6-8: Merge every language's data, one split at a time
# (evaluated in the order train, test, validation)
combined_train, combined_test, combined_validation = (
    concatenate_datasets([language_dd[split_name] for language_dd in k])
    for split_name in ('train', 'test', 'validation')
)
# Step 9: Assemble the merged splits into a single DatasetDict
combined_dataset = DatasetDict({
    'train': combined_train,
    'test': combined_test,
    'validation': combined_validation,
})
# Step 10: Print and display the combined dataset
print(combined_dataset)
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5740190
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9266
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 37887
    })
})


In [None]:
# Step 11: Get the NER tag feature from the first language's train split.
# Only k[0] is consulted here, although the training data is the merge of all languages.
# NOTE(review): assumes every language configuration declares the same ner_tags labels — TODO confirm.
ner_feature = k[0]["train"].features["ner_tags"]

In [None]:
# Step 12: Extract label names for NER tags.
# label_names is indexed by integer tag id: later cells use label_names[i] to
# decode tags and enumerate(label_names) to build the id2label mapping.
label_names = ner_feature.feature.names

In [None]:
# Step 13: Load the tokenizer from HuggingFace Transformers
from transformers import AutoTokenizer
# NOTE(review): this looks like the English, lower-casing BERT checkpoint, while the
# data loaded above is for Indian-language configurations — confirm that its vocabulary
# covers these scripts, or consider a multilingual checkpoint. The training cell further
# down hard-codes the same checkpoint name, so both places must change together.
model_name= "google-bert/bert-base-uncased"
# model_checkpoint is the name the model-loading cells read.
model_checkpoint =model_name
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Step 14: Function to align NER labels with tokenized words
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: Tag ids, one per word, indexable by word index.
        word_ids: For every token, the index of the word it belongs to,
            or None for a token that belongs to no word.

    Returns:
        A list with one entry per element of ``word_ids``:
        -100 where the word id is None; the word's tag for the first token
        of a word; and, for continuation tokens of the same word, the word's
        tag with odd ids bumped by one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token outside any word: ignored by the loss, and it ends the current word.
            aligned.append(-100)
            previous_word = None
            continue

        tag = labels[word_id]
        is_continuation = word_id == previous_word
        if is_continuation and tag % 2 == 1:
            # A B-XXX tag repeated inside the same word turns into I-XXX.
            tag += 1
        previous_word = word_id
        aligned.append(tag)
    return aligned

In [None]:
# Step 15: Tokenize and align labels for the dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with a "tokens" entry (word lists) and an
            "ner_tags" entry (word-level tag ids), one item per sentence.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding the aligned tag ids for each sentence.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) maps each token of sentence `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Step 16: Apply tokenization and label alignment to the combined dataset.
# batched=True passes tokenize_and_align_labels a batch of examples per call,
# which is what that function iterates over.
# remove_columns drops the columns named by the train split, so the mapped
# splits keep only what tokenize_and_align_labels returns.
tokenized_datasets = combined_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=combined_dataset["train"].column_names,
)

Map:   0%|          | 0/5740190 [00:00<?, ? examples/s]

Map:   0%|          | 0/9266 [00:00<?, ? examples/s]

Map:   0%|          | 0/37887 [00:00<?, ? examples/s]

In [None]:
# Step 17: Import and create a data collator for token classification.
# It is built from the same tokenizer object that tokenized the datasets.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

2024-06-15 04:51:47.817192: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 04:51:47.817257: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 04:51:47.818765: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Step 18: Create a batch and inspect the labels.
# Collates the first two tokenized training examples; the displayed value is
# the "labels" entry of the resulting batch.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         -100, -100]])

In [None]:
# Step 19: Print the labels for the first two training samples
# (the per-example label lists before collation, for comparison with the batch).
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
# Step 20: Install evaluation libraries
!pip install seqeval evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
# Step 21: Import the evaluate library and load the seqeval metric.
# `metric` is the object that compute_metrics calls during evaluation.
import evaluate
metric = evaluate.load("seqeval")

In [None]:
# Step 22: Assign the combined dataset to raw_datasets for evaluation.
# This is an alias, not a copy: both names refer to the same DatasetDict.
raw_datasets=combined_dataset

In [None]:
# Step 23: Decode the first training sample's integer NER tags into tag names
labels = [label_names[tag_id] for tag_id in raw_datasets["train"][0]["ner_tags"]]
labels

['O', 'O', 'O', 'O', 'O', 'O', 'O']

In [None]:
# Step 24: Create dummy predictions and compute metrics.
# NOTE(review): the sample decoded in the previous cell is displayed as all "O",
# so writing "O" at index 2 leaves predictions identical to labels. The captured
# result (precision/recall/F1 of 0.0 with accuracy 1.0) suggests no entity spans
# were scored — confirm whether a sample that contains entities was intended here.
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [None]:
# Step 25: Define compute_metrics function for evaluation during training
import numpy as np
def compute_metrics(eval_preds):
    """Score token-level predictions with the loaded sequence-labelling metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Positions whose label is
            -100 are left out of both the references and the predictions.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", read from the
        metric's overall_* entries.
    """
    logits, labels = eval_preds
    # The predicted class is the highest-scoring entry along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping ignored positions.
    reference_tags = []
    for gold_row in labels:
        reference_tags.append([label_names[g] for g in gold_row if g != -100])

    # Predicted tag names at exactly the positions kept in the references.
    predicted_tags = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = [label_names[p] for (p, g) in zip(pred_row, gold_row) if g != -100]
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
# Step 26: Build the two lookup tables handed to the model
# id2label: integer tag id -> tag name, in label_names order
id2label = dict(enumerate(label_names))
# label2id: the inverse mapping, tag name -> integer tag id
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

In [None]:
# Step 27: Load the token classification model from HuggingFace.
# The tag-name mappings are passed to from_pretrained along with the checkpoint.
# The load message below reports classifier.weight and classifier.bias as newly
# initialized, i.e. they were not taken from the checkpoint.
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 28: List files in the current directory (for debugging)
import os
# Shows what already exists in the working directory before the cleanup and training cells run.
os.listdir(".")

['wandb', 'state.db', 'results']

In [None]:
# Step 29: Remove previous trainer directory if exists
import shutil

try:
    shutil.rmtree('./tmp_trainer')
except FileNotFoundError:
    # Nothing to clean up on a fresh run; any other failure (e.g. permissions)
    # still surfaces so it is not silently ignored.
    pass

FileNotFoundError: [Errno 2] No such file or directory: './tmp_trainer'

In [None]:
# Step 30: Trainer setup placeholder

In [None]:
# Step 31: Import necessary modules for training
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments

# Step 32: Load the model and tokenizer again (redundant, but kept for clarity).
# The checkpoint name is assigned before anything is loaded from it, so the model
# and the tokenizer in this cell always come from the same checkpoint.
model_checkpoint = "google-bert/bert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 33: Define training arguments
args = TrainingArguments(
    # Explicit checkpoint directory; transformers releases that require
    # output_dir reject a call without it.
    output_dir="./results",
    evaluation_strategy="epoch",
    # Must match the evaluation strategy, otherwise load_best_model_at_end is rejected.
    save_strategy="epoch",
    # Limit the total amount of checkpoints. Deletes the older checkpoints.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # Load the best model found at the end of training
    metric_for_best_model="f1",  # The "f1" entry returned by compute_metrics picks the best model
)

# Step 34: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Step 35: Train the model and push to HuggingFace Hub.
# NOTE(review): push_to_hub presumably needs Hub credentials to be configured
# beforehand; no login step is visible in this notebook — confirm.
trainer.train()
trainer.push_to_hub(commit_message="Training complete")