# Setup

In [2]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval
!pip3 install transformers[torch]
!pip3 install accelerate -U

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m533.1 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=613dfd2a45dd9cc0dc64c9427bfb62627e23ca3896a76dca8e53b65cec94d542
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:0

In [3]:
from random import sample

# Running the IndicNER Model

Let's try annotating some Indian-language sentences and extracting their named entities.

In [4]:
# Import all the necessary classes and initialize the tokenizer and model.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# The tokenizer and the token-classification model are both loaded from the
# same `ai4bharat/IndicNER` checkpoint.
INDIC_NER_CHECKPOINT = "ai4bharat/IndicNER"
tokenizer = AutoTokenizer.from_pretrained(INDIC_NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(INDIC_NER_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [5]:
def get_predictions( sentence, tokenizer, model ):
  """Predict one NER label per word of ``sentence``.

  The sentence is split into sub-words by the tokenizer, the model scores every
  sub-word, and only the label predicted for the first sub-word of each word is
  kept.

  Args:
    sentence: Text to annotate.
    tokenizer: Callable invoked as ``tokenizer(sentence, return_tensors='pt')``.
      Its result must expose ``word_ids()`` and be unpackable into
      ``model(**...)``.
    model: Callable whose output has a ``.logits`` tensor and whose
      ``config.id2label`` maps class ids to label strings.

  Returns:
    A list of label strings, one per word as segmented by the tokenizer.
  """
  # Let us first tokenize the sentence - split words into subwords
  tok_sentence = tokenizer(sentence, return_tensors='pt')

  # Keep the inputs on the same device as the model so the function also works
  # after the model has been moved to a GPU.
  device = getattr(model, 'device', None)
  if device is not None:
    tok_sentence = tok_sentence.to(device)

  with torch.no_grad():
    # Send the tokenized sentence to the model and keep the best class id per sub-word
    predicted_class_ids = model(**tok_sentence).logits.argmax(-1)[0]

  # Map every predicted class id to its class label
  predicted_tokens_classes = [model.config.id2label[class_id.item()] for class_id in predicted_class_ids]

  predicted_labels = []
  # Start from None (not 0): with a 0 sentinel the first word would be skipped
  # whenever the tokenizer does not emit a leading special token.
  previous_word_id = None
  for token_index, word_id in enumerate(tok_sentence.word_ids()):
    # Special tokens have no word id; continuation sub-words repeat the previous id.
    if word_id is not None and word_id != previous_word_id:
      predicted_labels.append(predicted_tokens_classes[token_index])
    previous_word_id = word_id

  return predicted_labels

In [6]:
# let us try with some example sentences here
sentence = 'अभी ये चर्चा चल ही रही थी कि दिग्विजय सिंह के पास मध्य प्रदेश के मुख्यमंत्री कमलनाथ का फोन आ गया'

predicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)

# Split on single spaces once and reuse the word list for all three printouts.
words = sentence.split(' ')

# 1) one "word<TAB>label" line per word
for position, word in enumerate(words):
    print(word + '\t' + predicted_labels[position])

# 2) the labels as a quoted, comma-separated list on one line
for position in range(len(words)):
    print('"' + predicted_labels[position] + '" , ', end='')
print()

# 3) the labels as a bare comma-separated list on one line
for position in range(len(words)):
    print(predicted_labels[position] + ',', end='')

अभी	O
ये	O
चर्चा	O
चल	O
ही	O
रही	O
थी	O
कि	O
दिग्विजय	B-PER
सिंह	I-PER
के	O
पास	O
मध्य	B-LOC
प्रदेश	I-LOC
के	O
मुख्यमंत्री	O
कमलनाथ	B-PER
का	O
फोन	O
आ	O
गया	O
"O" , "O" , "O" , "O" , "O" , "O" , "O" , "O" , "B-PER" , "I-PER" , "O" , "O" , "B-LOC" , "I-LOC" , "O" , "O" , "B-PER" , "O" , "O" , "O" , "O" , 
O,O,O,O,O,O,O,O,B-PER,I-PER,O,O,B-LOC,I-LOC,O,O,B-PER,O,O,O,O,

In [7]:
sentence = 'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು'

predicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)

# Split on single spaces once, then print one "word<TAB>label" line per word.
words = sentence.split(' ')
for position, word in enumerate(words):
    print(word + '\t' + predicted_labels[position])

ಶರಣ್	B-PER
ರ	O
ನೀವು	O
ನೋಡಲೇಬೇಕಾದ	O
ಟಾಪ್	O
5	O
ಕಾಮಿಡಿ	O
ಚಲನಚಿತ್ರಗಳು	O


# Naamapadam Dataset

The _Naamapadam_ dataset is a large dataset for Named Entity Recognition in 11 Indian languages. _Naamapadam_ means "named entity" in Sanskrit.

In [None]:
# Let's download the Naampadam (Indic NER) dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

# Name of the dataset configuration to load (presumably the language code for Hindi — confirm against the dataset card).
lang='hi'

# Load the dataset for that configuration; the result is indexed by split name in later cells.
raw_datasets = load_dataset('ai4bharat/naamapadam', lang)


Downloading builder script:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading and preparing dataset naamapadam_pr/hi to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/hi/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20...


Downloading data:   0%|          | 0.00/82.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded dataset object (the last expression of a cell is rendered).
raw_datasets

In [None]:
raw_datasets['train']

In [None]:
from random import Random

# Number of training examples to keep, and the seed that makes the draw repeatable.
sample_size = 20000
sample_seed = 42

# Assuming your data is loaded in the "train" split
train_data = raw_datasets['train']

# Never ask for more rows than the split holds: sampling raises ValueError otherwise.
rows_to_keep = min(sample_size, len(train_data))

# Draw distinct row indices with a dedicated, seeded generator so that a fresh
# run of the notebook selects the same subset.
random_indices = Random(sample_seed).sample(range(len(train_data)), rows_to_keep)

# Select the subset based on the random indices
train_subset = train_data.select(random_indices)

# NOTE: this replaces the full training split in place; later cells read
# raw_datasets['train'] and therefore see only the subset.
raw_datasets['train']=train_subset

raw_datasets

In [None]:
raw_datasets.column_names

In [None]:
# Print one training record as "token<TAB>tag" lines.
idx=1000
rec=raw_datasets['train'][idx]
for token, tag in zip(rec['tokens'], rec['ner_tags']):
    print('{}\t{}'.format(token, tag))


In [None]:
# Column names of the training split.
column_names = raw_datasets["train"].column_names
print(column_names)

# Feature schema of the training split; a later cell reads the label names from it.
features = raw_datasets["train"].features
print(features)

In [None]:
# Columns read by tokenize_and_align_labels: the word lists and their per-word tags.
text_column_name = "tokens"
label_column_name = "ner_tags"

In [None]:
# The tag column is a sequence of ClassLabel values, so the tags are already
# integers and the feature object carries the name <-> id mapping.
tag_feature = features[label_column_name].feature

label_list = tag_feature.names
num_labels = len(label_list)

label_to_id = {tag_name: tag_feature.str2int(tag_name) for tag_name in label_list}
print(label_to_id)


# Training an NER Model with the dataset

We have already seen how to get predictions from a fine-tuned NER model. We will now take the pre-trained IndicBERT model and fine-tune it for the NER task.

Let us download a pre-trained model and fine-tune it for NER. We will use the `AutoModelForTokenClassification` class to fine-tune the model.

**Load Pre-trained Model**

In [None]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
# Hand the config built above to the model; it was previously created but never used.
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', config=config)

# Store the tag names in the model config so that the saved checkpoint maps
# class ids to real labels (e.g. 'B-PER') instead of generic LABEL_<n> names.
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {tag_name: tag_id for tag_id, tag_name in enumerate(label_list)}

In [None]:
# Uncomment the line below if you want to use a GPU. Make sure that the Colab runtime is set accordingly

#model=model.to("cuda")

**Tokenize all texts and align the labels with them**

In [None]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word tags to sub-words.

    Reads the module-level ``tokenizer``, ``padding``, ``text_column_name`` and
    ``label_column_name``. Every sequence is truncated and, with the current
    ``padding`` setting, padded to 512 tokens.

    Args:
        examples: Batch mapping column names to lists; the text column holds one
            list of words per example and the label column one tag id per word.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: for each example,
        the word's tag on the first sub-word of every word and -100 everywhere else.
    """
    encoded = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # The texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    def _align(word_ids, word_labels):
        # Special tokens (word id None) and the non-first sub-words of a word get
        # -100 so that they are automatically ignored in the loss function.
        aligned = []
        last_word = None
        for word_idx in word_ids:
            starts_new_word = word_idx is not None and word_idx != last_word
            aligned.append(word_labels[word_idx] if starts_new_word else -100)
            last_word = word_idx
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples[label_column_name])
    ]
    return encoded

In [None]:
# Tokenize the training split in batches, spread over 4 worker processes.
train_dataset = raw_datasets["train"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

In [None]:
# Tokenize the validation split the same way and show how many examples it holds.
eval_dataset = raw_datasets["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
)
len(eval_dataset)

**Create Data Collator, Metrics**

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Metrics
from datasets import load_metric
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into a flat dictionary of seqeval scores.

    Reads the module-level ``label_list`` and ``metric``.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token class
            scores (argmax is taken over axis 2) and ``labels`` the gold tag ids,
            with -100 marking positions to ignore.

    Returns:
        dict mapping metric names to values; nested per-entity results are
        flattened to ``"<entity>_<metric>"`` keys.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only the positions whose gold label is not the ignore index (-100)
    # and translate the ids on both sides to tag names.
    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

**Set Training Arguments**

In [None]:
# Alternative kept for reference: a run limited to 5 optimization steps.
# args=TrainingArguments(output_dir='output_dir',max_steps=5)
args=TrainingArguments(
    output_dir='output_dir',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=2500,
    # Gradients are accumulated over 2 batches of 4 examples before each update.
    gradient_accumulation_steps=2,
    dataloader_num_workers=2,
    )


**Training**

In [None]:
# Initialize our Trainer
# Optional early stopping (currently disabled). To enable it, uncomment the
# settings below together with the `callbacks=` argument of the Trainer call.
# early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)
# args.metric_for_best_model = "f1"
# args.load_best_model_at_end = True
# args.evaluation_strategy = IntervalStrategy.STEPS
# args.eval_steps = args.save_steps
# args.greater_is_better = True

# The Trainer receives the tokenized train/validation splits, the collator and
# the metric function defined in the previous cells.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[early_stopping_callback],
    args=args,
)

In [None]:
trainer.args

In [35]:
# Fine-tune the model and keep the metrics reported by the training run.
train_result = trainer.train()
# NOTE: `metrics` is rebound by the evaluation cell that follows.
metrics = train_result.metrics

Step,Training Loss
500,0.4146
1000,0.3112
1500,0.284
2000,0.251
2500,0.2527
3000,0.2097
3500,0.2044




In [None]:

metrics = trainer.evaluate()

trainer.log_metrics("eval", metrics)

In [None]:
model.save_pretrained('path/to/save/model')
tokenizer.save_pretrained('path/to/save/tokenizer')

# Evaluating on Test Data

In [23]:
# Tokenize the test split with the same settings as the train and validation
# splits. Four workers are plenty for a split of this size, whereas 32 processes
# would each handle a single tiny batch.
tokenized_test_set = raw_datasets['test'].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)
len(tokenized_test_set)

                                     

Running tokenizer on test dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #4:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Running tokenizer on test dataset #5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #6:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #7:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on test dataset #8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #10:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on test dataset #13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #15:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #14:   0%|          | 0/1 [00:00<?, ?ba/s]

      

Running tokenizer on test dataset #18:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #17:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #21:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #20:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on test dataset #23:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Running tokenizer on test dataset #24:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #25:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #26:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on test dataset #28:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #29:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #31:   0%|          | 0/1 [00:00<?, ?ba/s]

867

In [44]:
# Run the fine-tuned model on the tokenized test split.
predictions, labels, metrics = trainer.predict(tokenized_test_set)

# These are test-set scores (the metric keys carry the `test_` prefix), so log
# them under the matching split name instead of "eval".
trainer.log_metrics("test", metrics)

***** eval metrics *****
  test_LOC_f1             =     0.7002
  test_LOC_number         =        614
  test_LOC_precision      =     0.7083
  test_LOC_recall         =     0.6922
  test_ORG_f1             =     0.6383
  test_ORG_number         =        525
  test_ORG_precision      =     0.6076
  test_ORG_recall         =     0.6724
  test_PER_f1             =     0.7335
  test_PER_number         =        790
  test_PER_precision      =     0.7267
  test_PER_recall         =     0.7405
  test_loss               =     0.2226
  test_overall_accuracy   =     0.9334
  test_overall_f1         =     0.6963
  test_overall_precision  =     0.6863
  test_overall_recall     =     0.7066
  test_runtime            = 0:00:16.72
  test_samples_per_second =     51.853
  test_steps_per_second   =      6.519


In [None]:
# Score the model on the tokenized training split as well.
# NOTE: this rebinds `predictions`, `labels` and `metrics`, replacing the
# test-set values produced by the previous cell.
predictions, labels, metrics = trainer.predict(train_dataset)

trainer.log_metrics("train", metrics)

# Load the fine-tuned model from disk

In [None]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

# Reload config, tokenizer and model from the directories written by the
# save_pretrained calls earlier in this notebook.
config = AutoConfig.from_pretrained('path/to/save/model')
tokenizer = AutoTokenizer.from_pretrained('path/to/save/tokenizer')
model = AutoModelForTokenClassification.from_pretrained('path/to/save/model',config=config )

In [None]:
# Trainer around the reloaded model; no train/eval datasets are attached here.
# NOTE(review): no `args` is passed, so the Trainer presumably falls back to its
# default TrainingArguments — confirm the defaults are acceptable.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Loading any given data so that it can be evaluated

In [82]:
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Value, Features
import json

# Define the path to your JSONL file
jsonl_file_path = '/kaggle/input/assignment2/25_1_1.json'

# Define the features schema: a list of word strings and a parallel list of tags.
# The tags are ClassLabel values drawn from the names listed below.
features = Features({
    "words": Sequence(feature=Value(dtype="string")),
    "ner": Sequence(feature=ClassLabel(names=["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]))
})

# Function to read JSONL file and parse data
def read_jsonl_data(file_path):
    """Read a JSON Lines file into column-oriented lists.

    Args:
        file_path: Path to a UTF-8 text file with one JSON object per line.
            Each object must provide the keys ``'words'`` and ``'ner'``.

    Returns:
        dict with the keys ``'words'`` and ``'ner'``; each value is a list
        holding that field of every record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'words'`` or ``'ner'``.
    """
    data = {'words': [], 'ner': []}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (for example a trailing newline at the end of the
            # file) carry no record; skip them instead of failing to parse.
            if not line:
                continue
            record = json.loads(line)
            data['words'].append(record['words'])
            data['ner'].append(record['ner'])
    return data

# Read the records of the JSONL file into column lists
data = read_jsonl_data(jsonl_file_path)

# Create a Dataset from the data with the correct features schema
dataset = Dataset.from_dict(data, features=features)

# Wrap the dataset in a DatasetDict; every record of the file goes into a single
# split named 'train' (more splits can be added as needed).
dataset_dict = DatasetDict({
    'train': dataset  # You can add more splits as needed
})

# Show the split sizes and the schema that was applied.
print(dataset_dict)
print(dataset_dict['train'].features)


DatasetDict({
    train: Dataset({
        features: ['words', 'ner'],
        num_rows: 25
    })
})
{'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [83]:
# Print one record of the custom dataset as "word<TAB>tag id" lines.
idx=2
rec=dataset_dict['train'][idx]
for word, tag_id in zip(rec['words'], rec['ner']):
    print('{}\t{}'.format(word, tag_id))


इस	0
गेम	0
को	0
एयर	3
फोर्स	4
के	0
एयर	0
चीफ	0
मार्शल	0
बी	1
एस	2
धनोआ	2
ने	0
अधिकारिक	0
रूप	0
से	0
लॉन्च	0
किया	0


In [84]:
# Column names of the custom dataset.
column_names = dataset_dict["train"].column_names
print(column_names)

# Feature schema of the custom dataset; the label names are read from it below.
features = dataset_dict["train"].features
print(features)

['words', 'ner']
{'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [85]:
# Point tokenize_and_align_labels at the columns of the custom dataset.
# NOTE: this rebinds the module-level names that were previously set to
# "tokens" / "ner_tags" for the Naamapadam data.
text_column_name = "words"
label_column_name = "ner"

In [99]:
# The tag column is a sequence of ClassLabel values, so the tags are already
# integers and the feature object carries the name <-> id mapping.
tag_feature = features[label_column_name].feature

label_list = tag_feature.names
num_labels = len(label_list)

label_to_id = {tag_name: tag_feature.str2int(tag_name) for tag_name in label_list}

print(label_list)


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


In [87]:
# Tokenize the custom dataset loaded above. The previous version mapped
# `eval_dataset` here, which discarded the custom data assigned to `data_1`
# one line earlier.
data_1 = dataset_dict['train'].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on custom dataset",
)
len(data_1)

    

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 

Running tokenizer on Validation dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


 

Running tokenizer on Validation dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

25