In [4]:
import spacy, re
import numpy as np
import torch
from skweak import heuristics, gazetteers, generative, utils
from skweak import aggregation, utils
from transformers import pipeline, AutoModelForTokenClassification, TrainingArguments, Trainer
from skweak.base import SpanAnnotator
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric, concatenate_datasets
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline




  from .autonotebook import tqdm as notebook_tqdm


In [5]:

class HuggingFaceAnnotator(SpanAnnotator):
    def __init__(self, name, model, tokenizer, label2id, train_dataset, val_dataset, test_dataset):
        super().__init__(name)
        self.model = model
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = {id: label for label, id in label2id.items()}
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.name = name
        
    def tokenize_and_align_labels(self, examples):
        label_all_tokens = False
        tokenized_inputs = self.tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

        labels = []
        #for wikineural this would be called ner_tags but for wikiann it is tags
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs
    def reverse_map_predictions(self, original_tokens, tokenized_inputs, predictions):
        # Assuming predictions are aligned with the tokenized input (subtokens)
        word_ids = tokenized_inputs.word_ids()  # Get word IDs for all tokens in the batch
        reversed_predictions = []
        current_word_id = None
        current_word_predictions = []
        
        for word_id, prediction in zip(word_ids, predictions):
            if word_id is None:
                # Skipping special tokens like [CLS], [SEP], etc.
                continue
            
            if word_id != current_word_id:
                # Encountering a new word, decide the label for the previous word
                if current_word_predictions:
                    # You can implement different strategies here
                    # For simplicity, taking the first prediction for the word
                    reversed_predictions.append(current_word_predictions[0])
                current_word_predictions = [prediction]
                current_word_id = word_id
            else:
                # Accumulating predictions for subtokens of the same word
                current_word_predictions.append(prediction)
        
        # Don't forget to add the prediction for the last word
        if current_word_predictions:
            reversed_predictions.append(current_word_predictions[0])

        return [original_tokens, reversed_predictions]
    
    def compute_metrics(self, p):
        metric = load_metric("seqeval")
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [list(self.label2id.keys())[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [list(self.label2id.keys())[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    
    #Runs model on a single example
    def predict_single_example(self, example):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        #tokenize input tokens
        tokenized = self.tokenizer(example, truncation=True, is_split_into_words=True)
        
    
        input_token = torch.tensor([tokenized['input_ids']]).to(device)
        model = self.model.to(device)
        #make prediction 
        
        predictions = model(input_token)
        model_logits = predictions.logits 
        model_predictions = torch.argmax(model_logits, dim=-1)
        return model_predictions
    
    #Evaluates model and runs prediction on the whole test dataset 
    def predict_tag(self):
        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        #Tokenize the inputs to get the word ids. 
        print("tokenizing dataset")
        tokenized_traindataset = self.train_dataset.map(self.tokenize_and_align_labels, batched =True)
        tokenized_testdataset = self.test_dataset.map(self.tokenize_and_align_labels, batched=True)
        tokenized_evaldataset = self.val_dataset.map(self.tokenize_and_align_labels, batched=True)
        #Prepare model for pytorch dataloader 
        #tokenized_dataset.set_format(type='torch', columns=['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])
        self.model.eval()
        print("Defining a trainer object")
        args = TrainingArguments(
            "test",
            evaluation_strategy = "steps",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            weight_decay=0.01,
            push_to_hub=True,
            eval_steps=20000,
            save_steps=20000,
        )
        trainer = Trainer(
            self.model,
            args,
            train_dataset =tokenized_traindataset, 
            eval_dataset = tokenized_evaldataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics
        )
        
        print("Evaluate Using Model")
        print(trainer.evaluate())
        print("Predict Using Model")
        predictions, labels, _ = trainer.predict(tokenized_testdataset)
        predictions = np.argmax(predictions, axis=2)
        return predictions

# Load Models

In [6]:
wikineural_tags_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
wikineural_tags_to_int = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
wikineural_int_to_tags = {v: k for k,v in wikineural_tags_to_int.items()}

In [7]:

model_name = "Babelscape/wikineural-multilingual-ner"

wikineural_tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",truncation = True,  is_split_into_words=True)
wikineural_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(wikineural_tags_list), label2id = wikineural_tags_to_int, id2label = wikineural_int_to_tags)


In [8]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

model_name = "DunnBC22/roberta-base-finetuned-WikiNeural"

roberta_tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",truncation = True,  is_split_into_words=True)
roberta_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(wikineural_tags_list), label2id = wikineural_tags_to_int, id2label = wikineural_int_to_tags)

# Load Dataset

In [9]:

wikineural_datasets = load_dataset("Babelscape/wikineural")
wikineural_train_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

wikineural_val_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

wikineural_test_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

# Temp

In [9]:
wikineural_train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 3000
})

In [10]:
tokenized = wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True)

In [11]:
tokenized['input_ids']

[101,
 10747,
 13751,
 10379,
 18286,
 10105,
 19561,
 105868,
 83302,
 117,
 11816,
 10114,
 10105,
 11621,
 10173,
 11849,
 119,
 102]

In [12]:
wikineural_tokenizer.decode(tokenized['input_ids'])

'[CLS] This division also contains the Ventana Wilderness, home to the California condor. [SEP]'

We see that we are able to get the word mapping of the tokenizer. This allows us to map the further tokenized words back to the original word tokens. Hence we use this to get the model's predictions in the token and tag dimensions of the original dataset. 

In [13]:
wikineural_tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

['[CLS]',
 'This',
 'division',
 'also',
 'contains',
 'the',
 'Ve',
 '##ntana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'con',
 '##dor',
 '.',
 '[SEP]']

In [14]:
wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True).word_ids()

[None, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, None]

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
wikineural_input_token = torch.tensor([tokenized['input_ids']]).to(device)
wikineural_input_token

tensor([[   101,  10747,  13751,  10379,  18286,  10105,  19561, 105868,  83302,
            117,  11816,  10114,  10105,  11621,  10173,  11849,    119,    102]],
       device='cuda:0')

In [22]:
wikineural_model= wikineural_model.to(device)
model_prediction = wikineural_model(wikineural_input_token)

In [23]:
model_logits = model_prediction.logits 
model_predictions = torch.argmax(model_logits, dim=-1)

In [24]:
model_predictions[0]

tensor([0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0], device='cuda:0')

original tokens

In [25]:
wikineural_train_dataset[0]["tokens"]

['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.']

In [76]:
def reverse_map_predictions(original_tokens, tokenized_inputs, predictions):
    # Assuming predictions are aligned with the tokenized input (subtokens)
    word_ids = tokenized_inputs.word_ids()  # Get word IDs for all tokens in the batch
    reversed_predictions = []
    current_word_id = None
    current_word_predictions = []
    
    for word_id, prediction in zip(word_ids, predictions):
        if word_id is None:
            # Skipping special tokens like [CLS], [SEP], etc.
            continue
        
        if word_id != current_word_id:
            # Encountering a new word, decide the label for the previous word
            if current_word_predictions:
                # You can implement different strategies here
                # For simplicity, taking the first prediction for the word
                reversed_predictions.append(current_word_predictions[0])
            current_word_predictions = [prediction]
            current_word_id = word_id
        else:
            # Accumulating predictions for subtokens of the same word
            current_word_predictions.append(prediction)
    
    # Don't forget to add the prediction for the last word
    if current_word_predictions:
        reversed_predictions.append(current_word_predictions[0])

    return [original_tokens, reversed_predictions]

In [27]:
reverse_map_predictions(wikineural_train_dataset[0]["tokens"], tokenized, model_predictions[0].tolist())

[['This',
  'division',
  'also',
  'contains',
  'the',
  'Ventana',
  'Wilderness',
  ',',
  'home',
  'to',
  'the',
  'California',
  'condor',
  '.'],
 [0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]]

In [28]:
wikineural_train_dataset[0]['ner_tags']

[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]

# Initialize the HuggingFaceAnnotator

In [10]:
wikineural_annotator = HuggingFaceAnnotator("wikineural_annotator", wikineural_model, wikineural_tokenizer, wikineural_tags_to_int,wikineural_train_dataset,wikineural_val_dataset, wikineural_test_dataset  )


In [11]:
roberta_annotator = HuggingFaceAnnotator("roberta_annotator", roberta_model, roberta_tokenizer, wikineural_tags_to_int, wikineural_train_dataset, wikineural_val_dataset, wikineural_test_dataset)

In [31]:
wikineural_predictions = predictions = wikineural_annotator.predict_tag()

tokenizing dataset
Defining a trainer object
Evaluate Using Model


100%|██████████| 375/375 [00:05<00:00, 66.32it/s] 


{'eval_loss': 0.009427888318896294, 'eval_precision': 0.9722115753162132, 'eval_recall': 0.9755769230769231, 'eval_f1': 0.9738913419082359, 'eval_accuracy': 0.9967609909470374, 'eval_runtime': 5.6887, 'eval_samples_per_second': 527.359, 'eval_steps_per_second': 65.92}
Predict Using Model


100%|██████████| 375/375 [00:05<00:00, 68.03it/s] 


In [32]:
roberta_predictions = roberta_annotator.predict_tag()

tokenizing dataset
Defining a trainer object
Evaluate Using Model


100%|██████████| 375/375 [00:06<00:00, 61.39it/s] 


{'eval_loss': 0.07395368814468384, 'eval_precision': 0.8651873951146746, 'eval_recall': 0.8923076923076924, 'eval_f1': 0.8785382940452523, 'eval_accuracy': 0.9829105003523608, 'eval_runtime': 6.127, 'eval_samples_per_second': 489.638, 'eval_steps_per_second': 61.205}
Predict Using Model


100%|██████████| 375/375 [00:06<00:00, 62.19it/s] 


# Aggregate Annotators. 

In [15]:
import spacy
from spacy.tokens import Span
from skweak import utils, aggregation

# Assume `dataset` is your dataset, and `annotators` is a list of your HuggingFaceAnnotator instances
nlp = spacy.blank("xx")  # or load an existing SpaCy model

# Function to apply HuggingFaceAnnotator predictions as skweak annotations
def apply_annotations(tokens, annotator):
    
   
    predictions = annotator.predict_single_example(tokens)
    #map the predictions back to the original space
    spaces = [True if token not in [',', '.'] else False for token in tokens[:-1]] + [False]
    tokenized = wikineural_tokenizer(tokens, truncation=True, is_split_into_words=True)
    mapped_to_original = annotator.reverse_map_predictions(tokens, tokenized,predictions[0].tolist())
    #convert example which is a list of strings into a doc object so that our skweak aggregator understands
    doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces=spaces)
   
   
    
    # Initialize the span group for this annotator if it doesn't exist
    if annotator.name not in doc.spans:
        doc.spans[annotator.name] = []
    for token, label_id in zip(doc, mapped_to_original[1]):
        label = annotator.id2label[label_id]
        # Create a skweak annotation for each token. Adjust the span as needed.
        span = Span(doc, token.i, token.i + 1, label=label)
        doc.spans[annotator.name].append(span)
    return doc


In [105]:
wikineural_train_dataset[0]['tokens']

['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.']

In [106]:
tokenized = wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True)

In [108]:
wikineural_model= wikineural_model.to(device)
model_prediction = wikineural_model(wikineural_input_token)

In [109]:
model_logits = model_prediction.logits 
model_predictions = torch.argmax(model_logits, dim=-1)

In [110]:
model_predictions

tensor([[0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0]],
       device='cuda:0')

In [121]:
#Runs model on a single example
def predict_single_example( example):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #tokenize input tokens
    tokenized = wikineural_tokenizer(example, truncation=True, is_split_into_words=True)
    
  
    input_token = torch.tensor([tokenized['input_ids']]).to(device)
    model = wikineural_model.to(device)
    #make prediction 
    
    predictions = model(input_token)
    model_logits = predictions.logits 
    model_predictions = torch.argmax(model_logits, dim=-1)
    return model_predictions
    

In [122]:
predict_single_example(['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.'])

tensor([[0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0]],
       device='cuda:0')

In [22]:
doc = apply_annotations(['This','division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.'], wikineural_annotator)

# Annotator's name - replace 'wikineural_annotator' with the actual name attribute of your annotator if different
annotator_name = wikineural_annotator.name

# Check if the annotator's name exists in doc.spans to avoid KeyError
if annotator_name in doc.spans:
    # Iterate through the spans in the specific annotator's span group
    for span in doc.spans[annotator_name]:
        # Print the text of each span and its label
        print(f"Text: {span.text}, Label: {span.label_}")
else:
    print(f"No annotations found for {annotator_name}.")

Text: This, Label: O
Text: division, Label: O
Text: also, Label: O
Text: contains, Label: O
Text: the, Label: O
Text: Ventana, Label: B-LOC
Text: Wilderness, Label: I-LOC
Text: ,, Label: O
Text: home, Label: O
Text: to, Label: O
Text: the, Label: O
Text: California, Label: B-LOC
Text: condor, Label: O
Text: ., Label: O


In [25]:

annotators = [ roberta_annotator, wikineural_annotator]
for annotator in annotators:
    docs = [apply_annotations(sentence, annotator) for sentence in wikineural_train_dataset['tokens']]

# Now, use skweak's aggregators to combine the annotations
# For example, using the HMM aggregator
aggregator = aggregation.HMM("aggregated", list_of_label_spaces=[annotator.name for annotator in annotators])
aggregated_docs = aggregator.fit_transform(docs)

# `aggregated_docs` now contains the aggregated labels for each document

TypeError: HMM() got an unexpected keyword argument 'list_of_label_spaces'

In [38]:
import torch

# Assuming you want to check memory for the first GPU
gpu_index = 0

# Get total GPU memory
total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
# Convert bytes to GB for easier interpretation
total_memory_gb = total_memory / (1024 ** 3)

# Get current GPU memory allocated
current_memory_allocated = torch.cuda.memory_allocated(gpu_index)
# Convert bytes to GB
current_memory_allocated_gb = current_memory_allocated / (1024 ** 3)

# Get current GPU memory reserved by PyTorch's memory allocator
current_memory_reserved = torch.cuda.memory_reserved(gpu_index)
# Convert bytes to GB
current_memory_reserved_gb = current_memory_reserved / (1024 ** 3)

print(f"Total GPU Memory (GB): {total_memory_gb:.2f}")
print(f"Current Memory Allocated (GB): {current_memory_allocated_gb:.2f}")
print(f"Current Memory Reserved (GB): {current_memory_reserved_gb:.2f}")

Total GPU Memory (GB): 9.77
Current Memory Allocated (GB): 1.14
Current Memory Reserved (GB): 1.49
