In [1]:
import spacy, re
import numpy as np
import torch
import json
import pandas as pd 
from skweak import heuristics, gazetteers, generative, utils
from skweak import aggregation, utils
from transformers import pipeline, AutoModelForTokenClassification, TrainingArguments, Trainer
from skweak.base import SpanAnnotator
from pandas import read_parquet
from transformers import AutoModelForTokenClassification,BertTokenizerFast, AutoTokenizer
from datasets import load_dataset, load_metric, concatenate_datasets,Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline




  from .autonotebook import tqdm as notebook_tqdm


In [4]:

class HuggingFaceAnnotator(SpanAnnotator):
    def __init__(self, name, model, tokenizer, label2id, train_dataset, val_dataset, test_dataset):
        super().__init__(name)
        self.model = model
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = {id: label for label, id in label2id.items()}
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.name = name
        
    def tokenize_and_align_labels(self, examples):
        label_all_tokens = False
        tokenized_inputs = self.tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

        labels = []
        #for wikineural this would be called ner_tags but for wikiann it is tags
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs
    def reverse_map_predictions(self, original_tokens, tokenized_inputs, predictions):
        # Assuming predictions are aligned with the tokenized input (subtokens)
        word_ids = tokenized_inputs.word_ids()  # Get word IDs for all tokens in the batch
        reversed_predictions = []
        current_word_id = None
        current_word_predictions = []
        
        for word_id, prediction in zip(word_ids, predictions):
            if word_id is None:
                # Skipping special tokens like [CLS], [SEP], etc.
                continue
            
            if word_id != current_word_id:
                # Encountering a new word, decide the label for the previous word
                if current_word_predictions:
                    # You can implement different strategies here
                    # For simplicity, taking the first prediction for the word
                    reversed_predictions.append(current_word_predictions[0])
                current_word_predictions = [prediction]
                current_word_id = word_id
            else:
                # Accumulating predictions for subtokens of the same word
                current_word_predictions.append(prediction)
        
        # Don't forget to add the prediction for the last word
        if current_word_predictions:
            reversed_predictions.append(current_word_predictions[0])

        return [original_tokens, reversed_predictions]
    
    def compute_metrics(self, p):
        metric = load_metric("seqeval")
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [list(self.label2id.keys())[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [list(self.label2id.keys())[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    
    #Runs model on a single example
    def predict_single_example(self, example):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        #tokenize input tokens
        tokenized = self.tokenizer(example, truncation=True, is_split_into_words=True)
        
    
        input_token = torch.tensor([tokenized['input_ids']]).to(device)
        model = self.model.to(device)
        #make prediction 
        
        predictions = model(input_token)
        model_logits = predictions.logits 
        model_predictions = torch.argmax(model_logits, dim=-1)
        return model_predictions
    
    #Evaluates model and runs prediction on the whole test dataset 
    def predict_tag(self):
        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        #Tokenize the inputs to get the word ids. 
        print("tokenizing dataset")
        tokenized_traindataset = self.train_dataset.map(self.tokenize_and_align_labels, batched =True)
        tokenized_testdataset = self.test_dataset.map(self.tokenize_and_align_labels, batched=True)
        tokenized_evaldataset = self.val_dataset.map(self.tokenize_and_align_labels, batched=True)
        #Prepare model for pytorch dataloader 
        #tokenized_dataset.set_format(type='torch', columns=['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])
        self.model.eval()
        print("Defining a trainer object")
        args = TrainingArguments(
            "test",
            evaluation_strategy = "steps",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            weight_decay=0.01,
            push_to_hub=True,
            eval_steps=20000,
            save_steps=20000,
        )
        trainer = Trainer(
            self.model,
            args,
            train_dataset =tokenized_traindataset, 
            eval_dataset = tokenized_evaldataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics
        )
        
        print("Evaluate Using Model")
        print(trainer.evaluate())
        print("Predict Using Model")
        predictions, labels, metrics = trainer.predict(tokenized_testdataset)
        print("Test Metrics: ", metrics)
        predictions = np.argmax(predictions, axis=2)
        return predictions, labels, metrics

# Load Models

In [3]:
wikineural_tags_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
wikineural_tags_to_int = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
wikineural_int_to_tags = {v: k for k,v in wikineural_tags_to_int.items()}

In [4]:

model_name = "Babelscape/wikineural-multilingual-ner"

wikineural_tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",truncation = True,  is_split_into_words=True)
wikineural_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(wikineural_tags_list), label2id = wikineural_tags_to_int, id2label = wikineural_int_to_tags)


In [5]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

model_name = "DunnBC22/roberta-base-finetuned-WikiNeural"

roberta_tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",truncation = True,  is_split_into_words=True)
roberta_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(wikineural_tags_list), label2id = wikineural_tags_to_int, id2label = wikineural_int_to_tags)

# Load Dataset

In [6]:

wikineural_datasets = load_dataset("Babelscape/wikineural")
wikineural_train_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

wikineural_val_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

wikineural_test_dataset = concatenate_datasets([wikineural_datasets["train_en"].select(range(1000)), 
                                      wikineural_datasets["train_es"].select(range(1000)), 
                                      wikineural_datasets["train_fr"].select(range(1000))])

# Temp

In [9]:
wikineural_train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 3000
})

In [10]:
tokenized = wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True)

In [11]:
tokenized['input_ids']

[101,
 10747,
 13751,
 10379,
 18286,
 10105,
 19561,
 105868,
 83302,
 117,
 11816,
 10114,
 10105,
 11621,
 10173,
 11849,
 119,
 102]

In [12]:
wikineural_tokenizer.decode(tokenized['input_ids'])

'[CLS] This division also contains the Ventana Wilderness, home to the California condor. [SEP]'

We see that we are able to get the word mapping of the tokenizer. This allows us to map the further tokenized words back to the original word tokens. Hence we use this to get the model's predictions in the token and tag dimensions of the original dataset. 

In [13]:
wikineural_tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

['[CLS]',
 'This',
 'division',
 'also',
 'contains',
 'the',
 'Ve',
 '##ntana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'con',
 '##dor',
 '.',
 '[SEP]']

In [14]:
wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True).word_ids()

[None, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, None]

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
wikineural_input_token = torch.tensor([tokenized['input_ids']]).to(device)
wikineural_input_token

tensor([[   101,  10747,  13751,  10379,  18286,  10105,  19561, 105868,  83302,
            117,  11816,  10114,  10105,  11621,  10173,  11849,    119,    102]],
       device='cuda:0')

In [22]:
wikineural_model= wikineural_model.to(device)
model_prediction = wikineural_model(wikineural_input_token)

In [23]:
model_logits = model_prediction.logits 
model_predictions = torch.argmax(model_logits, dim=-1)

In [24]:
model_predictions[0]

tensor([0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0], device='cuda:0')

original tokens

In [25]:
wikineural_train_dataset[0]["tokens"]

['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.']

In [76]:
def reverse_map_predictions(original_tokens, tokenized_inputs, predictions):
    # Assuming predictions are aligned with the tokenized input (subtokens)
    word_ids = tokenized_inputs.word_ids()  # Get word IDs for all tokens in the batch
    reversed_predictions = []
    current_word_id = None
    current_word_predictions = []
    
    for word_id, prediction in zip(word_ids, predictions):
        if word_id is None:
            # Skipping special tokens like [CLS], [SEP], etc.
            continue
        
        if word_id != current_word_id:
            # Encountering a new word, decide the label for the previous word
            if current_word_predictions:
                # You can implement different strategies here
                # For simplicity, taking the first prediction for the word
                reversed_predictions.append(current_word_predictions[0])
            current_word_predictions = [prediction]
            current_word_id = word_id
        else:
            # Accumulating predictions for subtokens of the same word
            current_word_predictions.append(prediction)
    
    # Don't forget to add the prediction for the last word
    if current_word_predictions:
        reversed_predictions.append(current_word_predictions[0])

    return [original_tokens, reversed_predictions]

In [27]:
reverse_map_predictions(wikineural_train_dataset[0]["tokens"], tokenized, model_predictions[0].tolist())

[['This',
  'division',
  'also',
  'contains',
  'the',
  'Ventana',
  'Wilderness',
  ',',
  'home',
  'to',
  'the',
  'California',
  'condor',
  '.'],
 [0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]]

In [28]:
wikineural_train_dataset[0]['ner_tags']

[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]

# Initialize the HuggingFaceAnnotator

In [7]:
wikineural_tags_list2 = ['O', 'BPER', 'IPER', 'BORG', 'IORG', 'BLOC', 'ILOC', 'BMISC', 'IMISC']
wikineural_tags_to_int2 = {'O': 0, 'BPER': 1, 'IPER': 2, 'BORG': 3, 'IORG': 4, 'BLOC': 5, 'ILOC': 6, 'BMISC': 7, 'IMISC': 8}
wikineural_int_to_tags2 = {v: k for k,v in wikineural_tags_to_int.items()}

In [8]:
wikineural_annotator = HuggingFaceAnnotator("wikineural_annotator", wikineural_model, wikineural_tokenizer, wikineural_tags_to_int2,wikineural_train_dataset,wikineural_val_dataset, wikineural_test_dataset  )


In [9]:
roberta_annotator = HuggingFaceAnnotator("roberta_annotator", roberta_model, roberta_tokenizer, wikineural_tags_to_int2, wikineural_train_dataset, wikineural_val_dataset, wikineural_test_dataset)

In [10]:
wikineural_predictions = predictions = wikineural_annotator.predict_tag()

tokenizing dataset


Map: 100%|██████████| 3000/3000 [00:00<00:00, 13933.15 examples/s]


Defining a trainer object


KeyboardInterrupt: 

In [10]:
roberta_predictions = roberta_annotator.predict_tag()

tokenizing dataset


Map: 100%|██████████| 3000/3000 [00:00<00:00, 14104.73 examples/s]


Defining a trainer object
Evaluate Using Model


100%|██████████| 375/375 [00:06<00:00, 60.20it/s] 


{'eval_loss': 0.07395368814468384, 'eval_precision': 0.8651873951146746, 'eval_recall': 0.8923076923076924, 'eval_f1': 0.8785382940452523, 'eval_accuracy': 0.9829105003523608, 'eval_runtime': 6.255, 'eval_samples_per_second': 479.614, 'eval_steps_per_second': 59.952}
Predict Using Model


100%|██████████| 375/375 [00:06<00:00, 55.70it/s] 


# Aggregate Annotators. 

In [11]:
import spacy
from spacy.tokens import Span
from skweak import utils, aggregation

# Assume `dataset` is your dataset, and `annotators` is a list of your HuggingFaceAnnotator instances
nlp = spacy.blank("xx")  # or load an existing SpaCy model

# Function to apply HuggingFaceAnnotator predictions as skweak annotations
def apply_annotations(tokens, annotator):
    
   
    predictions = annotator.predict_single_example(tokens)
    #map the predictions back to the original space
    spaces = [True if token not in [',', '.'] else False for token in tokens[:-1]] + [False]
    tokenized = wikineural_tokenizer(tokens, truncation=True, is_split_into_words=True)
    mapped_to_original = annotator.reverse_map_predictions(tokens, tokenized,predictions[0].tolist())
    #convert example which is a list of strings into a doc object so that our skweak aggregator understands
    doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces=spaces)
   
   
    
    # Initialize the span group for this annotator if it doesn't exist
    if annotator.name not in doc.spans:
        doc.spans[annotator.name] = []
    for token, label_id in zip(doc, mapped_to_original[1]):
        label = annotator.id2label[label_id]
        # Create a skweak annotation for each token. Adjust the span as needed.
        span = Span(doc, token.i, token.i + 1, label=label)
        doc.spans[annotator.name].append(span)
    return doc


In [12]:
wikineural_train_dataset[0]['tokens']

['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.']

In [13]:
tokenized = wikineural_tokenizer(wikineural_train_dataset[0]["tokens"], truncation=True, is_split_into_words=True)

In [108]:
wikineural_model= wikineural_model.to(device)
model_prediction = wikineural_model(wikineural_input_token)

In [109]:
model_logits = model_prediction.logits 
model_predictions = torch.argmax(model_logits, dim=-1)

In [110]:
model_predictions

tensor([[0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0]],
       device='cuda:0')

In [14]:
#Runs model on a single example
def predict_single_example( example):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #tokenize input tokens
    tokenized = wikineural_tokenizer(example, truncation=True, is_split_into_words=True)
    
  
    input_token = torch.tensor([tokenized['input_ids']]).to(device)
    model = wikineural_model.to(device)
    #make prediction 
    
    predictions = model(input_token)
    model_logits = predictions.logits 
    model_predictions = torch.argmax(model_logits, dim=-1)
    return model_predictions
    

In [122]:
predict_single_example(['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.'])

tensor([[0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0]],
       device='cuda:0')

In [22]:
doc = apply_annotations(['This','division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.'], wikineural_annotator)

# Annotator's name - replace 'wikineural_annotator' with the actual name attribute of your annotator if different
annotator_name = wikineural_annotator.name

# Check if the annotator's name exists in doc.spans to avoid KeyError
if annotator_name in doc.spans:
    # Iterate through the spans in the specific annotator's span group
    for span in doc.spans[annotator_name]:
        # Print the text of each span and its label
        print(f"Text: {span.text}, Label: {span.label_}")
else:
    print(f"No annotations found for {annotator_name}.")

Text: This, Label: O
Text: division, Label: O
Text: also, Label: O
Text: contains, Label: O
Text: the, Label: O
Text: Ventana, Label: B-LOC
Text: Wilderness, Label: I-LOC
Text: ,, Label: O
Text: home, Label: O
Text: to, Label: O
Text: the, Label: O
Text: California, Label: B-LOC
Text: condor, Label: O
Text: ., Label: O


In [15]:

annotators = [ roberta_annotator, wikineural_annotator]
for annotator in annotators:
    docs = [apply_annotations(sentence, annotator) for sentence in wikineural_train_dataset['tokens']]


In [16]:
wikineural_tags_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Following this tutorial https://github.com/NorskRegnesentral/skweak/wiki/Step-2:-Aggregation#basic-setup

In [17]:
wikineural_tags_list2 = ['O', 'BPER', 'IPER', 'BORG', 'IORG', 'BLOC', 'ILOC', 'BMISC', 'IMISC']

In [26]:
tags_map_to_int_non_prefixed = {'O': 0, 'BPER': 1, 'IPER': 2, 'BORG': 3, 'IORG': 4, 'BLOC': 5, 'ILOC': 6, 'BMISC': 7, 'IMISC': 8}

In [18]:

# Now, use skweak's aggregators to combine the annotations
# For example, using the HMM aggregator
aggregator = aggregation.HMM("aggregated", out_labels = wikineural_tags_list2)
# aggregator.add_label_group("PER", ["I-PER", "B-PER"])
# aggregator.add_label_group("ORG", ["I-ORG", "B-ORG"])
# aggregator.add_label_group("LOC", ["B-LOC", "I-LOC"])
# aggregator.add_label_group("MISC", ["B-MISC","I-MISC"] )
# aggregator = aggregation.HMM("aggregated", out_labels = wikineural_tags_list)
aggregated_docs = aggregator.fit(docs)

# `aggregated_docs` now contains the aggregated labels for each document

Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 2


         1  -51447.50647039             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 3


         2  -30208.93105272  +21238.57541767


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents


         3  -30208.93105272      -0.00000000


In [19]:
docs[0]

This division also contains the Ventana Wilderness ,home to the California condor .

In [27]:
docs[1]

" So here is the balance NBC has to consider : The Who ,' Animal Practice ' .

In [28]:
docs[2]

It is a protest song that " creates a cinematic vista that tells of the singer 's search for a literal and physical America that seems to have disappeared ,along with the country 's beauty and ideals " .

In [20]:
docs = list(aggregator.pipe(docs))
utils.display_entities(docs[0], "aggregated")
utils.display_entities(docs[1], "aggregated")

In [18]:
def convert_doc_to_lists(doc_object): 
    # Initialize empty lists to store your sentence samples and their tags
    sentence_samples = []
    sentence_tags = []

    for doc in doc_object:
        # Initialize temporary lists for the current sentence sample
        tokens = []
        tags = []
        
        # Iterate over each token in the Doc
        for token in doc:
            # Append the token text to the tokens list
            tokens.append(token.text)
            
            # Check if there's an aggregated tag for this token
            # Assuming the aggregated tags are stored in a SpanGroup named "aggregated"
            if "aggregated" in doc.spans and doc.spans["aggregated"]:
                # Find any span that includes this token
                span = next((span for span in doc.spans["aggregated"] if span.start <= token.i < span.end), None)
                if span:
                    tags.append(span.label_)  # Use the label of the span
                else:
                    tags.append('O')  # Default to 'O' if no span includes this token
            else:
                # If there are no aggregated spans, default to 'O'
                tags.append('O')
        
        # Append the tokens and tags for this sentence to your main lists
        sentence_samples.append(tokens)
        sentence_tags.append(tags)
    return sentence_samples, sentence_tags

# Now, `sentence_samples` and `sentence_tags` contain your data in the desired format


In [22]:
sentence_samples, sentence_tags = convert_doc_to_lists(docs)

In [23]:
sentence_samples[0]

['This',
 'division',
 'also',
 'contains',
 'the',
 'Ventana',
 'Wilderness',
 ',',
 'home',
 'to',
 'the',
 'California',
 'condor',
 '.']

In [28]:
[tags_map_to_int_non_prefixed[tag] for tag in sentence_tags[0]]

[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]

In [25]:
wikineural_train_dataset[0]["ner_tags"]

[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]

In [38]:
import torch

# Assuming you want to check memory for the first GPU
gpu_index = 0

# Get total GPU memory
total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
# Convert bytes to GB for easier interpretation
total_memory_gb = total_memory / (1024 ** 3)

# Get current GPU memory allocated
current_memory_allocated = torch.cuda.memory_allocated(gpu_index)
# Convert bytes to GB
current_memory_allocated_gb = current_memory_allocated / (1024 ** 3)

# Get current GPU memory reserved by PyTorch's memory allocator
current_memory_reserved = torch.cuda.memory_reserved(gpu_index)
# Convert bytes to GB
current_memory_reserved_gb = current_memory_reserved / (1024 ** 3)

print(f"Total GPU Memory (GB): {total_memory_gb:.2f}")
print(f"Current Memory Allocated (GB): {current_memory_allocated_gb:.2f}")
print(f"Current Memory Reserved (GB): {current_memory_reserved_gb:.2f}")

Total GPU Memory (GB): 9.77
Current Memory Allocated (GB): 1.14
Current Memory Reserved (GB): 1.49


# Test Wikineural on our dataset

In [41]:
data_train = read_parquet(r"/home/javin/Coding/CSCI544/FinalProject/data/merge/train.parquet")
data_dev = read_parquet(r"/home/javin/Coding/CSCI544/FinalProject/data/merge/dev.parquet")
data_test = read_parquet(r"/home/javin/Coding/CSCI544/FinalProject/data/merge/test.parquet")

In [43]:
train_ds = Dataset.from_pandas(data_train)
validation_ds = Dataset.from_pandas(data_dev)
test_ds = Dataset.from_pandas(data_test)

ds = DatasetDict()

ds['train'] = train_ds
ds['validation'] = validation_ds
ds['test'] = test_ds

In [4]:
with open(r"/home/javin/Coding/CSCI544/FinalProject/data/merge/tags_2_idx.json","r") as file:
    tags_2_idx = json.load(file)
file.close()
idx_2_tags = {tags_2_idx[tag]:tag for tag in tags_2_idx.keys()}

In [8]:
model_name = "Babelscape/wikineural-multilingual-ner"

wikineural_tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",truncation = True,  is_split_into_words=True)
wikineural_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(idx_2_tags), label2id = tags_2_idx, id2label = idx_2_tags)

In [9]:
wikineural_annotator = HuggingFaceAnnotator("wikineural_annotator", wikineural_model, wikineural_tokenizer, tags_2_idx,train_ds,validation_ds, test_ds )

In [32]:
wikineural_predictions, labels, metrics = wikineural_annotator.predict_tag()

tokenizing dataset


Map: 100%|██████████| 120200/120200 [00:06<00:00, 19743.66 examples/s]
Map: 100%|██████████| 40100/40100 [00:01<00:00, 21412.85 examples/s]
Map: 100%|██████████| 40100/40100 [00:02<00:00, 19025.39 examples/s]


Defining a trainer object
Evaluate Using Model


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5013/5013 [01:13<00:00, 67.87it/s]


{'eval_loss': 1.7594166994094849, 'eval_precision': 0.5581081331039713, 'eval_recall': 0.581267217630854, 'eval_f1': 0.5694523081568715, 'eval_accuracy': 0.7615307808289491, 'eval_runtime': 73.9099, 'eval_samples_per_second': 542.552, 'eval_steps_per_second': 67.826}
Predict Using Model


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5013/5013 [01:14<00:00, 67.56it/s]


Test Metrics:  {'test_loss': 1.7693740129470825, 'test_precision': 0.530257119061542, 'test_recall': 0.5575450429468428, 'test_f1': 0.5435588171183608, 'test_accuracy': 0.76395880283915, 'test_runtime': 74.2271, 'test_samples_per_second': 540.234, 'test_steps_per_second': 67.536}


# Aggregate BiLSTM models. Since the Skweak Aggregation HMM requires doc objects, we need to convert the tokens in our prediction csvs into docs. 



In [6]:
import spacy
from tqdm import tqdm
from spacy.tokens import Span
from skweak import utils, aggregation
nlp = spacy.blank("xx")
def apply_annotations_from_csv(docs, base_filepath,model_names):
    dataframe0 = pd.read_csv(base_filepath+ model_names[0]+'.csv')
    dataframe1 = pd.read_csv(base_filepath+model_names[1]+'.csv')
    dataframe2 = pd.read_csv(base_filepath+model_names[2]+'.csv')
    dataframe3 = pd.read_csv(base_filepath+model_names[3]+'.csv')
    prediction0 = dataframe0['predictions'].tolist()
    prediction1 = dataframe1['predictions'].tolist()
    prediction2 = dataframe2['predictions'].tolist()
    prediction3 = dataframe3['predictions'].tolist()
    new_docs = []
    
    
    for idx, doc in tqdm(enumerate(docs), total=len(docs), desc="Processing Docs"):
        for model in model_names: 
            if model not in doc.spans:
                doc.spans[model] = []
        # Assuming the tokens are a string representation of a list       
        for token, label in zip(doc, eval(prediction0[idx])):
            
            label = label.replace("-", "")            
            # Create a skweak annotation for each token. Adjust the span as needed.
            span = Span(doc, token.i, token.i + 1, label=label)
            doc.spans[model_names[0]].append(span)
        for token, label in zip(doc, eval(prediction1[idx])):
            label = label.replace("-", "")
            # Create a skweak annotation for each token. Adjust the span as needed.
            span = Span(doc, token.i, token.i + 1, label=label)
            doc.spans[model_names[1]].append(span)
        for token, label in zip(doc, eval(prediction2[idx])): 
            label = label.replace("-", "")
            # Create a skweak annotation for each token. Adjust the span as needed.
            span = Span(doc, token.i, token.i + 1, label=label)
            doc.spans[model_names[2]].append(span)
        for token, label in zip(doc, eval(prediction3[idx])): 
            label = label.replace("-", "")
            # Create a skweak annotation for each token. Adjust the span as needed.
            span = Span(doc, token.i, token.i + 1, label=label)
            doc.spans[model_names[3]].append(span)
            
            
        new_docs.append(doc)
       
   
    return new_docs


In [7]:
def tokens_to_docs(tokens_list): 
    docs = []
    for sentence_tokens in tokens_list: 
        spaces = [True if token not in [',', '.'] else False for token in sentence_tokens[:-1]] + [False]
        doc = spacy.tokens.Doc(nlp.vocab, words = sentence_tokens, spaces=spaces)
        docs.append(doc)
    return docs

In [8]:
docs = tokens_to_docs(validation_ds['tokens'])

In [9]:
# Example usage
base_filepath  = "/home/javin/Coding/CSCI544/FinalProject/model_prediction_files/" 
dev_models = ["BiLSTM_CNN_dev", "BiLSTM_CNN_Attention_dev", "BiLSTM_CRF_dev", "BiLSTM_dev"]
annotated_docs = apply_annotations_from_csv(docs, base_filepath, dev_models)


Processing Docs: 100%|██████████| 40100/40100 [00:05<00:00, 7213.89it/s] 


In [10]:
idx_2_tags

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

Since Skweak hmm aggregation doesn't support prefix ner tags, we remove the - symbol from the tags 

In [11]:
updated_idx_2_tags = {k: v.replace("-", "") for k, v in idx_2_tags.items()}

In [12]:
tags_2_updated_idx = {v: k for k, v in updated_idx_2_tags.items()}

In [13]:
tags_2_updated_idx

{'O': 0,
 'BPER': 1,
 'IPER': 2,
 'BORG': 3,
 'IORG': 4,
 'BLOC': 5,
 'ILOC': 6,
 'BMISC': 7,
 'IMISC': 8}

In [14]:
from skweak import aggregation, utils

# Assuming docs is a list of SpaCy Docs with entities from different models
aggregator = aggregation.HMM("aggregated", list(updated_idx_2_tags.values()))
aggregated_docs = aggregator.fit_and_aggregate(annotated_docs)


Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed docume

         1 -694389.30140799             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         2 -668828.43638017  +25560.86502782


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         3 -668433.37349517    +395.06288499


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         4 -668390.02034052     +43.35315465


In [15]:
# utils.display_entities(aggregated_docs[0], "aggregated")
utils.display_entities(aggregated_docs[1], "aggregated")

In [19]:
sentence_samples, sentence_tags = convert_doc_to_lists(aggregated_docs)

In [20]:
sentence_samples[1]

['佐', '賀', '県']

In [21]:
sentence_tags[1]

['BLOC', 'ILOC', 'ILOC']

In [22]:
[tags_2_updated_idx[tag] for tag in sentence_tags[1]]

[5, 6, 6]

In [23]:
data = {"tokens":sentence_samples, "predictions": sentence_tags}
        
# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("skweak_hmm_output.csv", index=True)

In [24]:
sentence_tags_int = [[tags_2_updated_idx[tag] for tag in tag_list] for tag_list in sentence_tags]

In [25]:
metric = load_metric("seqeval")

In [26]:
true_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in validation_ds['ner_tags']]

In [27]:
predictions_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in sentence_tags_int]

In [28]:
predictions_labels_tags[1]

['B-LOC', 'I-LOC', 'I-LOC']

In [29]:
true_labels_tags[1]

['B-LOC', 'I-LOC', 'I-LOC']

In [30]:
results = metric.compute(predictions=predictions_labels_tags, references=true_labels_tags)

In [31]:
results

{'LOC': {'precision': 0.8559121710353522,
  'recall': 0.8453875687454602,
  'f1': 0.850617316175511,
  'number': 19274},
 'ORG': {'precision': 0.7752102148863282,
  'recall': 0.7738605981471118,
  'f1': 0.7745348185948098,
  'number': 16083},
 'PER': {'precision': 0.8799659305226014,
  'recall': 0.8736937481123528,
  'f1': 0.8768186226964112,
  'number': 16555},
 'overall_precision': 0.8384404898212657,
 'overall_recall': 0.8322545846817692,
 'overall_f1': 0.8353360853046665,
 'overall_accuracy': 0.9147208304725455}

# Test Aggregation on testing set

In [50]:
docs = tokens_to_docs(test_ds['tokens'])

In [51]:

base_filepath  = "/home/javin/Coding/CSCI544/FinalProject/model_prediction_files/" 
models = ["BiLSTM_CNN_test", "BiLSTM_CNN_Attention_test", "BiLSTM_CRF_test", "BiLSTM_test"]
annotated_docs = apply_annotations_from_csv(docs, base_filepath, models)
from skweak import aggregation, utils

# Assuming docs is a list of SpaCy Docs with entities from different models
aggregator = aggregation.HMM("aggregated", list(updated_idx_2_tags.values()))
aggregated_docs = aggregator.fit_and_aggregate(annotated_docs)

Processing Docs: 100%|██████████| 40100/40100 [00:05<00:00, 7061.86it/s] 


Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed docume

         1 -717809.12528166             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         2 -692187.86600776  +25621.25927390


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         3 -691745.69111780    +442.17488996


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         4 -691696.30944032     +49.38167748


In [52]:
sentence_samples, sentence_tags = convert_doc_to_lists(aggregated_docs)

In [53]:
sentence_tags_int = [[tags_2_updated_idx[tag] for tag in tag_list] for tag_list in sentence_tags]
predictions_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in sentence_tags_int]
true_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in test_ds['ner_tags']]
results = metric.compute(predictions=predictions_labels_tags, references=true_labels_tags)
results

{'LOC': {'precision': 0.8421161183871301,
  'recall': 0.8345426673479817,
  'f1': 0.8383122882660917,
  'number': 19570},
 'ORG': {'precision': 0.7526363362458572,
  'recall': 0.7389658028635664,
  'f1': 0.7457384243365078,
  'number': 16902},
 'PER': {'precision': 0.8708760943932278,
  'recall': 0.8730527784236224,
  'f1': 0.8719630779948332,
  'number': 17204},
 'overall_precision': 0.823540461341949,
 'overall_recall': 0.8167896266487816,
 'overall_f1': 0.8201511523495959,
 'overall_accuracy': 0.914015516687325}

# Test Aggregation on training set 

In [44]:
docs = tokens_to_docs(train_ds['tokens'])

In [45]:
len(docs)

80100

In [46]:
# Example usage
base_filepath  = "/home/javin/Coding/CSCI544/FinalProject/model_prediction_files/" 
models = ["BiLSTM_CNN_train", "BiLSTM_CNN_Attention_train", "BiLSTM_CRF_train", "BiLSTM_train"]
annotated_docs = apply_annotations_from_csv(docs, base_filepath, models)
from skweak import aggregation, utils

# Assuming docs is a list of SpaCy Docs with entities from different models
aggregator = aggregation.HMM("aggregated", list(updated_idx_2_tags.values()))
aggregated_docs = aggregator.fit_and_aggregate(annotated_docs)

Processing Docs: 100%|██████████| 80100/80100 [00:12<00:00, 6512.84it/s] 


Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed docume

         1 -1374591.24942578             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         2 -1325699.35905616  +48891.89036962


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         3 -1324982.96140443    +716.39765173


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Number of processed documents: 5000
Number of processed documents: 6000
Number of processed documents: 7000
Number of processed documents: 8000
Number of processed documents: 9000
Number of processed documents: 10000
Number of processed documents: 11000
Number of processed documents: 12000
Number of processed documents: 13000
Number of processed documents: 14000
Number of processed documents: 15000
Number of processed documents: 16000
Number of processed documents: 17000
Number of processed documents: 18000
Number of processed documents: 19000
Number of processed documents: 20000
Number of processed documents: 21000
Number of processed documents: 22000
Number of processed documents: 23000
Number of processed documents: 24000
Number of processed documents: 25000
Number of processed documents: 26000
Number of processed documents: 27000
Number of 

         4 -1324907.00847859     +75.95292583


In [47]:
sentence_samples, sentence_tags = convert_doc_to_lists(aggregated_docs)

In [49]:
sentence_tags_int = [[tags_2_updated_idx[tag] for tag in tag_list] for tag_list in sentence_tags]
predictions_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in sentence_tags_int]
true_labels_tags = [[idx_2_tags[tag] for tag in tag_list] for tag_list in train_ds['ner_tags']]
results = metric.compute(predictions=predictions_labels_tags, references=true_labels_tags)
results

{'LOC': {'precision': 0.8738757473355862,
  'recall': 0.8735805420575319,
  'f1': 0.8737281197614128,
  'number': 38483},
 'ORG': {'precision': 0.8092714041865061,
  'recall': 0.7916569018045465,
  'f1': 0.8003672496260865,
  'number': 34136},
 'PER': {'precision': 0.8983786580808372,
  'recall': 0.8950850661625709,
  'f1': 0.8967288378766142,
  'number': 34914},
 'overall_precision': 0.8616395840561093,
 'overall_recall': 0.8545562757479099,
 'overall_f1': 0.8580833123231644,
 'overall_accuracy': 0.9319564201549997}