imports

In [1]:
import pandas as pd
import pyarrow


# Reading in data

silver data

In [2]:
# Load the silver test split and keep the NER tags of its first 200 rows.
silver_test = pd.read_parquet('test-00000-of-00001.parquet')
silver_first_200 = silver_test["ner_tags"].iloc[:200]
# Remove the row at position 68 by building a new Series instead of dropping in place
# on a slice of `silver_test` (in-place edits of a slice risk SettingWithCopyWarning
# and make the cell harder to reason about on re-run).
# NOTE(review): presumably row 68 has no counterpart in the gold annotations -- confirm.
silver_labels = silver_first_200.drop(silver_first_200.index[68])

silver_labels.head()

0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, ...
1           [0, 1, 2, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0]
2                                            [3, 4, 4]
3                       [0, 5, 6, 6, 0, 0, 0, 0, 0, 0]
4                                [1, 2, 2, 0, 3, 4, 0]
Name: ner_tags, dtype: object

gold data

In [3]:
# Load the gold annotations; the CSV uses ';' as its field separator.
gold_raw = pd.read_csv("gold_eng.csv", sep=";")
# Columns that are not needed downstream (presumably empty spill-over columns -- confirm).
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
# Drop those columns and the last TWO rows in a single non-mutating chain, so that
# re-running the cell always starts again from the freshly read file.
# NOTE(review): the two trailing rows are presumably not annotation data -- confirm.
gold_eng = gold_raw.drop(columns=columns_to_drop).iloc[:-2]
gold_eng


Unnamed: 0,Labels,Sentences
0,"[0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0]","[ ""Shortly"", ""afterward"", "","", ""an"", ""encourag..."
1,"[0,1,2,0,1,2,0,0,0,0,0,0,0,0]","[ "":"", ""Kanye"", ""West"", ""featuring"", ""Jamie"", ..."
2,"[3,4,4]","[ ""Blacktown"", ""railway"", ""station"" ]"
3,"[0,0,0,0,0,0,1,0,0,0]","[ ""''"", ""Mycalesis"", ""perseus"", ""lalassis"", ""'..."
4,"[1,2,2,0,0,0,0]","[ ""Jonny"", ""Lee"", ""Miller"", ""-"", ""Eli"", ""Stone..."
...,...,...
194,"[1,2,0,0,0]","[ ""Wesley"", ""Pruden"", ""("", ""2013"", "")"" ]"
195,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3...","[ ""Previews"", ""are"", ""scheduled"", ""to"", ""begin..."
196,"[5,0,5,0,5]","[ ""China"", "","", ""Indonesia"", "","", ""Vietnam"" ]"
197,"[3,4,4,4,4,4]","[ ""Emmett"", ""/"", ""Furla"", ""/"", ""Oasis"", ""Films"" ]"


Converting to a list of lists

In [4]:
# Pull the raw "Sentences" and "Labels" columns out of the gold frame as plain Python
# lists (their entries are parsed into real lists in the next cell).
gold_words, gold_labels = (
    gold_eng[column].values.tolist() for column in ("Sentences", "Labels")
)


In [5]:
# Each silver entry exposes .tolist(); turn every one into a plain Python list.
silver_labels_list = [tag_array.tolist() for tag_array in silver_labels]
# The gold columns hold list literals as text, so each cell is evaluated into a list.
# NOTE(review): eval() executes whatever expression the CSV cell contains; only run this
# on a trusted file, or switch to ast.literal_eval / json.loads.
gold_labels_list = [eval(cell_text) for cell_text in gold_labels]
gold_word_list = [eval(cell_text) for cell_text in gold_words]


# preprocessing the data

In [26]:
from transformers import AutoTokenizer, Trainer

# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_checkpoint = "bert-base-multilingual-cased"
# Tokenizer matching that checkpoint; passed to tokenize_and_align_labels later on.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [27]:
import pandas as pd
# Module-level switch consulted by tokenize_and_align_labels: when True, every sub-token
# of a word receives that word's label; when False, only the first sub-token does and
# the remaining ones are set to -100.
label_all_tokens = True

In [28]:
def tokenize_and_align_labels(sentences, tags, tokenizer, label_all_tokens=None):
    """Tokenize pre-split sentences and spread word-level tags over the sub-tokens.

    Args:
        sentences: Sequence of sentences, each a sequence of word strings.
        tags: Sequence of per-sentence label sequences; ``tags[i][j]`` is the label of
            word ``j`` in sentence ``i``.
        tokenizer: Callable invoked as ``tokenizer(sentences, truncation=True,
            is_split_into_words=True, padding=True)``. Its result must offer
            ``word_ids(batch_index=...)``, item assignment and a ``.data`` attribute.
        label_all_tokens: If True, continuation sub-tokens of a word get the word's
            label; if False they get -100. When left as ``None`` the module-level
            ``label_all_tokens`` variable is used if it exists, otherwise True, so
            existing three-argument calls keep their behaviour.

    Returns:
        The ``.data`` mapping of the tokenizer output, extended with a ``"labels"``
        entry holding one aligned label list per sentence. Positions whose word id is
        ``None`` are labelled -100.

    Raises:
        ValueError: If the number of tag sequences differs from the number of
            sentences, or a tag sequence does not have exactly one label per word.
    """
    if label_all_tokens is None:
        # Fall back to the notebook-level switch rather than silently ignoring it.
        label_all_tokens = globals().get("label_all_tokens", True)

    # Fail fast with a readable message: a short tag list would otherwise surface as an
    # IndexError deep in the loop, and a long one would be misaligned without any error.
    if len(tags) != len(sentences):
        raise ValueError(
            f"Got {len(tags)} tag sequences for {len(sentences)} sentences"
        )
    for position, (words, label) in enumerate(zip(sentences, tags)):
        if len(label) != len(words):
            raise ValueError(
                f"Sentence {position}: {len(label)} labels for {len(words)} words"
            )

    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, padding=True)

    aligned_labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word behind this position: mark it with -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word always carries the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token of the same word.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs.data

In [29]:
# Gold labels aligned to the sub-tokens of the gold sentences.
tokenized_gold_data = tokenize_and_align_labels(gold_word_list, gold_labels_list, tokenizer)
# Silver labels are aligned against the SAME gold word lists, so both results are built
# from identical sentences and differ only in their labels.
# NOTE(review): this assumes every silver tag sequence lines up word-for-word with the
# gold sentence at the same position -- confirm.
tokenized_silver_data = tokenize_and_align_labels(gold_word_list, silver_labels_list, tokenizer)

In [30]:
#tokenized_gold_data

In [31]:
from datasets import Dataset

In [32]:
def _as_dataset(tokenized):
    """Wrap a tokenized batch in a `Dataset`, adding a running `id` column."""
    return Dataset.from_dict({
        'id': range(len(tokenized['input_ids'])),
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': tokenized['labels']
    })


# Same column layout for both datasets; only the source of the labels differs.
gold_dataset = _as_dataset(tokenized_gold_data)
silver_dataset = _as_dataset(tokenized_silver_data)


In [33]:
import pickle

# Persist both datasets to disk, one pickle file per dataset (gold first, then silver).
for pickle_path, dataset_to_save in (
    ('mbert_eng_gold_gold_dataset.pickle', gold_dataset),
    ('mbert_eng_gold_silver_dataset.pickle', silver_dataset),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(dataset_to_save, f)



# Quality checking lengths of the annotations

In [34]:
# Every gold label sequence must contain exactly one label per word of its sentence.
lengths_equal = True
for label_seq, word_seq in zip(gold_labels_list, gold_word_list):
    if len(label_seq) != len(word_seq):
        lengths_equal = False
        break

print("Are lengths equal and aligned:", lengths_equal)

Are lengths equal and aligned: True


In [35]:
# Re-derive the three parallel lists from their raw sources before checking them.
# NOTE(review): eval() executes whatever expression the CSV cell contains; only run this
# on a trusted file, or switch to ast.literal_eval / json.loads.
silver_labels_list = [tag_array.tolist() for tag_array in silver_labels]
gold_labels_list = [eval(cell_text) for cell_text in gold_labels]
gold_word_list = [eval(cell_text) for cell_text in gold_words]

# The per-index comparison below can only cover the shortest of the three lists, so a
# differing number of sentences is reported explicitly instead of being silently ignored.
list_sizes = (len(silver_labels_list), len(gold_labels_list), len(gold_word_list))
if len(set(list_sizes)) != 1:
    print(f"Number of sentences differs: Silver labels = {list_sizes[0]}, Gold labels = {list_sizes[1]}, Gold words = {list_sizes[2]}")

# Check if all lists have the same length at each index
for i, (silver_seq, gold_seq, word_seq) in enumerate(
    zip(silver_labels_list, gold_labels_list, gold_word_list)
):
    silver_len = len(silver_seq)
    gold_len = len(gold_seq)
    word_len = len(word_seq)

    if silver_len != gold_len or silver_len != word_len:
        print(f"At index {i}: Silver labels length = {silver_len}, Gold labels length = {gold_len}, Gold words length = {word_len}")
        


# using test data on saved model

In [36]:
#gold_dataset

In [37]:
from transformers import AutoTokenizer, Trainer,AutoModelForTokenClassification

In [38]:
# Load the saved token-classification model identified by "eng_mbert"
# (presumably a local directory written by the training run -- confirm).
loaded_model = AutoModelForTokenClassification.from_pretrained("eng_mbert")

In [39]:
#loaded_model

import pickle

# NOTE(review): this reads 'gold_dataset.pickle', whereas the datasets above were written
# to 'mbert_eng_gold_gold_dataset.pickle' / 'mbert_eng_gold_silver_dataset.pickle', and
# `eng_gold` is not used by any later cell shown here -- confirm the intended file.
# pickle.load can execute code embedded in the file; only load pickles you created yourself.
with open('gold_dataset.pickle','rb') as f:
    eng_gold = pickle.load(f)

In [42]:
# Trainer built with default arguments around the loaded model; the cells below use it
# only for evaluate() and predict().
# NOTE(review): the recorded run of this cell failed with DistNetworkError (server socket
# could not bind, "Address already in use") -- presumably another process or kernel
# was holding the port; confirm and re-run on a fresh kernel.
trainer = Trainer(model = loaded_model)

[W socket.cpp:464] [c10d] The server socket has failed to bind to [::]:49487 (errno: 98 - Address already in use).
[W socket.cpp:464] [c10d] The server socket has failed to bind to 0.0.0.0:49487 (errno: 98 - Address already in use).
[E socket.cpp:500] [c10d] The server socket has failed to listen on any local network address.


DistNetworkError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:49487 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:49487 (errno: 98 - Address already in use).

In [None]:
# Run the Trainer's evaluation on the silver-labelled dataset and display what it returns.
trainer.evaluate(silver_dataset)

In [None]:
# Display the raw prediction output; the same call is repeated, and its result actually
# used, in the metrics cell further down.
trainer.predict(silver_dataset)

In [None]:
# Move the model to the first CUDA device.
# NOTE(review): the hard-coded "cuda:0" will fail on a machine without a GPU, and this
# runs after the Trainer has already been constructed around the model -- confirm it is needed.
loaded_model.to("cuda:0")

In [None]:
import numpy as np


In [None]:
# Class index -> BIO-style tag name. The "outside" class must be the letter 'O', not the
# digit '0': seqeval derives entity spans from the B-/I-/O prefixes and does not treat
# '0' as the outside tag, which distorts the entity-level scores.
id2label = {0: 'O',
            1: 'B-PER',
            2: 'I-PER',
            3: 'B-ORG',
            4: 'I-ORG',
            5: 'B-LOC',
            6: 'I-LOC'
           }

In [None]:
# Inverse lookup: tag name -> class index.
label2id = {tag_name: class_idx for class_idx, tag_name in id2label.items()}

In [None]:
# Tag names in the insertion order of label2id; the metrics cell indexes this list
# with class indices (label_names[p]).
label_names = [*label2id]

In [None]:
from datasets import load_metric

In [None]:
# Load the "seqeval" metric used below to score predicted vs. reference tag sequences.
# NOTE(review): `datasets.load_metric` is deprecated (and removed in datasets 3.x) in
# favour of `evaluate.load("seqeval")` -- migrate once the `evaluate` package is available.
metric = load_metric("seqeval")

In [None]:
# Run the model on the silver dataset and take the highest-scoring class along axis 2.
predictions, labels, metrics = trainer.predict(silver_dataset)
predictions = np.argmax(predictions, axis=2)

# Translate class indices into tag names, skipping every position whose reference
# label is -100 (those positions carry no word-level label).
true_predictions = []
true_labels = []
for predicted_row, label_row in zip(predictions, labels):
    kept_pairs = [(p, l) for (p, l) in zip(predicted_row, label_row) if l != -100]
    true_predictions.append([label_names[p] for (p, _) in kept_pairs])
    true_labels.append([label_names[l] for (_, l) in kept_pairs])

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
# Repeats the metric computation from the end of the previous cell and displays it again.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
# Print predicted and reference tag sequences for each example, numbering from 1.
for example_no, (predicted_tags, reference_tags) in enumerate(
    zip(true_predictions, true_labels), start=1
):
    print("Example", example_no)
    print("Predicted:", predicted_tags)
    print("Real:", reference_tags)
    print()