# This notebook runs Hugging Face baseline NER models on our dataset

In [24]:
import json
import torch
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
from pandas import read_parquet
from transformers import BertModel
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification,BertTokenizerFast, AutoTokenizer

In [23]:
# Directory holding the merged dataset splits; change it here to relocate the data.
DATA_DIR = r"/home/javin/Coding/CSCI544/FinalProject/data/merge"

# NOTE(review): the training split is read from raw.parquet (not train.parquet) — confirm this is intended.
data_train = read_parquet(f"{DATA_DIR}/raw.parquet")
data_dev = read_parquet(f"{DATA_DIR}/dev.parquet")
data_test = read_parquet(f"{DATA_DIR}/test.parquet")

In [39]:
# Preview the training DataFrame (the rendered output elides the middle rows).
data_train

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[#, #, ユ, リ, ウ, ス, ・, ベ, ー, リ, ッ, ク, #, 1, 9, ...","[0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[PER: ユ リ ウ ス ・ ベ ー リ ッ ク]
1,"[#, ル, ノ, ー, 、, 日, 産, 自, 動, 車, に, 資, 本, 参, 加, 。]","[0, 3, 4, 4, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ル ノ ー, ORG: 日 産 自 動 車]"
2,"[ソ, マ, リ, ラ, ン, ド, （, 事, 実, 上, 独, 立, し, た, 地, ...","[5, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[LOC: ソ マ リ ラ ン ド]
3,"[R, E, D, I, R, E, C, T, #, ス, レ, イ, マ, ニ, エ, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ス レ イ マ ニ エ ・ モ, ORG: ス ク]"
4,"[#, ', ', E, l, e, c, t, r, i, c, #, C, o, u, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[PER: S t e v e, PER: R e i c h]"
...,...,...,...,...
9995,"[', '', Mário, Jardel, '', ']","[0, 0, 1, 2, 0, 0]","[es, es, es, es, es, es]",[PER: Mário Jardel]
9996,"[José, Joaquín, Araiza]","[1, 2, 2]","[es, es, es]",[PER: José Joaquín Araiza]
9997,"[', '', Poestenkill, '', ', ;, o]","[0, 0, 5, 0, 0, 0, 0]","[es, es, es, es, es, es, es]",[LOC: Poestenkill]
9998,"[REDIRECCIÓN, 1., Lig, 1976-77]","[0, 5, 6, 6]","[es, es, es, es]",[LOC: 1. Lig 1976-77]


In [25]:
# Convert each pandas split into a Hugging Face Dataset.
train_ds, validation_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (data_train, data_dev, data_test)
)

# Bundle the three splits under their conventional names.
ds = DatasetDict(
    {
        'train': train_ds,
        'validation': validation_ds,
        'test': test_ds,
    }
)

In [8]:
# Load the tag -> index mapping from its JSON file.
with open(r"/home/javin/Coding/CSCI544/FinalProject/data/merge/tags_2_idx.json", "r", encoding="utf-8") as file:
    tags_2_idx = json.load(file)
# The `with` block closes the file on exit, so no explicit close() is needed.


In [9]:
# Invert the tag -> index mapping so label ids can be turned back into tag names.
idx_2_tags = {index: tag for tag, index in tags_2_idx.items()}

In [10]:
# Show the index -> tag mapping derived from tags_2_idx.
idx_2_tags

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [13]:

model_name = "Babelscape/wikineural-multilingual-ner"

# padding / truncation / is_split_into_words are arguments of the tokenizer
# *call*, not of from_pretrained; passing them at load time does not configure
# how inputs are encoded, so they are supplied where the tokenizer is invoked.
wikineural_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the token-classification head with this project's label mapping.
wikineural_model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(idx_2_tags),
    label2id=tags_2_idx,
    id2label=idx_2_tags,
)

In [40]:
# Encode one training example whose text is already split into tokens.
sample_tokens = ds['train'][9996]['tokens']
tokenized = wikineural_tokenizer(
    sample_tokens,
    is_split_into_words=True,
    truncation=True,
)

In [41]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Add a leading batch dimension of size one and place the ids on the chosen device.
sample_input_ids = tokenized['input_ids']
wikineural_input_token = torch.tensor(sample_input_ids).unsqueeze(0).to(device)
wikineural_input_token

tensor([[ 101,  108,  122,  130,  130,  126, 3642, 1881, 3692, 4473, 8239, 2072,
         5751, 5785,  102]], device='cuda:0')

In [42]:
# Move the model to the same device as the input ids.
wikineural_model = wikineural_model.to(device)
# Evaluation mode turns off dropout so repeated runs give the same predictions.
wikineural_model.eval()

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    model_prediction = wikineural_model(wikineural_input_token)

model_logits = model_prediction.logits
# Highest-scoring label id at every position of the sequence.
model_predictions = torch.argmax(model_logits, dim=-1)

In [43]:
# Predicted label ids for the first (and only) sequence in the batch.
model_predictions[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0], device='cuda:0')

In [44]:
def reverse_map_predictions(original_tokens, tokenized_inputs, predictions, missing_label=0):
    """Collapse per-sub-token predictions to one prediction per original token.

    Each original token takes the prediction of its first sub-token. The
    result always has exactly ``len(original_tokens)`` entries, so it stays
    aligned with the input even when a token produced no sub-tokens or was
    cut off by truncation.

    Args:
        original_tokens: The pre-split tokens that were passed to the tokenizer.
        tokenized_inputs: Tokenizer output exposing ``word_ids()``, which maps
            every sub-token position to the index of its source token, or
            ``None`` for special tokens.
        predictions: One predicted label per sub-token position, in the same
            order as ``tokenized_inputs.word_ids()``.
        missing_label: Label assigned to an original token that has no
            sub-token in ``tokenized_inputs``. Defaults to ``0``.

    Returns:
        A two-element list ``[original_tokens, token_level_predictions]``.
    """
    first_prediction = {}
    for word_id, prediction in zip(tokenized_inputs.word_ids(), predictions):
        # Special tokens such as [CLS] and [SEP] have no source token.
        if word_id is None:
            continue
        # Only the first sub-token of each word decides the word's label.
        first_prediction.setdefault(word_id, prediction)

    # Index by token position so a dropped or truncated token cannot shift
    # the labels of the tokens that follow it.
    reversed_predictions = [
        first_prediction.get(index, missing_label)
        for index in range(len(original_tokens))
    ]

    return [original_tokens, reversed_predictions]

In [45]:
# Map the sub-token predictions back onto the original tokens of the same example.
reverse_map_predictions(ds['train'][9996]['tokens'], tokenized, model_predictions[0].tolist())

[['#', '1', '9', '9', '5', '年', '、', '廣', '木', '隆', '一', '監', '督'],
 [0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]]

In [46]:
# Gold label ids for the same example, for comparison with the predictions.
ds['train'][9996]['ner_tags']

[0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]