### Extracting NER embeddings

In [None]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import pickle

import torchvision.models as models
from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the model and tokenizer. output_hidden_states=True makes the forward pass
# return `hidden_states`, which is read below.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner", output_hidden_states=True)

# Encode the input text and obtain the model outputs
text = "Insert your text here."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
# Inference only: run without autograd so no graph is built and the resulting
# embedding does not carry gradient state (extract_text_features does the same).
with torch.no_grad():
    outputs = model(**inputs)
hidden_states = outputs.hidden_states
# Last entry of hidden_states, first item along the leading axis
# (assumes the usual (batch, seq_len, hidden) layout — TODO confirm).
token_embeddings = hidden_states[-1][0]

# Vector at sequence position 0 (presumably the [CLS] token for this tokenizer).
cls_embedding = token_embeddings[0]


In [None]:
# Display the size of the position-0 embedding extracted in the previous cell.
cls_embedding.shape

In [None]:
# Pick the compute device via get_device(), which is not defined in this notebook
# (presumably provided by one of the `utils` star-imports — confirm).
device = get_device()

In [None]:
def process_json(file_path, techniques_to_level, hierarchy):
    """Load a dataset JSON file into a DataFrame with a `cleaned_text` column.

    Args:
        file_path: Path of the JSON file handed to `pd.read_json`.
        techniques_to_level: Not used by this function; kept for the callers.
        hierarchy: Not used by this function; kept for the callers.

    Returns:
        The loaded DataFrame, with `cleaned_text` added (the `text` column
        passed through `replace_newlines_with_fullstop`) and the `link`
        column removed when the file has one.
    """
    frame = pd.read_json(file_path)
    frame['cleaned_text'] = frame['text'].apply(replace_newlines_with_fullstop)

    # errors='ignore' turns the drop into a no-op for files without a 'link' column.
    frame.drop(columns=['link'], inplace=True, errors='ignore')

    return frame

In [None]:
def extract_text_features(file_path, tokenizer, text_model, output_file_path, subtask=1, device=None):
    """Encode every text of a dataset file and pickle one vector per sample id.

    Each `cleaned_text` entry is tokenized on its own (padded/truncated to 128
    tokens), passed through `text_model`, and the position-0 vector of the last
    entry in `hidden_states` is stored under the sample's `id`.

    Args:
        file_path: Dataset JSON file, loaded through `process_json`.
        tokenizer: Callable tokenizer returning an object with `.to(device)`.
        text_model: Model whose output exposes `hidden_states`; it must already
            live on the device the inputs are sent to.
        output_file_path: Destination of the pickled `{id: numpy array}` dict.
        subtask: 1 selects the subtask-1 label tables, any other value the
            subtask-2a tables, when calling `process_json`.
        device: Device the encoded inputs are moved to. When None (default) the
            device of the model's first parameter is used, so inputs and model
            always agree instead of assuming a fixed backend.
    """
    if device is None:
        device = next(text_model.parameters()).device

    # Create the target folder up front: open(..., 'wb') does not create it, and
    # failing only after the whole dataset has been encoded wastes the run.
    target_dir = os.path.dirname(output_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if subtask == 1:
        data = process_json(file_path, techniques_to_level_1, hierarchy_1)
    else:
        data = process_json(file_path, techniques_to_level_2a, hierarchy_subtask_2a)

    features_dict = {}

    # `sample_id` rather than `id`, so the builtin id() is not shadowed.
    for step, (sample_id, text) in enumerate(zip(data['id'], data['cleaned_text']), start=1):
        encoded_input = tokenizer(text, return_tensors='pt', add_special_tokens=True,
                                  max_length=128, truncation=True, padding='max_length').to(device)

        with torch.no_grad():
            outputs = text_model(**encoded_input)

        # Last hidden_states entry, sequence position 0; squeeze() removes the
        # size-1 leading axis left by encoding a single text.
        features_dict[sample_id] = outputs.hidden_states[-1][:, 0, :].detach().cpu().squeeze().numpy()

        if step % 100 == 0:
            print(f'completed {step} steps')

    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

In [None]:
# Extract subtask-1 text features with the "dslim/bert-large-NER" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER", output_hidden_states=True)

model.to(device)

# Named `output_dir` rather than `dir` so the builtin dir() stays usable.
output_dir = './TextFeatures/subtask1a/BERT-NER/'
# The pickle files are written with a plain open(); make sure the folder exists.
os.makedirs(output_dir, exist_ok=True)

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = output_dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = output_dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = output_dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = output_dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = output_dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = output_dir + 'bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = output_dir + 'en_dev_text_features.pkl'

# Same order as before: train, validation, the four test sets, then dev.
# `device` is passed explicitly so the inputs land on the device the model was
# moved to, instead of the function's own default.
for input_path, output_path in [
    (train_input, train_output),
    (val_input, val_output),
    (test_en_input, test_en_output),
    (test_md_input, test_md_output),
    (test_ar_input, test_ar_output),
    (test_bg_input, test_bg_output),
    (dev_en_input, dev_en_output),
]:
    extract_text_features(input_path, tokenizer, model, output_path, device=device)

In [None]:
# Extract subtask-1 text features with the
# "Babelscape/wikineural-multilingual-ner" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner",
                                                        output_hidden_states=True)

model.to(device)

# Named `output_dir` rather than `dir` so the builtin dir() stays usable.
output_dir = './TextFeatures/subtask1a/multilingual-ner/'
# The pickle files are written with a plain open(); make sure the folder exists.
os.makedirs(output_dir, exist_ok=True)

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = output_dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = output_dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = output_dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = output_dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = output_dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = output_dir + 'bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = output_dir + 'en_dev_text_features.pkl'

# Same order as before: train, validation, the four test sets, then dev.
# `device` is passed explicitly so the inputs land on the device the model was
# moved to, instead of the function's own default.
for input_path, output_path in [
    (train_input, train_output),
    (val_input, val_output),
    (test_en_input, test_en_output),
    (test_md_input, test_md_output),
    (test_ar_input, test_ar_output),
    (test_bg_input, test_bg_output),
    (dev_en_input, dev_en_output),
]:
    extract_text_features(input_path, tokenizer, model, output_path, device=device)

In [None]:
# Extract subtask-2a text features with the
# "Babelscape/wikineural-multilingual-ner" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner", 
                                                        output_hidden_states=True)
device = get_device()
model.to(device)

# Named `output_dir` rather than `dir` so the builtin dir() stays usable.
output_dir = './TextFeatures/subtask2a/multilingual-ner/'
# The pickle files are written with a plain open(); make sure the folder exists.
os.makedirs(output_dir, exist_ok=True)

train_input = './semeval2024_dev_release/subtask2a/train.json'
train_output = output_dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask2a/validation.json'
val_output = output_dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask2a_test_unlabeled.json'
test_en_output = output_dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask2a_md.json'
test_md_output = output_dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask2a_ar.json'
test_ar_output = output_dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask2a_bg.json'
test_bg_output = output_dir + 'bg_test_text_features.pkl'

# Was './semeval2024_dev_release/subtask1/dev_unlabeled.json': the subtask-1 dev
# set was being written into the subtask-2a feature folder.
# NOTE(review): assumes the dev release ships subtask2a/dev_unlabeled.json next
# to train.json and validation.json — confirm the file exists.
dev_en_input = './semeval2024_dev_release/subtask2a/dev_unlabeled.json'
dev_en_output = output_dir + 'en_dev_text_features.pkl'

# Same order as before: train, validation, the four test sets, then dev.
# `device` is passed explicitly so the inputs land on the device the model was
# moved to, instead of the function's own default.
# NOTE(review): `subtask` is left at its default as before; process_json does
# not use the label tables it receives, so the extracted features are unaffected.
for input_path, output_path in [
    (train_input, train_output),
    (val_input, val_output),
    (test_en_input, test_en_output),
    (test_md_input, test_md_output),
    (test_ar_input, test_ar_output),
    (test_bg_input, test_bg_output),
    (dev_en_input, dev_en_output),
]:
    extract_text_features(input_path, tokenizer, model, output_path, device=device)