### Extracting POS tags (Experimenting)

In [1]:
import polyglot
from polyglot.downloader import downloader

# Show which languages have a "pos2" package available (3-column table).
pos2_language_table = downloader.supported_languages_table("pos2", 3)
print(pos2_language_table)

# Try to fetch the Macedonian package.
# NOTE(review): the output recorded for this cell reports that 'pos2.mk' is not
# found in the index, so this download appears to fail — confirm before relying
# on the polyglot path for 'mk'.
downloader.download("pos2.mk")

from polyglot.text import Text

def get_pos_tags_polyglot(text, lang='mk'):
    """Tag ``text`` with polyglot and return it as a ``word_TAG`` string.

    Args:
        text: Raw text to tokenize and tag.
        lang: Language code handed to polyglot as ``hint_language_code``
            (defaults to ``'mk'``).

    Returns:
        One ``word_TAG`` entry per token, joined by single spaces.
    """
    poly_text = Text(text, hint_language_code=lang)
    # ``pos_tags`` already yields (word, tag) pairs, so iterate it directly.
    # Zipping it with ``poly_text.words`` would bind the whole pair to ``tag``
    # and leak the tuple repr into the output string.
    return ' '.join(f'{word}_{tag}' for word, tag in poly_text.pos_tags)


  1. Italian                    2. French                     3. Spanish; Castilian       
  4. Bulgarian                  5. Slovene                    6. Irish                    
  7. Finnish                    8. Dutch                      9. Swedish                  
 10. Danish                    11. Portuguese                12. English                  
 13. German                    14. Indonesian                15. Czech                    
 16. Hungarian                
[polyglot_data] Error loading pos2.mk: Package 'pos2.mk' not found in
[polyglot_data]     index


In [2]:
import stanza

# Cache of already-built Stanza pipelines, keyed by language code
nlp_pipelines = {}

def get_stanza_pipeline(lang):
    """Return the tokenize+POS Stanza pipeline for ``lang``, building it once.

    The first request for a language calls ``stanza.download(lang)``, builds a
    pipeline with the ``tokenize,pos`` processors and stores it in
    ``nlp_pipelines``; later requests return the stored object.
    """
    if lang in nlp_pipelines:
        return nlp_pipelines[lang]

    stanza.download(lang)
    pipeline = stanza.Pipeline(lang, processors='tokenize,pos')
    nlp_pipelines[lang] = pipeline
    return pipeline

def get_pos_tags_stanza(text, lang='en'):
    """Tag ``text`` with the Stanza pipeline for ``lang``.

    Returns a single string of ``word_UPOS`` entries (one per word, across all
    sentences of the parsed document) separated by single spaces.
    """
    annotate = get_stanza_pipeline(lang)
    tagged_words = []
    for sentence in annotate(text).sentences:
        for word in sentence.words:
            tagged_words.append(f'{word.text}_{word.upos}')
    return ' '.join(tagged_words)

# Example usage: one sample sentence per Stanza-backed language
english_text = "This is a sample sentence."
arabic_text = "هذا نص عربي للتحليل."
bulgarian_text = "Това е примерен текст за анализ."

for _label, _sample, _lang_code in (
    ("English:", english_text, 'en'),
    ("Arabic:", arabic_text, 'ar'),
    ("Bulgarian:", bulgarian_text, 'bg'),
):
    print(_label, get_pos_tags_stanza(_sample, _lang_code))

KeyboardInterrupt: 

In [None]:
import json
import spacy
from spacy.lang.en import English
import re

# Load spaCy English model
# Bound to the module-level name `nlp`, which the spaCy-based tagger defined
# later in this cell reads.
nlp = spacy.load("en_core_web_sm")

def load_data(filename):
    """Read ``filename`` as UTF-8 text and return the parsed JSON content."""
    with open(filename, mode='r', encoding='utf-8') as source:
        return json.loads(source.read())

def save_data(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by 4 spaces.

    Missing parent directories of ``filename`` are created first, so writing
    into a not-yet-existing output folder (e.g. a ``POS/`` subdirectory) does
    not fail with ``FileNotFoundError``.

    Args:
        data: JSON-serializable object to store.
        filename: Destination path; an existing file is overwritten.
    """
    import os  # local import: this cell does not import os at the top

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when the target is in the current directory
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def preprocess_text(text):
    """Normalize raw text before POS tagging.

    Steps, in order:
      1. Replace each run of escaped newlines (a literal backslash followed by
         ``n``) with one space. Real newline characters are not matched here;
         they are folded by the whitespace step below.
      2. Delete ``http://``/``https://`` and ``www.`` URLs.
      3. Turn every remaining backslash into a space.
      4. Collapse whitespace runs to one space and trim both ends.

    Case, digits and punctuation are left untouched.
    """
    without_escaped_newlines = re.sub(r'(\\n)+', ' ', text)
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', without_escaped_newlines)
    without_backslashes = without_urls.replace("\\", " ")
    return re.sub(r'\s+', ' ', without_backslashes).strip()


def get_pos_tags_spacy(text):
    """Tag ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns one ``token_POS`` entry per spaCy token, joined by single spaces.

    This used to be a first definition of ``get_pos_tags`` that the dispatcher
    below silently replaced; it now has its own name so it stays callable.
    """
    doc = nlp(text)
    return ' '.join(f'{token}_{token.pos_}' for token in doc)


def get_pos_tags(text, lang='en'):
    """Dispatch POS tagging to the backend used for ``lang``.

    Args:
        text: Text to tag.
        lang: Language code. ``'en'``, ``'ar'`` and ``'bg'`` go to
            ``get_pos_tags_stanza``; ``'mk'`` goes to ``get_pos_tags_polyglot``.

    Returns:
        Whatever the selected backend returns for ``text``.

    Raises:
        NotImplementedError: If ``lang`` is none of the codes above.
    """
    if lang in ('en', 'ar', 'bg'):  # supported by Stanza
        return get_pos_tags_stanza(text, lang)
    if lang == 'mk':  # use Polyglot for Macedonian
        return get_pos_tags_polyglot(text, lang)
    raise NotImplementedError(f"POS tagging not implemented for language: {lang}")


def process_data(filename, output_filename, lang='en'):
    """POS-tag every record of a JSON file and write the result to a new file.

    Loads the records from ``filename``, replaces each record's ``'text'``
    value with the POS-tagged form of its preprocessed text, saves the records
    to ``output_filename`` and returns them.
    """
    records = load_data(filename)
    for record in records:
        cleaned = preprocess_text(record['text'])
        # The tagged string overwrites the original text in place.
        record['text'] = get_pos_tags(cleaned, lang)
    save_data(records, output_filename)
    return records

# Example of usage
if __name__ == "__main__":
    filename = 'semeval2024_dev_release/subtask1/validation.json'
    output_file_path = 'semeval2024_dev_release/subtask1/pos_validation.json'

    # Input/output path pair for every split
    train_input, train_output = (
        './semeval2024_dev_release/subtask1/train.json',
        './semeval2024_dev_release/subtask1/POS/train.json',
    )
    val_input, val_output = (
        './semeval2024_dev_release/subtask1/validation.json',
        './semeval2024_dev_release/subtask1/POS/validation.json',
    )
    test_en_input, test_en_output = (
        './test_data/english/en_subtask1_test_unlabeled.json',
        './test_data/english/POS/en_subtask1_test_unlabeled.json',
    )
    test_md_input, test_md_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_md.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_md.json',
    )
    test_ar_input, test_ar_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_ar.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_ar.json',
    )
    test_bg_input, test_bg_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_bg.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_bg.json',
    )

    # Tag each split in turn; the "md" split is defined above but not processed.
    for _source, _target, _lang_code in (
        (train_input, train_output, 'en'),
        (val_input, val_output, 'en'),
        (test_en_input, test_en_output, 'en'),
        (test_bg_input, test_bg_output, 'bg'),
        (test_ar_input, test_ar_output, 'ar'),
    ):
        updated_data = process_data(_source, _target, lang=_lang_code)


In [None]:
import stanza

class NLPModelManager:
    """Builds Stanza pipelines on demand and keeps one per language code."""

    def __init__(self):
        # Maps language code -> pipeline built for that language
        self.nlp_pipelines = dict()

    def get_pipeline(self, lang='en'):
        """Return the tokenize+POS pipeline for ``lang``, creating it if needed.

        On the first request for a language this calls ``stanza.download(lang)``
        and builds a pipeline with ``use_gpu=True``; later requests return the
        stored pipeline.
        """
        if lang in self.nlp_pipelines:
            return self.nlp_pipelines[lang]

        stanza.download(lang)
        built = stanza.Pipeline(lang, processors='tokenize,pos', use_gpu=True)
        self.nlp_pipelines[lang] = built
        return built


# Module-level manager instance read by get_pos_tags_stanza
nlp_manager = NLPModelManager()

def get_pos_tags_stanza(text, lang='en'):
    """Tag ``text`` using the pipeline that ``nlp_manager`` holds for ``lang``.

    Returns a single string of ``word_UPOS`` entries, one per word across all
    sentences of the parsed document, separated by single spaces.
    """
    pipeline = nlp_manager.get_pipeline(lang)
    parsed = pipeline(text)
    tagged_words = []
    for sentence in parsed.sentences:
        tagged_words.extend(f'{word.text}_{word.upos}' for word in sentence.words)
    return ' '.join(tagged_words)

# Example usage: one sample sentence per Stanza-backed language
english_text = "This is a sample sentence."
arabic_text = "هذا نص عربي للتحليل."
bulgarian_text = "Това е примерен текст за анализ."

for _label, _sample, _lang_code in (
    ("English:", english_text, 'en'),
    ("Arabic:", arabic_text, 'ar'),
    ("Bulgarian:", bulgarian_text, 'bg'),
):
    print(_label, get_pos_tags_stanza(_sample, _lang_code))


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_text_item(item, lang='en'):
    """Replace ``item['text']`` with its POS-tagged form and return ``item``.

    The text is preprocessed first, then tagged for ``lang``; ``item`` is
    modified in place.
    """
    item['text'] = get_pos_tags(preprocess_text(item['text']), lang)
    return item

def process_data_concurrently(data, lang='en'):
    """Run ``process_text_item`` over ``data`` on a pool of 5 worker threads.

    Args:
        data: Iterable of records; each is passed to ``process_text_item``.
        lang: Language code forwarded to ``process_text_item``.

    Returns:
        The processed records as a list, in the same order as ``data``.
        Results used to be collected in completion order, which made the
        output order vary between runs.

    Raises:
        Whatever ``process_text_item`` raises for a record, re-raised when that
        record's result is collected.
    """
    # NOTE(review): whether the underlying tagger may be called from several
    # threads at once is not established in this notebook — confirm.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_text_item, item, lang) for item in data]
        # Collect in submission order so output lines up with the input.
        return [future.result() for future in futures]

def batch_process(data, batch_size=10, lang='en'):
    """POS-tag ``data`` in batches of ``batch_size`` records.

    Each record's ``'text'`` is preprocessed, tagged and overwritten in place
    with a space-separated string of ``word_UPOS`` entries.

    Args:
        data: Sequence of dict records with a ``'text'`` key.
        batch_size: Number of records handed to the pipeline per call.
        lang: Language code used to look up the Stanza pipeline.

    Returns:
        The same ``data`` object, with every record updated.
    """
    nlp = get_stanza_pipeline(lang)
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        # Wrap each text in a stanza.Document: the pipeline handles several
        # documents per call only when given Document objects, not raw strings.
        in_docs = [stanza.Document([], text=preprocess_text(item['text'])) for item in batch]
        out_docs = nlp(in_docs)
        for doc, item in zip(out_docs, batch):
            item['text'] = ' '.join(
                f'{word.text}_{word.upos}' for sent in doc.sentences for word in sent.words
            )
    return data

# import cProfile
# import pstats
# 
# with cProfile.Profile() as pr:
#     process_data(test_ar_input, test_ar_output, lang='ar')
# 
# stats = pstats.Stats(pr)
# stats.sort_stats(pstats.SortKey.TIME)
# stats.print_stats()

# Usage
if __name__ == "__main__":
    filename = 'semeval2024_dev_release/subtask1/validation.json'
    output_file_path = 'semeval2024_dev_release/subtask1/pos_validation.json'

    # Input/output path pair for every split
    train_input, train_output = (
        './semeval2024_dev_release/subtask1/train.json',
        './semeval2024_dev_release/subtask1/POS/train.json',
    )
    val_input, val_output = (
        './semeval2024_dev_release/subtask1/validation.json',
        './semeval2024_dev_release/subtask1/POS/validation.json',
    )
    test_en_input, test_en_output = (
        './test_data/english/en_subtask1_test_unlabeled.json',
        './test_data/english/POS/en_subtask1_test_unlabeled.json',
    )
    test_md_input, test_md_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_md.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_md.json',
    )
    test_ar_input, test_ar_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_ar.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_ar.json',
    )
    test_bg_input, test_bg_output = (
        './test_labels_ar_bg_md_version2/test_subtask1_bg.json',
        './test_labels_ar_bg_md_version2/POS/test_subtask1_bg.json',
    )

    # Only the three test splits are tagged in this cell; the train, validation
    # and "md" paths are defined above but not processed.
    for _source, _target, _lang_code in (
        (test_en_input, test_en_output, 'en'),
        (test_bg_input, test_bg_output, 'bg'),
        (test_ar_input, test_ar_output, 'ar'),
    ):
        updated_data = process_data(_source, _target, lang=_lang_code)

### Extracting NER embeddings

In [None]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import pickle

import torchvision.models as models
from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the tokenizer and the token-classification model from one checkpoint,
# asking the model to return the hidden states of every layer.
ner_checkpoint = "Babelscape/wikineural-multilingual-ner"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint, output_hidden_states=True)

# Encode a sample sentence and run it through the model
text = "Insert your text here."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
outputs = model(**inputs)
hidden_states = outputs.hidden_states

# Last entry of hidden_states, first (only) sequence in the batch
last_layer = hidden_states[-1]
token_embeddings = last_layer[0]


# Strategy 2: Use the [CLS] token embedding
# (taken here as the embedding at position 0)
cls_embedding = token_embeddings[0]


In [None]:
# Display the shape of the position-0 embedding taken in the previous cell.
cls_embedding.shape

In [None]:
# Device that the `model.to(device)` calls in later cells move the model to.
# NOTE(review): `get_device` is not defined in this notebook; presumably it comes
# from one of the wildcard `utils` imports — confirm.
device = get_device()

In [None]:
def process_json(file_path, techniques_to_level, hierarchy):
    """Load a JSON dataset into a DataFrame and add a ``cleaned_text`` column.

    ``cleaned_text`` is ``replace_newlines_with_fullstop`` applied to each
    ``text`` value. A ``link`` column, when present, is removed.

    ``techniques_to_level`` and ``hierarchy`` are accepted but not used here.
    """
    frame = pd.read_json(file_path)
    frame['cleaned_text'] = frame['text'].apply(replace_newlines_with_fullstop)

    if 'link' not in frame.columns:
        return frame
    return frame.drop(columns=['link'])

In [None]:
def extract_text_features(file_path, tokenizer, text_model, output_file_path, subtask=1, device=None):
    """Embed every text of a dataset and pickle the vectors keyed by record id.

    Each ``cleaned_text`` is tokenized (special tokens added, padded/truncated
    to 128 tokens) and passed through ``text_model``. The stored feature is the
    position-0 vector of the last entry in the model's ``hidden_states``,
    moved to CPU and converted to a NumPy array.

    Args:
        file_path: JSON dataset handed to ``process_json``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``.
        text_model: Model whose output exposes ``hidden_states``.
        output_file_path: Destination of the pickled ``{id: vector}`` dict.
            Missing parent directories are created.
        subtask: ``1`` selects the subtask-1 lookup tables for ``process_json``;
            any other value selects the subtask-2a ones.
        device: Device the encoded inputs are moved to. When ``None`` (the
            default) the device of ``text_model``'s first parameter is used, so
            inputs always land where the model already lives. The previous
            hard-coded ``"mps"`` default broke whenever the model sat elsewhere.
    """
    import os  # local import keeps this cell independent of other cells

    if device is None:
        first_param = next(text_model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    if subtask == 1:
        data = process_json(file_path, techniques_to_level_1, hierarchy_1)
    else:
        data = process_json(file_path, techniques_to_level_2a, hierarchy_subtask_2a)

    features_dict = {}
    for step, (sample_id, text) in enumerate(zip(data['id'], data['cleaned_text']), start=1):
        encoded_input = tokenizer(text, return_tensors='pt', add_special_tokens=True,
                                  max_length=128, truncation=True, padding='max_length').to(device)

        with torch.no_grad():
            embeddings = text_model(**encoded_input)
        # Position 0 of the last hidden-state entry, squeezed to a flat vector.
        features_dict[sample_id] = embeddings.hidden_states[-1][:, 0, :].detach().cpu().squeeze().numpy()

        if step % 100 == 0:
            print(f'completed {step} steps')

    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

In [None]:
# English BERT-large NER checkpoint, configured to return all hidden states
bert_ner_checkpoint = "dslim/bert-large-NER"
tokenizer = AutoTokenizer.from_pretrained(bert_ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(bert_ner_checkpoint, output_hidden_states=True)

model.to(device)

# Output folder for the subtask-1 features produced with this checkpoint
dir = './TextFeatures/subtask1a/BERT-NER/'

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = f'{dir}train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = f'{dir}validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = f'{dir}en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = f'{dir}md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = f'{dir}ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = f'{dir}bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = f'{dir}en_dev_text_features.pkl'

# Calls for the other splits are kept disabled; only the dev split runs below.
# extract_text_features(train_input, tokenizer, model, train_output)
# extract_text_features(val_input, tokenizer, model, val_output)
#
# extract_text_features(test_en_input, tokenizer, model, test_en_output)
# extract_text_features(test_md_input, tokenizer, model, test_md_output)
# extract_text_features(test_ar_input, tokenizer, model, test_ar_output)
# extract_text_features(test_bg_input, tokenizer, model, test_bg_output)

extract_text_features(dev_en_input, tokenizer, model, dev_en_output)

In [None]:
# Multilingual NER checkpoint, configured to return all hidden states
multilingual_ner_checkpoint = "Babelscape/wikineural-multilingual-ner"
tokenizer = AutoTokenizer.from_pretrained(multilingual_ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(multilingual_ner_checkpoint,
                                                        output_hidden_states=True)

model.to(device)

# Output folder for the subtask-1 features produced with this checkpoint
dir = './TextFeatures/subtask1a/multilingual-ner/'

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = f'{dir}train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = f'{dir}validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = f'{dir}en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = f'{dir}md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = f'{dir}ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = f'{dir}bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = f'{dir}en_dev_text_features.pkl'

# Calls for the other splits are kept disabled; only the dev split runs below.
# extract_text_features(train_input, tokenizer, model, train_output)
# extract_text_features(val_input, tokenizer, model, val_output)
#
# extract_text_features(test_en_input, tokenizer, model, test_en_output)
# extract_text_features(test_md_input, tokenizer, model, test_md_output)
# extract_text_features(test_ar_input, tokenizer, model, test_ar_output)
# extract_text_features(test_bg_input, tokenizer, model, test_bg_output)

extract_text_features(dev_en_input, tokenizer, model, dev_en_output)

In [None]:
# Multilingual NER checkpoint, configured to return all hidden states
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner", 
                                                        output_hidden_states=True)
device = get_device()
model.to(device)

# Output folder for the subtask-2a features produced with this checkpoint
dir = './TextFeatures/subtask2a/multilingual-ner/'

train_input = './semeval2024_dev_release/subtask2a/train.json'
train_output = dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask2a/validation.json'
val_output = dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask2a_test_unlabeled.json'
test_en_output = dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask2a_md.json'
test_md_output = dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask2a_ar.json'
test_ar_output = dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask2a_bg.json'
test_bg_output = dir + 'bg_test_text_features.pkl'

# The dev input previously pointed at the subtask1 folder, so subtask-1 texts
# were written into the subtask2a feature directory; every other path in this
# cell is a subtask2a path.
# NOTE(review): confirm that subtask2a/dev_unlabeled.json exists in the release.
dev_en_input = './semeval2024_dev_release/subtask2a/dev_unlabeled.json'
dev_en_output = dir + 'en_dev_text_features.pkl'

# Calls for the other splits are kept disabled; only the dev split runs below.
# extract_text_features(train_input, tokenizer, model, train_output)
# extract_text_features(val_input, tokenizer, model, val_output)

# extract_text_features(test_en_input, tokenizer, model, test_en_output)
# extract_text_features(test_md_input, tokenizer, model, test_md_output)
# extract_text_features(test_ar_input, tokenizer, model, test_ar_output)
# extract_text_features(test_bg_input, tokenizer, model, test_bg_output)

extract_text_features(dev_en_input, tokenizer, model, dev_en_output)