In [5]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import pickle

import torchvision.models as models
from utils.utils import *
from utils.label_decoding import *
from utils.HierarchicalLoss import *

In [None]:
# Resolve the compute device once for the notebook. `process_folder` and
# `extract_text_features` read this module-level `device` rather than taking it
# as an argument, so this cell must run before them.
# NOTE(review): `get_device` presumably comes from one of the `utils` star imports
# and returns something accepted by `.to(...)` — confirm.
device = get_device()

### ResNet-50 Image Feature Extraction

In [None]:
def process_folder(folder_path, model, transform):
    """Compute one feature vector per image file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive). Only regular files whose
            name ends in ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.gif``
            (case-insensitive) are processed.
        model: Callable applied to the transformed image after a leading batch
            dimension has been added.
        transform: Callable that turns an RGB ``PIL.Image`` into a tensor.

    Returns:
        dict mapping each image file name to the model output as a NumPy array
        with all size-1 dimensions removed.

    Note:
        Tensors are moved to the notebook-level ``device`` global, so that name
        must be defined (and match the model's device) before calling this.
    """
    # Require the dot so that names such as "listpng" are not mistaken for images.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    features_dict = {}
    # Sorted so the resulting dict (and any pickle made from it) has a stable order.
    for image_name in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, image_name)
        if not (os.path.isfile(image_path) and image_name.lower().endswith(image_suffixes)):
            continue

        # `convert` returns an independent image, so the file handle can be
        # released immediately instead of leaking one handle per image.
        with Image.open(image_path) as handle:
            image = handle.convert('RGB')

        # Add a batch dimension and move to the compute device.
        batch = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            features = model(batch)

        # A single squeeze() already drops every size-1 dimension.
        features_dict[image_name] = features.cpu().squeeze().numpy()

    return features_dict

def extract_image_features(folder_path, modelname='resnet50', output_file_path='features.pkl'):
    """Extract ResNet-50 features for every image in a folder and pickle them.

    Args:
        folder_path: Directory of images, forwarded to ``process_folder``.
        modelname: Backbone identifier. Only ``'resnet50'`` is implemented.
        output_file_path: Destination pickle file; its parent directory is
            created if it does not exist.

    Raises:
        ValueError: If ``modelname`` is not a supported backbone.

    Side effects:
        Writes ``{image_name: feature_array}`` to ``output_file_path`` and prints
        a confirmation line.

    Note:
        ``process_folder`` moves images to the notebook-level ``device`` global,
        not to the local ``device`` resolved here; both are expected to come
        from ``get_device()``.
    """
    # Fail fast with a clear message; previously an unknown name left `model`
    # as None and crashed later with an AttributeError on `model.eval()`.
    if modelname != 'resnet50':
        raise ValueError(
            f"Unsupported modelname {modelname!r}; only 'resnet50' is implemented"
        )

    device = get_device()

    backbone = resnet50(weights=models.ResNet50_Weights.DEFAULT)
    # Drop the last child module (ResNet's final fully connected classifier) so
    # the network outputs pooled features instead of class logits.
    model = torch.nn.Sequential(*(list(backbone.children())[:-1]))
    model.eval()  # inference mode for batch-norm layers

    for param in model.parameters():
        param.requires_grad = False

    model.to(device)

    # Preprocessing pipeline applied to each image before the forward pass.
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    features_dict = process_folder(folder_path, model, transform)

    # Create the target directory on demand, as `extract_openai_features` does.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")


In [None]:
# ResNet-50 features for the English test images.
extract_image_features('./test_images/subtask1_2a/english', 
                       output_file_path='./ImageFeatures/english_test_images_features.pkl')

In [None]:
# ResNet-50 features for the Bulgarian test images.
extract_image_features('./test_images/subtask1_2a/bulgarian', 
                       output_file_path='./ImageFeatures/bulgarian_test_images_features.pkl')

In [None]:
# ResNet-50 features for the North Macedonian test images (saved with the `nm_` prefix).
extract_image_features('./test_images/subtask1_2a/north_macedonian', 
                       output_file_path='./ImageFeatures/nm_test_images_features.pkl')

In [None]:
# ResNet-50 features for the training images.
extract_image_features('./train_images', 
                       output_file_path='./ImageFeatures/train_images_features.pkl')

In [None]:
# ResNet-50 features for the validation images.
extract_image_features('./validation_images', 
                       output_file_path='./ImageFeatures/validation_images_features.pkl')

In [None]:
# ResNet-50 features for the dev images.
extract_image_features('./dev_images', 
                       output_file_path='./ImageFeatures/dev_images_features.pkl')

In [None]:
# ResNet-50 features for the Arabic subtask-2a test images.
extract_image_features('./test_images_arabic/subtask2a', 
                       output_file_path='./ImageFeatures/ar_test_images_features.pkl')

### Extracting Textual Features

In [7]:
def extract_text_features(file_path, tokenizer, text_model, output_file_path, subtask=1):
    """Embed every text of a dataset file and pickle an ``{id: vector}`` dict.

    For each sample, the stored vector is the model's last hidden state at
    sequence position 0 (the first token), squeezed to a 1-D NumPy array.

    Args:
        file_path: Dataset JSON file, loaded through ``process_json``.
        tokenizer: Tokenizer callable returning PyTorch tensors; texts are
            truncated/padded to 128 tokens.
        text_model: Encoder whose output exposes ``last_hidden_state``.
        output_file_path: Destination pickle file; its parent directory is
            created if it does not exist.
        subtask: ``1`` selects the subtask-1 label mappings, any other value
            the subtask-2a mappings passed to ``process_json``.

    Side effects:
        Writes the pickle and prints progress every 100 samples.
    """
    if subtask == 1:
        data = process_json(file_path, techniques_to_level_1, hierarchy_1)
    else:
        data = process_json(file_path, techniques_to_level_2a, hierarchy_subtask_2a)

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` global that may be stale or undefined.
    model_device = next(text_model.parameters()).device

    features_dict = {}
    for step, (sample_id, text) in enumerate(zip(data['id'], data['cleaned_text']), start=1):
        encoded_input = tokenizer(text, return_tensors='pt', add_special_tokens=True,
                                  max_length=128, truncation=True,
                                  padding='max_length').to(model_device)

        with torch.no_grad():
            embeddings = text_model(**encoded_input)

        # Keep only the first-token representation of the single-sample batch.
        features_dict[sample_id] = embeddings.last_hidden_state[:, 0, :].cpu().squeeze().numpy()

        if step % 100 == 0:
            print(f'completed {step} steps')

    # Create the target directory on demand, as `extract_openai_features` does.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

### mBERT for subtask-1

In [8]:
# Extract mBERT text features for every subtask-1 split and pickle them under
# ./TextFeatures/subtask1a/mBERT/.
from transformers import BertTokenizer, BertModel
device = get_device()

# NOTE(review): this cell loads the *uncased* multilingual checkpoint, while the
# subtask-2a mBERT cell loads the *cased* one — confirm the difference is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
text_model = BertModel.from_pretrained("bert-base-multilingual-uncased").to(device)


# NOTE(review): `dir` shadows the built-in `dir()` for the rest of the kernel session.
dir = './TextFeatures/subtask1a/mBERT/'

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = dir + 'bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = dir + 'en_dev_text_features.pkl'


# These two calls run BEFORE `process_json` is redefined below, so they use
# whichever `process_json` is already in the kernel (presumably the one from
# the `utils` star imports on a fresh kernel — confirm).
extract_text_features(train_input, tokenizer, text_model, train_output)
extract_text_features(val_input, tokenizer, text_model, val_output)

# From here on `process_json` is replaced by a lighter loader; statement order
# in this cell therefore matters.
def process_json(file_path, techniques_to_level, hierarchy):
    """Load a JSON dataset into a DataFrame and add a ``cleaned_text`` column.

    ``techniques_to_level`` and ``hierarchy`` are accepted only to match the
    call made by ``extract_text_features``; they are not used here. A ``link``
    column is dropped when present.
    """
    data_df = pd.read_json(file_path)
    data_df['cleaned_text'] = data_df['text'].apply(replace_newlines_with_fullstop)
    if 'link' in data_df.columns:
        data_df.drop(columns=['link'], inplace=True)
    
    return data_df

# Test and dev splits are processed with the redefined loader above.
extract_text_features(test_en_input, tokenizer, text_model, test_en_output)
extract_text_features(test_md_input, tokenizer, text_model, test_md_output)
extract_text_features(test_ar_input, tokenizer, text_model, test_ar_output)
extract_text_features(test_bg_input, tokenizer, text_model, test_bg_output)
extract_text_features(dev_en_input, tokenizer, text_model, dev_en_output)

Using MPS
completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in ./TextFeatures/subtask1a/mBERT/en_dev_text_features.pkl


### mBERT for subtask 2a

In [None]:
# Extract mBERT (cased checkpoint) text features for the subtask-2a splits and
# pickle them under ./TextFeatures/subtask2a/mBERT/.
# NOTE(review): `device` is not set in this cell; it must already exist in the
# kernel from an earlier cell.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
text_model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)


# NOTE(review): `dir` shadows the built-in `dir()` for the rest of the kernel session.
dir = './TextFeatures/subtask2a/mBERT/'

train_input = './semeval2024_dev_release/subtask2a/train.json'
train_output = dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask2a/validation.json'
val_output = dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask2a_test_unlabeled.json'
test_en_output = dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask2a_md.json'
test_md_output = dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask2a_ar.json'
test_ar_output = dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask2a_bg.json'
test_bg_output = dir + 'bg_test_text_features.pkl'


# Train/validation run with whichever `process_json` is in the kernel at this
# point; the redefinition below only affects the calls that follow it.
extract_text_features(train_input, tokenizer, text_model, train_output, subtask=2)
extract_text_features(val_input, tokenizer, text_model, val_output, subtask=2)


def process_json(file_path, techniques_to_level, hierarchy):
    """Load a JSON dataset into a DataFrame, add ``cleaned_text`` and empty level columns.

    ``techniques_to_level`` and ``hierarchy` are accepted only to match the
    call made by ``extract_text_features``; they are not used here. A ``link``
    column is dropped when present, and ``Level 1`` .. ``Level 5`` are filled
    with a fresh empty list per row.
    """
    data_df = pd.read_json(file_path)
    data_df['cleaned_text'] = data_df['text'].apply(replace_newlines_with_fullstop)
    if 'link' in data_df.columns:
        data_df.drop(columns=['link'], inplace=True)

    for level in ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']:
        data_df[level] = pd.Series([[] for _ in range(len(data_df))], index=data_df.index)
    
    return data_df

# Test splits are processed with the redefined loader above.
extract_text_features(test_en_input, tokenizer, text_model, test_en_output, subtask=2)
extract_text_features(test_md_input, tokenizer, text_model, test_md_output, subtask=2)
extract_text_features(test_ar_input, tokenizer, text_model, test_ar_output, subtask=2)
extract_text_features(test_bg_input, tokenizer, text_model, test_bg_output, subtask=2)

### XLM-RoBERTa Features for subtask 1

In [9]:
# Extract xlm-roberta-large text features for the subtask-1 splits and pickle
# them under ./TextFeatures/subtask1a/XLM-RoBERTa/.
# NOTE(review): `device` is not set in this cell; it must already exist in the kernel.
from transformers import XLMRobertaTokenizer, XLMRobertaModel

model_name = 'xlm-roberta-large'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# NOTE(review): the encoder is bound to the generic name `model`, which other
# cells in this notebook rebind to different objects.
model = XLMRobertaModel.from_pretrained(model_name).to(device)

# NOTE(review): `dir` shadows the built-in `dir()` for the rest of the kernel session.
dir = './TextFeatures/subtask1a/XLM-RoBERTa/'

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = dir + 'bg_test_text_features.pkl'

dev_en_input = './semeval2024_dev_release/subtask1/dev_unlabeled.json'
dev_en_output = dir + 'en_dev_text_features.pkl'

# Train/validation run with whichever `process_json` is in the kernel at this
# point; the redefinition below only affects the calls that follow it.
extract_text_features(train_input, tokenizer, model, train_output)
extract_text_features(val_input, tokenizer, model, val_output)

def process_json(file_path, techniques_to_level, hierarchy):
    """Load a JSON dataset into a DataFrame, add ``cleaned_text`` and empty level columns.

    ``techniques_to_level`` and ``hierarchy`` are accepted only to match the
    call made by ``extract_text_features``; they are not used here. A ``link``
    column is dropped when present, and ``Level 1`` .. ``Level 5`` are filled
    with a fresh empty list per row.
    """
    data_df = pd.read_json(file_path)
    data_df['cleaned_text'] = data_df['text'].apply(replace_newlines_with_fullstop)
    if 'link' in data_df.columns:
        data_df.drop(columns=['link'], inplace=True)

    for level in ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']:
        data_df[level] = pd.Series([[] for _ in range(len(data_df))], index=data_df.index)

    return data_df


# Test and dev splits are processed with the redefined loader above.
extract_text_features(test_en_input, tokenizer, model, test_en_output)
extract_text_features(test_md_input, tokenizer, model, test_md_output)
extract_text_features(test_ar_input, tokenizer, model, test_ar_output)
extract_text_features(test_bg_input, tokenizer, model, test_bg_output)
extract_text_features(dev_en_input, tokenizer, model, dev_en_output)

completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in ./TextFeatures/subtask1a/XLM-RoBERTa/en_dev_text_features.pkl


In [14]:
import torch
from transformers import XLNetModel, XLNetTokenizer

# Smoke test: run one sentence through xlnet-large-cased (on CPU, the model is
# not moved to a device here) and inspect the shape of the final hidden states.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased')

# Encode some text
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# The final-layer hidden states are exposed as an attribute of the model output.
last_hidden_states = outputs.last_hidden_state

# Printed shape is [batch_size, sequence_length, hidden_size].
print(last_hidden_states.shape)


  return self.fget.__get__(instance, owner)()


torch.Size([1, 9, 1024])


In [None]:
def extract_text_features(file_path, tokenizer, text_model, output_file_path, subtask=1):
    """Embed every text of a dataset file with mean pooling and pickle ``{id: vector}``.

    This definition replaces the earlier ``extract_text_features`` in this
    notebook (which kept only the first-token hidden state): here the vector
    is the attention-mask-weighted mean of the last hidden states, so padding
    positions do not contribute.

    Args:
        file_path: Dataset JSON file, loaded through ``process_json``.
        tokenizer: Tokenizer callable returning PyTorch tensors including an
            ``attention_mask``; texts are truncated/padded to 128 tokens.
        text_model: Encoder whose output exposes ``last_hidden_state``.
        output_file_path: Destination pickle file; its parent directory is
            created if it does not exist.
        subtask: ``1`` selects the subtask-1 label mappings, any other value
            the subtask-2a mappings passed to ``process_json``.

    Side effects:
        Writes the pickle and prints progress every 100 samples.
    """
    if subtask == 1:
        data = process_json(file_path, techniques_to_level_1, hierarchy_1)
    else:
        data = process_json(file_path, techniques_to_level_2a, hierarchy_subtask_2a)

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` global that may be stale or undefined.
    model_device = next(text_model.parameters()).device

    features_dict = {}
    for step, (sample_id, text) in enumerate(zip(data['id'], data['cleaned_text']), start=1):
        inputs = tokenizer(text, return_tensors='pt', add_special_tokens=True,
                           max_length=128, truncation=True,
                           padding='max_length').to(model_device)

        with torch.no_grad():
            last_hidden_states = text_model(**inputs).last_hidden_state

        # Broadcast the mask over the hidden dimension so padded positions are zeroed.
        mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_hidden_states * mask_expanded, 1)
        # Clamp guards against division by zero for an all-padding sequence.
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_pooled_embeddings = sum_embeddings / sum_mask
        features_dict[sample_id] = mean_pooled_embeddings.cpu().squeeze().numpy()

        if step % 100 == 0:
            print(f'completed {step} steps')

    # Create the target directory on demand, as `extract_openai_features` does.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

In [19]:
# Extract xlnet-large-cased text features for the subtask-1 splits and pickle
# them under ./TextFeatures/subtask1a/XLNet/.
# NOTE(review): which `extract_text_features` runs here depends on which
# definition cell was executed last in the kernel — confirm before re-running.
import torch
from transformers import XLNetModel, XLNetTokenizer

device = get_device()
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
text_model = XLNetModel.from_pretrained('xlnet-large-cased').to(device)


# NOTE(review): `dir` shadows the built-in `dir()` for the rest of the kernel session.
dir = './TextFeatures/subtask1a/XLNet/'

train_input = './semeval2024_dev_release/subtask1/train.json'
train_output = dir + 'train_text_features.pkl'

val_input = './semeval2024_dev_release/subtask1/validation.json'
val_output = dir + 'validation_text_features.pkl'

test_en_input = './test_data/english/en_subtask1_test_unlabeled.json'
test_en_output = dir + 'en_test_text_features.pkl'

test_md_input = './test_labels_ar_bg_md_version2/test_subtask1_md.json'
test_md_output = dir + 'md_test_text_features.pkl'

test_ar_input = './test_labels_ar_bg_md_version2/test_subtask1_ar.json'
test_ar_output = dir + 'ar_test_text_features.pkl'

test_bg_input = './test_labels_ar_bg_md_version2/test_subtask1_bg.json'
test_bg_output = dir + 'bg_test_text_features.pkl'

# Train/validation run with whichever `process_json` is in the kernel at this
# point; the redefinition below only affects the calls that follow it.
extract_text_features(train_input, tokenizer, text_model, train_output)
extract_text_features(val_input, tokenizer, text_model, val_output)

def process_json(file_path, techniques_to_level, hierarchy):
    """Load a JSON dataset into a DataFrame, add ``cleaned_text`` and empty level columns.

    ``techniques_to_level`` and ``hierarchy`` are accepted only to match the
    call made by ``extract_text_features``; they are not used here. A ``link``
    column is dropped when present, and ``Level 1`` .. ``Level 5`` are filled
    with a fresh empty list per row.
    """
    data_df = pd.read_json(file_path)
    data_df['cleaned_text'] = data_df['text'].apply(replace_newlines_with_fullstop)
    if 'link' in data_df.columns:
        data_df.drop(columns=['link'], inplace=True)

    for level in ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']:
        data_df[level] = pd.Series([[] for _ in range(len(data_df))], index=data_df.index)
    
    return data_df
# NOTE(review): the English test split is disabled below, so no
# en_test_text_features.pkl is written for XLNet by this cell.
# extract_text_features(test_en_input, tokenizer, text_model, test_en_output)
extract_text_features(test_md_input, tokenizer, text_model, test_md_output)
extract_text_features(test_ar_input, tokenizer, text_model, test_ar_output)
extract_text_features(test_bg_input, tokenizer, text_model, test_bg_output)

Using MPS
completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
completed 1100 steps
completed 1200 steps
completed 1300 steps
completed 1400 steps
completed 1500 steps
completed 1600 steps
completed 1700 steps
completed 1800 steps
completed 1900 steps
completed 2000 steps
completed 2100 steps
completed 2200 steps
completed 2300 steps
completed 2400 steps
completed 2500 steps
completed 2600 steps
completed 2700 steps
completed 2800 steps
completed 2900 steps
completed 3000 steps
completed 3100 steps
completed 3200 steps
completed 3300 steps
completed 3400 steps
completed 3500 steps
completed 3600 steps
completed 3700 steps
completed 3800 steps
completed 3900 steps
completed 4000 steps
completed 4100 steps
completed 4200 steps
completed 4300 steps
completed 4400 steps
completed 4500 steps
completed 4600 steps
completed 4700 steps
completed 48

### Extracting CLIP+ViT

In [None]:
import torch
import clip
from PIL import Image

# Load the CLIP ViT-B/32 model and its matching image preprocessing pipeline.
# Prefer Apple's MPS backend (the original hard-coded choice) but fall back to
# CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model, preprocess = clip.load('ViT-B/32', device=device)


In [None]:
# Sanity check: encode a single test image with CLIP and print the feature shape.
# Relies on `model`, `preprocess` and `device` from the CLIP loading cell.


image_path = 'test_images/subtask1_2a/english/prop_meme_2.png'
image = Image.open(image_path)

# Preprocess the image, add a batch dimension and move it to the model's device
image = preprocess(image).unsqueeze(0).to(device)

with torch.no_grad():
    # Encode image using the CLIP model
    image_features = model.encode_image(image)

    # L2-normalise along the last dimension so each feature vector has unit norm
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

# Shape of the feature vector once size-1 dimensions are dropped
print(image_features.squeeze().shape)  


In [None]:
def extract_clip_vit_features(folder_path, output_file_path, device=None):
    """Encode every image in a folder with CLIP ViT-B/32 and pickle the features.

    Args:
        folder_path: Directory to scan (not recursive). Only regular files whose
            name ends in ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.gif``
            (case-insensitive) are processed.
        output_file_path: Destination pickle file; its parent directory is
            created if it does not exist.
        device: Optional torch device (or device string) to run on. When
            omitted, MPS is used if available (the original behaviour),
            otherwise CUDA, otherwise CPU.

    Side effects:
        Writes ``{image_name: unit-norm feature array}`` to ``output_file_path``
        and prints a confirmation line.
    """
    if device is None:
        # The original hard-coded 'mps', which fails on machines without it.
        if torch.backends.mps.is_available():
            device = torch.device('mps')
        elif torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')

    model, preprocess = clip.load('ViT-B/32', device=device)

    # Require the dot so that names such as "listpng" are not mistaken for images.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    features_dict = {}
    for image_name in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, image_name)
        if not (os.path.isfile(image_path) and image_name.lower().endswith(image_suffixes)):
            continue

        # Preprocess while the file is open, then release the handle right away.
        with Image.open(image_path) as handle:
            image = preprocess(handle).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image)
            # L2-normalise so each stored vector has unit norm.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        features_dict[image_name] = image_features.cpu().squeeze().numpy()

    # Create the target directory on demand, as `extract_openai_features` does.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

In [None]:
# CLIP ViT-B/32 features for the English test images.
extract_clip_vit_features('./test_images/subtask1_2a/english', './ImageFeatures/CLIP-ViT/english_test_images_features.pkl')

In [None]:
# CLIP ViT-B/32 features for the Bulgarian test images.
extract_clip_vit_features('./test_images/subtask1_2a/bulgarian',
                          './ImageFeatures/CLIP-ViT/bulgarian_test_images_features.pkl')

In [None]:
# CLIP ViT-B/32 features for the North Macedonian test images (saved with the `nm_` prefix).
extract_clip_vit_features('./test_images/subtask1_2a/north_macedonian',
                          './ImageFeatures/CLIP-ViT/nm_test_images_features.pkl')

In [None]:
# CLIP ViT-B/32 features for the Arabic subtask-2a test images.
extract_clip_vit_features('./test_images_arabic/subtask2a',
                          './ImageFeatures/CLIP-ViT/ar_test_images_features.pkl')

In [None]:
# CLIP ViT-B/32 features for the training images.
extract_clip_vit_features('./train_images', 
                          './ImageFeatures/CLIP-ViT/train_images_features.pkl')

In [None]:
# CLIP ViT-B/32 features for the validation images.
extract_clip_vit_features('./validation_images',
                          './ImageFeatures/CLIP-ViT/validation_images_features.pkl')

### Extracting features from OpenAI's 3rd generation embedding models

In [10]:
from openai import OpenAI

# Smoke test: request one embedding to confirm the API is reachable.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is committed with the notebook.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
# NOTE(review): this rebinds the notebook-level name `model`, which earlier
# cells assign to torch/CLIP model objects.
model = "text-embedding-3-small"
emb = client.embeddings.create(input=['Hello Every One'], model=model).data[0].embedding

In [11]:
def process_json(file_path, techniques_to_level=None, hierarchy=None):
    """Load a JSON dataset into a DataFrame prepared for feature extraction.

    Adds a ``cleaned_text`` column (``text`` passed through
    ``replace_newlines_with_fullstop``), drops a ``link`` column when present,
    and adds ``Level 1`` .. ``Level 5`` columns holding a fresh empty list per row.

    Args:
        file_path: Path of the JSON file to read.
        techniques_to_level: Unused. Accepted so the three-argument calls made by
            ``extract_text_features`` keep working after this cell redefines
            ``process_json``.
        hierarchy: Unused; accepted for the same reason.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    data_df = pd.read_json(file_path)
    data_df['cleaned_text'] = data_df['text'].apply(replace_newlines_with_fullstop)
    if 'link' in data_df.columns:
        data_df = data_df.drop(columns=['link'])

    # One distinct list object per row; a shared list would alias across rows.
    for level in ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']:
        data_df[level] = pd.Series([[] for _ in range(len(data_df))], index=data_df.index)

    return data_df

In [12]:
import os

def extract_openai_features(file_path, model, output_file_path, task):
    """Embed every text of a dataset file with an OpenAI embedding model.

    Args:
        file_path: Dataset JSON file, loaded through ``process_json``.
        model: Name of the OpenAI embedding model, e.g. ``'text-embedding-3-small'``.
        output_file_path: File name of the pickle written inside
            ``TextFeatures/{task}/{model}``.
        task: Sub-directory name for the task, e.g. ``'subtask1a'``.

    Returns:
        dict mapping each sample id to a ``float32`` NumPy embedding. Samples
        whose request failed get an all-zero vector of the same length as the
        successful embeddings.

    Side effects:
        Creates the output directory, calls the OpenAI API once per sample,
        writes the pickle and prints progress every 100 samples. The API key is
        read from the ``OPENAI_API_KEY`` environment variable.
    """
    directory_path = f'TextFeatures/{task}/{model}'
    os.makedirs(directory_path, exist_ok=True)

    # Never hard-code credentials in the notebook; take the key from the environment.
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

    data = process_json(file_path)

    features_dict = {}
    failed_ids = []
    for step, (sample_id, text) in enumerate(zip(data['id'], data['cleaned_text']), start=1):
        try:
            response = client.embeddings.create(input=[text], model=model)
            features_dict[sample_id] = np.array(response.data[0].embedding, dtype=np.float32)
        except Exception as exc:
            # Best effort: keep going, but report why the request failed. A bare
            # `except:` would also have swallowed KeyboardInterrupt.
            print(f'exception for id: {sample_id} and text : {text}')
            print(f'  reason: {exc!r}')
            # Placeholder keeps the original insertion order; filled in below.
            features_dict[sample_id] = None
            failed_ids.append(sample_id)

        if step % 100 == 0:
            print(f'completed {step} steps')

    if failed_ids:
        # The zero fallback must match the model's embedding length; a fixed
        # 1536 is wrong for text-embedding-3-large. Prefer the length observed
        # in a successful response, then the documented defaults.
        default_dims = {'text-embedding-3-small': 1536, 'text-embedding-3-large': 3072}
        observed = next((vec.shape[0] for vec in features_dict.values() if vec is not None), None)
        dim = observed if observed is not None else default_dims.get(model, 1536)
        for sample_id in failed_ids:
            features_dict[sample_id] = np.zeros(dim, dtype=np.float32)

    with open(f'{directory_path}/{output_file_path}', 'wb') as f:
        pickle.dump(features_dict, f)

    print(f"Features extracted and stored in {output_file_path}")

    return features_dict

#### text-embedding-3-large for subtask-1

In [0]:
# text-embedding-3-large embeddings for the subtask-1 validation split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/validation.json', 
                            model='text-embedding-3-large',
                            output_file_path='validation_text_features.pkl', 
                            task='subtask1a')

In [None]:
# text-embedding-3-large embeddings for the subtask-1 training split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/train.json', 
                            model='text-embedding-3-large',
                            output_file_path='train_text_features.pkl', 
                            task='subtask1a')

In [None]:
# text-embedding-3-large embeddings for the subtask-1 English test split.
d = extract_openai_features(file_path='./test_data/english/en_subtask1_test_unlabeled.json', 
                            model='text-embedding-3-large',
                            output_file_path='en_test_text_features.pkl',
                            task='subtask1a')

In [None]:
# text-embedding-3-large embeddings for the subtask-1 `bg` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_bg.json', 
                            model='text-embedding-3-large',
                            output_file_path='bg_test_text_features.pkl', 
                            task='subtask1a')

In [None]:
# text-embedding-3-large embeddings for the subtask-1 `md` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_md.json', 
                            model='text-embedding-3-large',
                            output_file_path='md_test_text_features.pkl', 
                            task='subtask1a')

In [None]:
# text-embedding-3-large embeddings for the subtask-1 `ar` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_ar.json', 
                            model='text-embedding-3-large',
                            output_file_path='ar_test_text_features.pkl', 
                            task='subtask1a')

In [13]:
# text-embedding-3-large embeddings for the subtask-1 unlabeled dev split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/dev_unlabeled.json',
                            model='text-embedding-3-large',
                            output_file_path='en_dev_text_features.pkl', 
                            task='subtask1a')

completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in en_dev_text_features.pkl


In [14]:
# NOTE(review): this cell requests text-embedding-3-small although it sits under
# the text-embedding-3-large heading; the same dev-split call appears again in
# the small-model section — confirm whether this one is a leftover.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/dev_unlabeled.json',
                            model='text-embedding-3-small',
                            output_file_path='en_dev_text_features.pkl', 
                            task='subtask1a')

completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in en_dev_text_features.pkl


#### text-embedding-3-small for subtask-1

In [0]:
# text-embedding-3-small embeddings for the subtask-1 validation split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/validation.json', 
                            model='text-embedding-3-small',
                            output_file_path='validation_text_features.pkl', 
                            task='subtask1a')

In [0]:
# text-embedding-3-small embeddings for the subtask-1 training split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/train.json', 
                            model='text-embedding-3-small',
                            output_file_path='train_text_features.pkl', 
                            task='subtask1a')

In [0]:
# text-embedding-3-small embeddings for the subtask-1 English test split.
d = extract_openai_features(file_path='./test_data/english/en_subtask1_test_unlabeled.json', 
                            model='text-embedding-3-small',
                            output_file_path='en_test_text_features.pkl',
                            task='subtask1a')

In [0]:
# text-embedding-3-small embeddings for the subtask-1 `bg` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_bg.json', 
                            model='text-embedding-3-small',
                            output_file_path='bg_test_text_features.pkl', 
                            task='subtask1a')

In [0]:
# text-embedding-3-small embeddings for the subtask-1 `md` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_md.json', 
                            model='text-embedding-3-small',
                            output_file_path='md_test_text_features.pkl', 
                            task='subtask1a')

In [0]:
# text-embedding-3-small embeddings for the subtask-1 `ar` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask1_ar.json', 
                            model='text-embedding-3-small',
                            output_file_path='ar_test_text_features.pkl', 
                            task='subtask1a')

In [13]:
# text-embedding-3-small embeddings for the subtask-1 unlabeled dev split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/dev_unlabeled.json',
                            model='text-embedding-3-small',
                            output_file_path='en_dev_text_features.pkl', 
                            task='subtask1a')

completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in en_dev_text_features.pkl


In [14]:
# NOTE(review): exact repeat of the preceding dev-split cell (same arguments),
# so it only rewrites the same pickle — confirm whether it can be removed.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask1/dev_unlabeled.json',
                            model='text-embedding-3-small',
                            output_file_path='en_dev_text_features.pkl', 
                            task='subtask1a')

completed 100 steps
completed 200 steps
completed 300 steps
completed 400 steps
completed 500 steps
completed 600 steps
completed 700 steps
completed 800 steps
completed 900 steps
completed 1000 steps
Features extracted and stored in en_dev_text_features.pkl


### Extract text features using text-embedding-3-small for subtask 2a

In [None]:
# text-embedding-3-small embeddings for the subtask-2a training split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask2a/train.json', 
                            model='text-embedding-3-small',
                            output_file_path='train_text_features.pkl', 
                            task='subtask2a')

In [None]:
# text-embedding-3-small embeddings for the subtask-2a validation split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask2a/validation.json', 
                            model='text-embedding-3-small',
                            output_file_path='validation_text_features.pkl', 
                            task='subtask2a')

In [None]:
# text-embedding-3-small embeddings for the subtask-2a English test split.
d = extract_openai_features(file_path='./test_data/english/en_subtask2a_test_unlabeled.json', 
                            model='text-embedding-3-small',
                            output_file_path='en_test_text_features.pkl',
                            task='subtask2a')

In [None]:
# text-embedding-3-small embeddings for the subtask-2a `bg` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_bg.json', 
                            model='text-embedding-3-small',
                            output_file_path='bg_test_text_features.pkl', 
                            task='subtask2a')

In [None]:
# text-embedding-3-small embeddings for the subtask-2a `md` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_md.json', 
                            model='text-embedding-3-small',
                            output_file_path='md_test_text_features.pkl', 
                            task='subtask2a')

In [None]:
# text-embedding-3-small embeddings for the subtask-2a `ar` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_ar.json', 
                            model='text-embedding-3-small',
                            output_file_path='ar_test_text_features.pkl', 
                            task='subtask2a')

### Extract text features using text-embedding-3-large for subtask 2a

In [0]:
# text-embedding-3-large embeddings for the subtask-2a training split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask2a/train.json', 
                            model='text-embedding-3-large',
                            output_file_path='train_text_features.pkl', 
                            task='subtask2a')

In [0]:
# text-embedding-3-large embeddings for the subtask-2a validation split.
d = extract_openai_features(file_path='./semeval2024_dev_release/subtask2a/validation.json', 
                            model='text-embedding-3-large',
                            output_file_path='validation_text_features.pkl', 
                            task='subtask2a')

In [0]:
# text-embedding-3-large embeddings for the subtask-2a English test split.
d = extract_openai_features(file_path='./test_data/english/en_subtask2a_test_unlabeled.json', 
                            model='text-embedding-3-large',
                            output_file_path='en_test_text_features.pkl',
                            task='subtask2a')

In [0]:
# text-embedding-3-large embeddings for the subtask-2a `bg` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_bg.json', 
                            model='text-embedding-3-large',
                            output_file_path='bg_test_text_features.pkl', 
                            task='subtask2a')

In [0]:
# text-embedding-3-large embeddings for the subtask-2a `md` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_md.json', 
                            model='text-embedding-3-large',
                            output_file_path='md_test_text_features.pkl', 
                            task='subtask2a')

In [0]:
# text-embedding-3-large embeddings for the subtask-2a `ar` test split.
d = extract_openai_features(file_path='./test_labels_ar_bg_md_version2/test_subtask2a_ar.json', 
                            model='text-embedding-3-large',
                            output_file_path='ar_test_text_features.pkl', 
                            task='subtask2a')