# Install packages and download dataset

In [None]:
# https://huggingface.co/google-bert/bert-base-multilingual-cased

In [None]:
!pip install transformers torch
!pip install datasets

In [None]:
!pip install accelerate -U

# Build a new dataset with description and reference

In [None]:
# Download EXIST dataset
# Please manually download and extract the EXIST_dataset.zip to ./data/ directory
# The dataset should contain GPT descriptions and initial answers

In [None]:
# Download EXIST 2024 Memes Dataset
# Please manually download and extract the EXIST_2024_Memes_Dataset.zip to ./data/ directory

In [None]:
# Path to the original EXIST 2024 training annotations (a JSON object keyed by meme id).
original_training_path = './data/EXIST 2024 Memes Dataset/training/EXIST2024_training.json'

In [None]:
# GPT-generated description files, one per language; both are merged into the
# original annotations by build_datasets.
GPT_english_path = './data/EXIST_dataset/English_Meme_GPT_Description_training.json'
GPT_spanish_path = './data/EXIST_dataset/Spanish_Meme_GPT_Description_training.json'

In [None]:
import json

def _merge_descriptions(target, extra, tag):
    """Copy 'description' and 'reference' from ``extra`` into matching ``target`` entries.

    Keys of ``extra`` are file names; the part before the first '.' is used as the
    id to look up in ``target``. Ids missing from ``target`` are reported with
    ``tag`` as the message prefix and otherwise skipped.
    """
    for key, value in extra.items():
        image_id = key.split('.')[0]
        if image_id in target:
            target[image_id]['description'] = value['description']
            target[image_id]['reference'] = value['reference']
        else:
            print(f'{tag} ID {image_id} not found in the original file.')


def build_datasets(input_path_en, input_path_sp, input_path_original, output_path):
    """Merge the English and Spanish GPT descriptions into the original annotations.

    Args:
        input_path_en: JSON file mapping image file names to dicts holding
            'description' and 'reference' (English memes).
        input_path_sp: Same structure for the Spanish memes.
        input_path_original: JSON file mapping image ids to annotation dicts.
        output_path: Destination of the merged JSON file.

    Raises:
        KeyError: If a description entry lacks 'description' or 'reference'.
    """
    # Read and write explicitly as UTF-8: the output is dumped with
    # ensure_ascii=False, so relying on the platform's default encoding can
    # corrupt (or fail on) accented Spanish text on non-UTF-8 locales.
    with open(input_path_en, 'r', encoding='utf-8') as file:
        data_en = json.load(file)
    with open(input_path_sp, 'r', encoding='utf-8') as file:
        data_sp = json.load(file)
    with open(input_path_original, 'r', encoding='utf-8') as file:
        data_original = json.load(file)

    _merge_descriptions(data_original, data_en, 'EN')
    _merge_descriptions(data_original, data_sp, 'SP')

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(data_original, file, ensure_ascii=False, indent=4)

    print("Data has been merged and saved in", output_path)


In [None]:
# Merge both GPT description files into the original annotations and write the
# result to ./data/new_dataset_training.json.
build_datasets(GPT_english_path, GPT_spanish_path, original_training_path, "./data/new_dataset_training.json")

# Split the English and Spanish memes into separate training and test datasets

In [None]:
# Split the JSON file into separate English and Spanish files
def split_json_by_key(input_path, output_path_spanish, output_path_english):
    """Split a JSON object into Spanish and English files by the key's first character.

    Entries whose key starts with '1' go to the Spanish file and entries whose
    key starts with '2' go to the English file; any other entry is dropped.

    Args:
        input_path: JSON file containing one object keyed by meme id.
        output_path_spanish: Destination for the entries whose key starts with '1'.
        output_path_english: Destination for the entries whose key starts with '2'.
    """
    # Read with the same explicit UTF-8 encoding used for the outputs below, so
    # the result does not depend on the platform's default encoding.
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Classify every entry by the first character of its key.
    spanish_memes = {key: value for key, value in data.items() if key.startswith('1')}
    english_memes = {key: value for key, value in data.items() if key.startswith('2')}

    # Write each language to its own JSON file.
    for path, memes in ((output_path_spanish, spanish_memes),
                        (output_path_english, english_memes)):
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(memes, file, ensure_ascii=False, indent=4)

    print("Data has been successfully split and saved.")

In [None]:
# Split the merged dataset into one JSON file per language.
split_json_by_key('./data/new_dataset_training.json', './data/spanish_memes.json', './data/english_memes.json')

In [None]:
# Download ground truth data
# Please manually download EXIST2024_training_task4_gold_hard.json to ./data/ directory

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

# Combine the gold labels with the existing meme data
def process_data(main_json_path, ground_truth_json_path, test_size=0.2):
    """Load the meme annotations and attach the gold label to each meme.

    Args:
        main_json_path: JSON file containing one object keyed by meme id; each
            value becomes a row of the returned frame.
        ground_truth_json_path: JSON file containing a list of records with at
            least the fields 'id' and 'value'.
        test_size: Unused. Kept only so existing calls keep working; this
            function does not split the data.

    Returns:
        A DataFrame indexed by meme id holding the annotation columns plus the
        ground-truth columns, with 'value' renamed to 'gold'. Memes without a
        ground-truth record get NaN in the ground-truth columns (left join).
    """
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(main_json_path, 'r', encoding='utf-8') as file:
        main_data = json.load(file)
    with open(ground_truth_json_path, 'r', encoding='utf-8') as file:
        ground_truth_data = json.load(file)

    # One row per meme, indexed by its id (JSON object keys are always strings).
    main_df = pd.DataFrame.from_dict(main_data, orient='index')

    ground_truth_df = pd.DataFrame(ground_truth_data).rename(columns={'value': 'gold'})
    # Normalise ids to str so they align with the string index of main_df even
    # when the ground-truth file stores them as numbers; otherwise the join
    # would silently produce NaN labels for every row.
    ground_truth_df['id'] = ground_truth_df['id'].astype(str)
    ground_truth_df = ground_truth_df.set_index('id')

    # Left join: keep every meme; clashing ground-truth columns get the '_gt' suffix.
    return main_df.join(ground_truth_df, how='left', rsuffix='_gt')

In [None]:
# Call process_data to attach the gold labels.
# The English and Spanish datasets are processed separately.
spanish_df = process_data('./data/spanish_memes.json', './data/EXIST2024_training_task4_gold_hard.json')
english_df = process_data('./data/english_memes.json', './data/EXIST2024_training_task4_gold_hard.json')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
english_df.head()

In [None]:
# Preview the first rows of the Spanish frame.
spanish_df.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def clean_df(df):
    """Turn a labelled meme frame into train/test text and label arrays.

    The input frame is NOT modified, so the calling cell can be re-run safely.
    (Mapping the labels in place made a second call see 1/0 instead of
    'YES'/'NO', map everything to NaN and drop every row.)

    Args:
        df: DataFrame with the columns 'text', 'description' and 'gold', where
            'gold' holds 'YES' / 'NO'. Rows with any other 'gold' value
            (including missing values) are discarded.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by train_test_split with
        test_size=0.2 and random_state=42. X holds "<text> <description>"
        strings and y holds integer labels (YES -> 1, NO -> 0).
    """
    label_mapping = {'YES': 1, 'NO': 0}

    # Values outside the mapping become NaN; keep only the rows that mapped.
    gold = df['gold'].map(label_mapping)
    labelled = gold.notna()
    rows = df.loc[labelled]

    # Treat missing text/description as empty rather than letting astype(str)
    # turn NaN/None into the literal words "nan"/"None" inside the model input.
    text = rows['text'].fillna('').astype(str)
    description = rows['description'].fillna('').astype(str)
    combined_text = text + " " + description

    # Features and integer labels.
    X = combined_text.values
    y = gold[labelled].values.astype(int)

    # Fixed seed so the split is reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# Fine-tuning using mBERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import EarlyStoppingCallback

In [None]:
import numpy as np

try:
    # Legacy metric handles, kept only in case other cells still reference them.
    # `load_metric` is deprecated in favour of the separate `evaluate` package and
    # is missing from recent `datasets` releases, so its absence must not break
    # this cell; compute_metrics below no longer depends on it.
    from datasets import load_metric

    f1_metric = load_metric("f1")
    accuracy_metric = load_metric("accuracy")
except ImportError:
    f1_metric = accuracy_metric = None


def compute_metrics(eval_pred):
    """Compute binary F1 (positive class = 1) and accuracy for the Trainer.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict with the float entries "f1" and "accuracy". F1 is 0.0 when there
        are no true positives, false positives or false negatives to count.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Binary F1 for the positive class: 2*TP / (2*TP + FP + FN).
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    return {
        "f1": float(f1),
        "accuracy": accuracy
    }

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class TextDataset(Dataset):
    """Dataset that tokenizes all texts up front and serves tensors per index.

    The tokenizer is called once on ``list(texts)`` with ``truncation=True``,
    ``padding=True`` and ``max_length=512``; its result must behave like a
    mapping from field name to an indexable sequence with one entry per text.
    """

    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=512,
        )
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [None]:
# Clean the Spanish frame and split it into train/test texts and labels.
(
    X_train_spanish,
    X_test_spanish,
    y_train_spanish,
    y_test_spanish,
) = clean_df(spanish_df)

In [None]:
# Report the size of every Spanish split.
for _label, _split in (
    ("Number of entries in X_train_spanish:", X_train_spanish),
    ("Number of entries in X_test_spanish:", X_test_spanish),
    ("Number of entries in y_train_spanish:", y_train_spanish),
    ("Number of entries in y_test_spanish:", y_test_spanish),
):
    print(_label, len(_split))

In [None]:
# Clean the English frame and split it into train/test texts and labels.
(
    X_train_english,
    X_test_english,
    y_train_english,
    y_test_english,
) = clean_df(english_df)

In [None]:
# Report the size of every English split.
for _label, _split in (
    ("Number of entries in X_train_english:", X_train_english),
    ("Number of entries in X_test_english:", X_test_english),
    ("Number of entries in y_train_english:", y_train_english),
    ("Number of entries in y_test_english:", y_test_english),
):
    print(_label, len(_split))

In [None]:
# Build the bilingual splits by appending the English arrays to the Spanish ones.
# Texts:
X_train_combined = np.concatenate([X_train_spanish, X_train_english])
X_test_combined = np.concatenate([X_test_spanish, X_test_english])

# Labels, concatenated in the same Spanish-then-English order as the texts:
y_train_combined = np.concatenate([y_train_spanish, y_train_english])
y_test_combined = np.concatenate([y_test_spanish, y_test_english])

for _label, _split in (
    ("Combined X_train size:", X_train_combined),
    ("Combined X_test size:", X_test_combined),
    ("Combined y_train size:", y_train_combined),
    ("Combined y_test size:", y_test_combined),
):
    print(_label, len(_split))

In [None]:
import numpy as np

# Persist the English splits as text files: texts as UTF-8 strings, labels as integers.
# NOTE(review): np.savetxt presumably writes one array element per line, so a text
# containing a newline would put the X file out of step with the y file — confirm
# the texts are single-line.
for _path, _values, _fmt, _encoding in (
    ('./data/english_train_X.txt', X_train_english, '%s', 'utf-8'),
    ('./data/english_test_X.txt', X_test_english, '%s', 'utf-8'),
    ('./data/english_train_y.txt', y_train_english, '%d', None),
    ('./data/english_test_y.txt', y_test_english, '%d', None),
):
    np.savetxt(_path, _values, fmt=_fmt, encoding=_encoding)

In [None]:
# Persist the Spanish splits as text files: texts as UTF-8 strings, labels as integers.
for _path, _values, _fmt, _encoding in (
    ('./data/spanish_train_X.txt', X_train_spanish, '%s', 'utf-8'),
    ('./data/spanish_test_X.txt', X_test_spanish, '%s', 'utf-8'),
    ('./data/spanish_train_y.txt', y_train_spanish, '%d', None),
    ('./data/spanish_test_y.txt', y_test_spanish, '%d', None),
):
    np.savetxt(_path, _values, fmt=_fmt, encoding=_encoding)

In [None]:
# Persist the combined (Spanish + English) splits: texts as UTF-8 strings, labels as integers.
for _path, _values, _fmt, _encoding in (
    ('./data/combined_train_X.txt', X_train_combined, '%s', 'utf-8'),
    ('./data/combined_test_X.txt', X_test_combined, '%s', 'utf-8'),
    ('./data/combined_train_y.txt', y_train_combined, '%d', None),
    ('./data/combined_test_y.txt', y_test_combined, '%d', None),
):
    np.savetxt(_path, _values, fmt=_fmt, encoding=_encoding)

In [None]:
# Load the multilingual BERT tokenizer and a two-label sequence-classification
# model from the same pretrained checkpoint.
tokenizer_mbert = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_mbert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
# Move the model to the selected device (as the last expression, this also
# displays the model in the cell output).
model_mbert.to(device)

### Fine-tune on the Spanish dataset

In [None]:
# Tokenize the Spanish splits with the mBERT tokenizer.
# NOTE(review): the held-out split is also the evaluation set that drives early
# stopping and best-checkpoint selection, so scores reported on it are optimistic;
# consider carving a separate validation split out of the training data.
train_dataset = TextDataset(X_train_spanish, y_train_spanish, tokenizer_mbert)
test_dataset = TextDataset(X_test_spanish, y_test_spanish, tokenizer_mbert)


training_args = TrainingArguments(
    # Run-specific directories, so checkpoints and logs written by the other
    # fine-tuning runs in this notebook cannot overwrite or mix with this run's.
    output_dir='./results/mbert_spanish',
    num_train_epochs=5,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): 500 warm-up steps may cover most (or all) of a 5-epoch run on a
    # dataset of this size — confirm against the total number of training steps.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/mbert_spanish',
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    # Stated explicitly so the model-selection criterion matches the other runs.
    metric_for_best_model='loss',
)


trainer = Trainer(
    model=model_mbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


trainer.train()

### Fine-tune on the English dataset

In [None]:
# Start again from the pretrained checkpoint: `model_mbert` may already hold the
# weights fine-tuned on Spanish by the previous section, which would silently
# turn this run into Spanish-then-English training.
model_mbert = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased', num_labels=2
).to(device)

# Tokenize the English splits with the mBERT tokenizer.
# NOTE(review): the held-out split is also used for early stopping and
# best-checkpoint selection, so scores reported on it are optimistic.
train_dataset = TextDataset(X_train_english, y_train_english, tokenizer_mbert)
test_dataset = TextDataset(X_test_english, y_test_english, tokenizer_mbert)

training_args = TrainingArguments(
    # Run-specific directories, so checkpoints and logs written by the other
    # fine-tuning runs in this notebook cannot overwrite or mix with this run's.
    output_dir='./results/mbert_english',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): 500 warm-up steps may cover most (or all) of a 5-epoch run on a
    # dataset of this size — confirm against the total number of training steps.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/mbert_english',
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
)

trainer = Trainer(
    model=model_mbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Fine-tune using XLM-RoBERTa

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Load the XLM-RoBERTa tokenizer and a two-label sequence-classification model
# from the same pretrained checkpoint.
tokenizer_xlm = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model_xlm = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)
# Move the model to the selected device (as the last expression, this also
# displays the model in the cell output).
model_xlm.to(device)

### Fine-tune on Spanish dataset

In [None]:
# Tokenize the Spanish splits with the XLM-RoBERTa tokenizer.
# NOTE(review): the held-out split is also used for early stopping and
# best-checkpoint selection, so scores reported on it are optimistic.
train_dataset = TextDataset(X_train_spanish, y_train_spanish, tokenizer_xlm)
test_dataset = TextDataset(X_test_spanish, y_test_spanish, tokenizer_xlm)

training_args = TrainingArguments(
    # Run-specific directories, so checkpoints and logs written by the other
    # fine-tuning runs in this notebook cannot overwrite or mix with this run's.
    output_dir='./results/xlm_roberta_spanish',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # NOTE(review): with a batch size of 32, 500 warm-up steps may exceed the total
    # number of training steps, so the peak learning rate may never be reached —
    # confirm against the dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/xlm_roberta_spanish',
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
)

trainer = Trainer(
    model=model_xlm,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
trainer.train()

### Fine-tune on English dataset

In [None]:
# Start again from the pretrained checkpoint: `model_xlm` may already hold the
# weights fine-tuned on Spanish by the previous section, which would silently
# turn this run into Spanish-then-English training.
model_xlm = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base', num_labels=2
).to(device)

# Tokenize the English splits with the XLM-RoBERTa tokenizer.
# NOTE(review): the held-out split is also used for early stopping and
# best-checkpoint selection, so scores reported on it are optimistic.
train_dataset = TextDataset(X_train_english, y_train_english, tokenizer_xlm)
test_dataset = TextDataset(X_test_english, y_test_english, tokenizer_xlm)

training_args = TrainingArguments(
    # Run-specific directories, so checkpoints and logs written by the other
    # fine-tuning runs in this notebook cannot overwrite or mix with this run's.
    output_dir='./results/xlm_roberta_english',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): 500 warm-up steps may cover most (or all) of a 5-epoch run on a
    # dataset of this size — confirm against the total number of training steps.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs/xlm_roberta_english',
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
)

trainer = Trainer(
    model=model_xlm,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()