## Import libraries

In [1]:
import os
import json 
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, concatenate, Dropout, GlobalAveragePooling1D
from tensorflow.keras.callbacks import LearningRateScheduler
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification, AutoTokenizer, AutoModel
import numpy as np
import cv2
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

## Install requirements

In [4]:
!pip install einops

  pid, fd = os.forkpty()


Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


## Global variables

In [10]:
# Directory prefixes for the image files. Image file names are appended to
# these strings by plain concatenation, so the trailing slash is required.
train_path = '/kaggle/input/vimmsd-dataset/training-images/train-images/'
test_path = '/kaggle/input/vimmsd-dataset/private-test-images/test-images/'
# Alternative test image directory ("dev-images"); presumably pairs with the
# public-test JSON below -- confirm before switching.
# test_path = '/kaggle/input/vimmsd-dataset/private-test-images/dev-images/'

In [11]:
# JSON metadata files (image file name, caption, label) read with read_json().
train_json_path = '/kaggle/input/vimmsd-dataset/vimmsd-train.json'
test_json_path = '/kaggle/input/vimmsd-dataset/vimmsd-private-test.json'
# Alternative metadata file for the public test split.
# test_json_path = '/kaggle/input/vimmsd-dataset/vimmsd-public-test.json'

## Training

In [9]:
# Convert json to dataframe
def read_json(filepath):
    """
    Load a JSON file shaped like ``{key: {column: value, ...}, ...}`` into a DataFrame.

    The top-level keys are discarded; every value becomes one row, in the
    order the entries appear in the file.

    Args:
        filepath: Path to a UTF-8 encoded JSON file.

    Returns:
        pandas.DataFrame with one row per top-level entry.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    # Shallow-copy each record so the frame is built from fresh dicts.
    rows = [{**record} for record in entries.values()]
    return pd.DataFrame(rows)

In [12]:
# Load the training metadata into a DataFrame and display it as the cell output.
train_df = read_json(train_json_path)
train_df

Unnamed: 0,image,caption,label
0,8ae451edcd8ebf697f8763ece249115813149c55733bf8...,Cô ấy trên mạng vs cô ấy ngoài đời =))),multi-sarcasm
1,35370ffd6c791d6f8c4ab3dd4363ed468fab41e4824ee9...,Người tâm linh giao tiếp với người thực tế :))),not-sarcasm
2,316fdd1477725b9fb1a55015ac06b68b92b50bd4303e08...,Hình như Trăng hôm nay đẹp quá mọi người ạ! 😃 ...,multi-sarcasm
3,8a0f34e0e30e4e5cfb306933c1d25fa801a5da78646b59...,MỌI NGƯỜI NGHĨ SAO VỀ PHÁT BIỂU CỦA SHARK VIỆT...,not-sarcasm
4,e517a5e95d1065886a7c815e82fe254381d4f9f4b244d4...,2 tay hai nàng chứ việc gì phải lệ hai hàng,multi-sarcasm
...,...,...,...
10800,46ce5ad52085691fc81869c82e8222c0d737b34fedc2bd...,Lộn đầu rồi,not-sarcasm
10801,f816f7152cca9c5899f818ce681cf0949c6964ea2fc5ae...,"Chào các bạn, mình là Goda Takeshi. Trong live...",not-sarcasm
10802,0accae8d37f9edc90b5f0a2f5f3cca773f5d01b5124302...,Cre: Hùynh Quốc Huy,not-sarcasm
10803,bf125e295f85c0946940b789b2ba10106b2a85b9e70d88...,Anh hùng thật sự,not-sarcasm


In [14]:
# Main model class for a combined sarcasm classification system
class CombinedSarcasmClassifier:
    """
    Multimodal sarcasm classifier built on pre-trained ViT and Jina encoders.

    The pre-trained encoders are used for inference only (no gradients). Each
    sample is turned into two feature vectors:

    * an "image" vector: the ViT classification logits concatenated with the
      max-pooled Jina embedding of the image's OCR text, and
    * a "text" vector: the max-pooled Jina embedding of the caption.

    A Keras network (see `build`) is trained on those vectors to predict one of
    the labels in `label_mapping`.
    """

    # Feature sizes used for the zero-vector fallbacks in `preprocess_data`.
    # They must agree with the `image_dim` / `text_dim` defaults of `build`
    # (1000 ViT logits + 1024 text dimensions = 2024).
    VIT_FEATURE_DIM = 1000
    TEXT_FEATURE_DIM = 1024
    IMAGE_FEATURE_DIM = VIT_FEATURE_DIM + TEXT_FEATURE_DIM

    # Base learning rate: used to compile the optimizer and as the reference
    # value for the step decay in `learning_rate_schedule`.
    initial_lr = 1e-4

    def __init__(self):
        """
        Initializes the classifier with pre-trained models for both image and text processing.
        Sets up the device and prepares the label mapping for classification.
        """
        self.model = None
        self.vit_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")  # Image processor for ViT
        self.vit_model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")  # Pre-trained ViT model

        self.jina_tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3")  # Tokenizer for Jina embeddings
        self.jina_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3",
                                                   trust_remote_code=True,
                                                   torch_dtype=torch.float32)  # Jina model for text embeddings

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Set up GPU/CPU
        self.label_mapping = {
            'image-sarcasm': 0,
            'multi-sarcasm': 1,
            'not-sarcasm': 2,
            'text-sarcasm': 3
        }

        # Move models to the appropriate device
        self.vit_model.to(self.device).to(torch.float32)
        self.jina_model.to(self.device).to(torch.float32)

    def encode_labels(self, labels):
        """
        Encodes string labels into one-hot numerical labels based on `label_mapping`.

        Raises:
            KeyError: If a label is not a key of `label_mapping`.
        """
        numerical_labels = [self.label_mapping[label] for label in labels]
        return tf.keras.utils.to_categorical(numerical_labels, num_classes=len(self.label_mapping))

    def decode_labels(self, one_hot_labels):
        """
        Decodes one-hot (or probability) rows back into string labels based on `label_mapping`.

        Each row is reduced with argmax, so predicted probability vectors are
        accepted as well as exact one-hot rows.
        """
        numerical_labels = np.argmax(one_hot_labels, axis=1)
        reverse_mapping = {v: k for k, v in self.label_mapping.items()}
        return [reverse_mapping[idx] for idx in numerical_labels]

    def build(self, image_dim=2024, text_dim=1024):
        """
        Builds the combined model with separate dense layers for image and text features,
        which are concatenated and passed through several dense layers to classify sarcasm types.

        Args:
            image_dim: Length of the image feature vector (ViT logits + OCR text embedding).
            text_dim: Length of the caption embedding vector.
        """
        # Define input layers
        image_input = Input(shape=(image_dim,), name='image_input')
        text_input = Input(shape=(text_dim,), name='text_input')

        # Image branch
        image_dense = Dense(2048, activation='relu')(image_input)
        image_dropout = Dropout(0.2)(image_dense)
        image_dense2 = Dense(1024, activation='relu')(image_dropout)
        image_dropout2 = Dropout(0.2)(image_dense2)
        image_dense3 = Dense(512, activation='relu')(image_dropout2)

        # Text branch
        text_dense = Dense(1024, activation='relu')(text_input)
        text_dropout = Dropout(0.2)(text_dense)
        text_dense2 = Dense(512, activation='relu')(text_dropout)

        # Combined branch
        combined = concatenate([image_dense3, text_dense2])
        dense_combined = Dense(1024, activation='relu')(combined)
        dropout_combined = Dropout(0.2)(dense_combined)
        dense_combined2 = Dense(512, activation='relu')(dropout_combined)
        dropout_combined2 = Dropout(0.2)(dense_combined2)
        dense_combined3 = Dense(256, activation='relu')(dropout_combined2)
        dropout_combined3 = Dropout(0.2)(dense_combined3)

        # Output layer: one unit per label so it always matches encode_labels()
        output = Dense(len(self.label_mapping), activation='softmax', name='output')(dropout_combined3)

        # Create and store the model
        self.model = Model(inputs=[image_input, text_input], outputs=output)

    def _embed_text(self, text):
        """
        Embeds one string with the Jina model and returns a 1-D numpy vector.

        The text is tokenized (truncated to 512 tokens), run through the model
        without gradients, and the last hidden state is max-pooled over the
        token axis. The result is expected to have TEXT_FEATURE_DIM entries,
        matching the zero-vector fallbacks used by `preprocess_data`.
        """
        text_inputs = self.jina_tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            jina_outputs = self.jina_model(**text_inputs)

        # Max pooling over tokens (mean pooling would be
        # jina_outputs.last_hidden_state.mean(dim=1) instead).
        return torch.max(jina_outputs.last_hidden_state, dim=1).values.squeeze().cpu().numpy()

    def preprocess_data(self, images, texts, is_test=0):
        """
        Preprocesses image and text data into numerical features using the ViT model for images
        and the Jina model for text. Handles both training and testing data.

        Args:
            images: Iterable of image file names, resolved against the global
                `train_path` (or `test_path` when `is_test` is truthy).
            texts: Iterable of captions, one per image.
            is_test: Truthy to read test images and the test OCR CSV.

        Returns:
            Tuple ``(image_features, text_features)`` of numpy arrays with one
            row per input. A sample that fails to process is reported on stdout
            and replaced by an all-zero row, so row order always matches the inputs.

        Raises:
            FileNotFoundError: If the OCR CSV for the selected split is missing.
        """
        image_features = []
        total_images = len(images)

        # Determine the correct input file path based on train/test mode
        input_csv_file_path = ("/kaggle/input/ocr-text-dsc-2024/private_test_ocr.csv" if is_test else "/kaggle/input/ocr-text-dsc-2024/train_ocr.csv")

        # Load auxiliary OCR text data
        if not os.path.exists(input_csv_file_path):
            raise FileNotFoundError(f"OCR CSV file not found at {input_csv_file_path}")
        df = pd.read_csv(input_csv_file_path)
        df["combined_text"] = df["combined_text"].fillna("").astype(str)

        # Build the name -> OCR text lookup once instead of scanning the frame
        # for every image. keep="first" mirrors taking the first matching row.
        first_rows = df.drop_duplicates(subset="image_name", keep="first")
        ocr_text_by_image = dict(zip(first_rows["image_name"], first_rows["combined_text"]))

        image_dir = test_path if is_test else train_path

        # Process images
        print("\nProcessing images:")
        for i, image in enumerate(images, 1):
            try:
                print(f"Processing image {i}/{total_images}", end='\r')
                image_path = image_dir + image
                img = cv2.imread(image_path)
                if img is None:
                    # cv2.imread signals a missing/corrupt file by returning None.
                    raise ValueError(f"could not read image file {image_path}")
                # OpenCV decodes to BGR; the ViT processor expects RGB channel order.
                # NOTE: a model saved before this fix was fitted on channel-swapped
                # features and should be retrained.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # Extract image features using ViT
                inputs = self.vit_processor(images=img, return_tensors="pt").to(self.device)
                with torch.no_grad():
                    vit_outputs = self.vit_model(**inputs)
                vit_features = vit_outputs.logits.cpu().numpy().squeeze()

                # Combine with associated OCR text features if available
                combined_text = ocr_text_by_image.get(image, "")
                if combined_text.strip():
                    ocr_features = self._embed_text(combined_text)
                else:
                    ocr_features = np.zeros(self.TEXT_FEATURE_DIM)

                image_features.append(np.concatenate([vit_features, ocr_features]))
            except Exception as e:
                # Best effort: keep row alignment by substituting a zero vector.
                print(f"\nError processing image {image}: {str(e)}")
                image_features.append(np.zeros(self.IMAGE_FEATURE_DIM))

        # Process caption independently
        print("\nProcessing texts:")
        text_features = []
        total_texts = len(texts)
        for i, text in enumerate(texts, 1):
            try:
                print(f"Processing text {i}/{total_texts}", end='\r')
                text_features.append(self._embed_text(text))
            except Exception as e:
                # Best effort: keep row alignment by substituting a zero vector.
                print(f"\nError processing text: {str(e)}")
                text_features.append(np.zeros(self.TEXT_FEATURE_DIM))

        print("\nPreprocessing completed!")
        return np.array(image_features), np.array(text_features)

    def learning_rate_schedule(self, epoch, lr):
        """
        Step-decay schedule for `LearningRateScheduler`.

        Keras passes the *current* learning rate as `lr` on every epoch, so the
        decayed value is derived from `self.initial_lr` rather than from `lr`;
        multiplying `lr` itself would compound the decay each epoch.

        Args:
            epoch: Zero-based epoch index.
            lr: Current learning rate; returned unchanged before epoch 10.

        Returns:
            `lr` for epochs 0-9, then `initial_lr` scaled by 0.1 (epochs 10-19),
            0.01 (epochs 20-34) or 0.001 (epoch 35 onwards).
        """
        if epoch < 10:
            return lr
        if epoch < 20:
            factor = 0.1
        elif epoch < 35:
            factor = 0.01
        else:
            factor = 0.001
        return self.initial_lr * factor

    def train(self, x_train_images, x_train_texts, y_train, random_state=None):
        """
        Trains the model using the provided image names, captions, and labels.
        Performs train-validation split, applies class weighting, and runs training with a learning rate scheduler.

        Args:
            x_train_images: Image file names.
            x_train_texts: Captions, aligned with `x_train_images`.
            y_train: String labels (keys of `label_mapping`), aligned with the inputs.
            random_state: Optional seed for the stratified 80/20 split; the
                default of None keeps the split non-deterministic.

        Returns:
            The Keras History object returned by `fit`.

        Raises:
            RuntimeError: If no model has been built or loaded yet.
        """
        # Fail before the expensive feature extraction, not after it.
        if self.model is None:
            raise RuntimeError("Model is not built; call build() or load() before train().")

        # Split the dataset for training and validation
        x_train_images, x_val_images, x_train_texts, x_val_texts, y_train, y_val = train_test_split(
            x_train_images, x_train_texts, y_train,
            test_size=0.2,
            stratify=y_train,
            random_state=random_state
        )

        # Compute class weights to handle imbalanced data
        present_labels = np.unique(y_train)
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=present_labels,
            y=y_train
        )
        # Key the weights by the encoded class index from label_mapping instead
        # of by position, so they stay aligned with encode_labels().
        class_weights_dict = {
            self.label_mapping[label]: float(weight)
            for label, weight in zip(present_labels, class_weights)
        }

        # Preprocess data for training and validation
        print("Starting preprocessing for training data...")
        image_features_train, text_features_train = self.preprocess_data(x_train_images, x_train_texts)
        image_features_val, text_features_val = self.preprocess_data(x_val_images, x_val_texts)

        y_train_encoded = self.encode_labels(y_train)
        y_val_encoded = self.encode_labels(y_val)

        # Compile the model with label smoothing to mitigate overconfidence
        print("\nCompiling model with label smoothing...")
        self.model.compile(
            optimizer=tf.keras.optimizers.AdamW(learning_rate=self.initial_lr),
            loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.15),
            metrics=[tf.keras.metrics.AUC()]
        )

        # Define a progress callback and learning rate scheduler
        class BatchProgressCallback(tf.keras.callbacks.Callback):
            def on_epoch_begin(self, epoch, logs=None):
                print(f"\nEpoch {epoch + 1} starting...")

            def on_batch_begin(self, batch, logs=None):
                print(f"Training batch {batch + 1}", end='\r')

        lr_scheduler = LearningRateScheduler(self.learning_rate_schedule)

        # Train the model
        print("\nStarting training...")
        history = self.model.fit(
            [image_features_train, text_features_train],
            y_train_encoded,
            validation_data=([image_features_val, text_features_val], y_val_encoded),
            epochs=35,
            batch_size=40,
            class_weight=class_weights_dict,
            callbacks=[BatchProgressCallback(), lr_scheduler]
        )

        print("\nTraining completed!")
        return history

    def predict(self, x_test_images, x_test_texts):
        """
        Predicts sarcasm categories for the given test images and texts.

        Returns:
            List of string labels, one per input sample.

        Raises:
            RuntimeError: If no model has been built or loaded yet.
        """
        if self.model is None:
            raise RuntimeError("Model is not built; call build() or load() before predict().")

        print("Preprocessing test data...")
        image_features, text_features = self.preprocess_data(x_test_images, x_test_texts, 1)
        print("Making predictions...")
        predictions = self.model.predict([image_features, text_features])
        return self.decode_labels(predictions)

    def load(self, model_file):
        """
        Loads a previously saved model from a file.
        """
        self.model = load_model(model_file)

    def save(self, model_file):
        """
        Saves the current model to a file for later use.
        """
        self.model.save(model_file)

    def summary(self):
        """
        Displays the summary of the model architecture.
        """
        self.model.summary()


In [15]:
# Pull the three columns used for training: image file names, captions, and string labels.
x_train_images = train_df['image']
x_train_texts = train_df['caption']
y_train = train_df['label']

In [16]:
x_train_images

0        8ae451edcd8ebf697f8763ece249115813149c55733bf8...
1        35370ffd6c791d6f8c4ab3dd4363ed468fab41e4824ee9...
2        316fdd1477725b9fb1a55015ac06b68b92b50bd4303e08...
3        8a0f34e0e30e4e5cfb306933c1d25fa801a5da78646b59...
4        e517a5e95d1065886a7c815e82fe254381d4f9f4b244d4...
                               ...                        
10800    46ce5ad52085691fc81869c82e8222c0d737b34fedc2bd...
10801    f816f7152cca9c5899f818ce681cf0949c6964ea2fc5ae...
10802    0accae8d37f9edc90b5f0a2f5f3cca773f5d01b5124302...
10803    bf125e295f85c0946940b789b2ba10106b2a85b9e70d88...
10804    a8bd707f12b5f47bbb42b501eb1ae896c22a474155ec0d...
Name: image, Length: 10805, dtype: object

In [17]:
# Instantiating the classifier loads the pre-trained ViT and Jina models;
# build() creates the Keras classification network and summary() prints it.
classifier = CombinedSarcasmClassifier()
classifier.build()
classifier.summary()

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- mlp.py
- xlm_padding.py
- mha.py
- block.py
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

In [18]:
# Load the test metadata into a DataFrame and display it as the cell output.
test_df = read_json(test_json_path)
test_df

Unnamed: 0,image,caption,label
0,066d6021fdfeaf39f1dec523879e8fe4d35e877abcea44...,Song Joong Ki &amp; Song Hye Kyo đều tham dự B...,
1,555f4787d4df49e7be743b3d5b77c90755f0d6c351f36b...,Song Joong Ki &amp; Song Hye Kyo đều tham dự B...,
2,7b7cdea2cde1f3f93371259b587a03f2e8c0af682b4d51...,Song Joong Ki &amp; Song Hye Kyo đều tham dự B...,
3,80167e59d729cf3aaba5d2d3da40db6995cb8a6a8c4a88...,Song Joong Ki &amp; Song Hye Kyo đều tham dự B...,
4,59db087307031d60755af3a5c01a44ba55a04bfab21027...,Ngang trái thậc 🤣,
...,...,...,...
1499,3c643826258f8aacc8a98d8e24956f909797010f1e80bd...,Mẹ biết mẹ buồn ó 🐧,
1500,09f5adf3e555d3066eae0be356e5ce797c066706e7c808...,Ủa ăn đi anh sao tự nhiên rén vậy? \n#KFCViet...,
1501,b11515c1aa521da4f6d0cd6464e5ea3e030662cdd43f7f...,Mẹ không phát hiện sớm là vài tháng nữa con Ch...,
1502,8a83634808704a5c7493327893f793effd6e78cb037ac4...,Anh em h.út ch.ích gánh còng lưng :(((,


In [19]:
# Runs the train/validation split, feature extraction, and model fitting.
# The History object returned by train() is displayed as the cell output.
classifier.train(x_train_images, x_train_texts, y_train)

Label distribution in training data:
{'image-sarcasm': 353, 'multi-sarcasm': 3379, 'not-sarcasm': 4850, 'text-sarcasm': 62}
Class weights:
image-sarcasm: 6.1218
multi-sarcasm: 0.6395
not-sarcasm: 0.4456
text-sarcasm: 34.8548
Starting preprocessing for training data...

Processing images:
Processing image 8644/8644
Processing texts:
Processing text 8644/8644
Preprocessing completed!

Processing images:
Processing image 2161/2161
Processing texts:
Processing text 2161/2161
Preprocessing completed!

Compiling model with label smoothing...

Starting training...

Epoch 1 starting...
Epoch 1/35
Training batch 1

I0000 00:00:1736831164.477173      72 service.cc:145] XLA service 0x7ed718012c60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1736831164.477226      72 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 21/217[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 8ms/step - auc: 0.5487 - loss: 1.8318

I0000 00:00:1736831174.923544      72 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 57ms/step - auc: 0.6410 - loss: 1.4799 - val_auc: 0.6414 - val_loss: 1.3400 - learning_rate: 1.0000e-04

Epoch 2 starting...
Epoch 2/35
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - auc: 0.6539 - loss: 1.4057 - val_auc: 0.7566 - val_loss: 1.2506 - learning_rate: 1.0000e-04

Epoch 3 starting...
Epoch 3/35
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - auc: 0.6183 - loss: 1.3419 - val_auc: 0.6936 - val_loss: 1.3014 - learning_rate: 1.0000e-04

Epoch 4 starting...
Epoch 4/35
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - auc: 0.6978 - loss: 1.2125 - val_auc: 0.7933 - val_loss: 1.1721 - learning_rate: 1.0000e-04

Epoch 5 starting...
Epoch 5/35
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - auc: 0.7155 - loss: 1.3071 - val_auc: 0.6334 - val_loss: 1.3838 - learning_rate: 1.0000e-04

Epoch 6 starting...


<keras.src.callbacks.history.History at 0x7ed783f77910>

## Prediction

In [20]:
# Image file names and captions of the test set, in the same row order as test_df.
x_test_images = test_df['image']
x_test_texts = test_df['caption']

In [21]:
# Extract features for the test set and decode the predictions into string labels.
predictions = classifier.predict(x_test_images, x_test_texts)

Preprocessing test data...

Processing images:
Processing image 1504/1504
Processing texts:
Processing text 1504/1504
Preprocessing completed!
Making predictions...
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [23]:
# Key every predicted label by its row position in the test set, as a string.
results = dict(
    (str(position), label) for position, label in enumerate(predictions)
)

# Output payload: the predictions plus the phase tag.
output = {"results": results, "phase": 'test'}

with open('results.json', 'w') as f:
    json.dump(output, f, indent=2)