In [None]:
pip install transformers torch pandas scikit-learn jieba

Text Preprocessing Function

In [None]:
import pandas as pd
import re
import jieba
from sklearn.model_selection import train_test_split

# Read the train / dev / test annotation tables from the Kaggle input directory.
train_csv = pd.read_csv('/kaggle/input/misogyny-meme-detection/train.csv')
dev_csv = pd.read_csv('/kaggle/input/misogyny-meme-detection/dev.csv')
test_csv = pd.read_csv('/kaggle/input/misogyny-meme-detection/test.csv')

# Hand-picked Chinese stopwords that preprocess_text filters out of the transcriptions.
chinese_stopwords = {
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人",
    "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
    "你", "会", "着", "没有", "看", "好", "自己", "这",
}


# Preprocess function for Chinese text
def preprocess_text(text, stopwords):
    """Clean a raw transcription and return its jieba tokens joined by spaces.

    Steps: strip URLs, drop punctuation/special characters, drop digits,
    segment with ``jieba.lcut``, then discard stopwords, whitespace-only
    tokens and every token of length 1.

    Args:
        text: Raw transcription. ``None`` and NaN (what pandas produces for an
            empty CSV field) are treated as empty text; any other non-string
            value is converted with ``str()``.
        stopwords: Container of tokens to discard.

    Returns:
        str: The kept tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove URLs, special characters, etc.
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Nothing but whitespace left: no need to run the tokenizer.
    if not text.strip():
        return ''

    # Tokenize using jieba
    words = jieba.lcut(text)
    # Remove stopwords, single-character tokens and whitespace-only tokens
    words = [
        word for word in words
        if len(word) > 1 and word.strip() and word not in stopwords
    ]
    return ' '.join(words)

# Apply preprocessing to the 'transcriptions' column of every split.
# fillna('') covers transcriptions that are empty in the CSV: pandas reads them
# as NaN, which the regex calls inside preprocess_text cannot handle.
for _split_df in (train_csv, dev_csv, test_csv):
    _split_df['transcriptions'] = (
        _split_df['transcriptions']
        .fillna('')
        .astype(str)
        .apply(lambda x: preprocess_text(x, chinese_stopwords))
    )

# Map string labels to integers
labels_map = {
    "Misogyny": 1,
    "Not-Misogyny": 0
}
for _split_name, _split_df in (("train", train_csv), ("dev", dev_csv)):
    _mapped = _split_df['labels'].map(labels_map)
    # Series.map turns every value that is not a key of labels_map into NaN.
    # Fail here with a readable message instead of later inside the loss.
    if _mapped.isna().any():
        _unknown = _split_df.loc[_mapped.isna(), 'labels'].unique().tolist()
        raise ValueError(
            f"Unmapped label values in {_split_name} split: {_unknown}. "
            "Expected only 'Misogyny' / 'Not-Misogyny' "
            "(if this cell was already run, reload the CSVs first)."
        )
    _split_df['labels'] = _mapped.astype(int)

# Verify the labels
print("Train labels:", train_csv['labels'].unique())
print("Dev labels:", dev_csv['labels'].unique())

Image Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

In [None]:
import cv2
import os
import pandas as pd
from PIL import Image
import numpy as np
import imghdr
import re

def load_images_to_dataframe(image_dir, size=(224, 224), alpha=1.2, beta=20):
    """Load, resize and brightness/contrast-adjust every image in a directory.

    Args:
        image_dir: Directory that is scanned (non-recursively) for image files.
        size: Target ``(width, height)`` passed to the resize call.
        alpha: Contrast gain passed to ``cv2.convertScaleAbs``.
        beta: Brightness offset passed to ``cv2.convertScaleAbs``.

    Returns:
        pandas.DataFrame with columns ``image_name`` (file name including its
        extension) and ``image_data`` (RGB array after resizing and
        adjustment). The columns exist even when no image could be loaded, so
        a later merge on ``image_name`` does not fail with a KeyError.

    Files that cannot be read by either OpenCV or PIL are reported on stdout
    and skipped.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    data = []

    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(image_dir)):
        # Check if it's an image file
        if not file_name.lower().endswith(image_extensions):
            continue

        file_path = os.path.join(image_dir, file_name)
        try:
            # IMREAD_COLOR always yields a 3-channel 8-bit BGR array.
            # IMREAD_UNCHANGED could return 2-D grayscale or 16-bit data, which
            # made the colour conversion below fail and the image get skipped.
            # NOTE(review): unlike IMREAD_UNCHANGED, IMREAD_COLOR honours EXIF
            # orientation according to the OpenCV docs -- confirm this is wanted.
            img = cv2.imread(file_path, cv2.IMREAD_COLOR)

            if img is None:
                # If OpenCV fails, try PIL
                print(f"OpenCV failed to read {file_name}, trying PIL...")
                with Image.open(file_path) as pil_img:
                    img = np.array(pil_img.convert("RGB").resize(size))
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
                img = cv2.resize(img, size, interpolation=cv2.INTER_AREA)

            # Apply brightness/contrast adjustment
            img = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

            # Keep the full filename including extension (e.g., 1549.jpg)
            data.append({'image_name': file_name, 'image_data': img})

        except Exception as e:
            print(f"Skipping {file_name}: {e}")

    # Create DataFrame (explicit columns so an empty result is still mergeable)
    return pd.DataFrame(data, columns=['image_name', 'image_data'])

In [None]:
# Load and preprocess the images of each split into a DataFrame with
# 'image_name' and 'image_data' columns (built by load_images_to_dataframe).
df_train = load_images_to_dataframe('/kaggle/input/misogyny-meme-detection/train-20250320T200243Z-001/train')
df_test = load_images_to_dataframe('/kaggle/input/misogyny-meme-detection/test-20250327T173652Z-001/test')
df_dev = load_images_to_dataframe('/kaggle/input/misogyny-meme-detection/dev-20250321T154338Z-001/dev')

In [None]:
# Attach the pixel data to each annotation row, matching on the file name.
train = pd.merge(train_csv, df_train, on='image_name', how='inner')
dev = pd.merge(dev_csv, df_dev, on='image_name', how='inner')
test = pd.merge(test_csv, df_test, on='image_name', how='inner')

# An inner join silently drops every annotation whose image was not loaded.
# Report that loss: for the test split it means missing submission rows.
for _name, _csv, _merged in (('train', train_csv, train), ('dev', dev_csv, dev), ('test', test_csv, test)):
    _lost = int((~_csv['image_name'].isin(_merged['image_name'])).sum())
    if _lost:
        print(f"Warning: {_lost} of {len(_csv)} {_name} rows have no loaded image and were dropped by the merge")

In [None]:
test.head()

In [None]:
from PIL import Image
from torchvision import transforms
import numpy as np
import pandas as pd

# Minority-class augmentation: every training row labelled 1 contributes two
# extra, randomly perturbed copies of its image (text and label are reused).
img_augmentations = transforms.Compose([
    transforms.ColorJitter(brightness=0.5),
    transforms.RandomGrayscale(p=0.2),
    transforms.RandomPosterize(bits=4),
])

augmented_data = []
cnt = 0  # number of label-1 rows seen

for idx, row in train.iterrows():
    image_data = row['image_data']
    text = row['transcriptions']
    labels = row['labels']
    image_name = row['image_name']

    if labels != 1:
        continue
    cnt += 1

    # Two independent random draws from the same augmentation pipeline.
    for _ in range(2):
        image = Image.fromarray(image_data.astype('uint8'))
        img_aug = np.array(img_augmentations(image))
        augmented_data.append({
            'image_data': img_aug,
            'labels': labels,
            'image_name': image_name,
            'transcriptions': text
        })

augmented_df = pd.DataFrame(augmented_data)

# Append the augmented rows after the original training rows.
train = pd.concat([train, augmented_df], ignore_index=True)

ChineseBERT+ResNet

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from PIL import Image


class MultiModalDataset(Dataset):
    """Dataset yielding (tokenized transcription, transformed image, label).

    Args:
        df: DataFrame with 'transcriptions', 'labels' and 'image_data' columns.
        tokenizer: Callable tokenizer applied to the transcription.
        image_transform: Transform applied to the PIL image built from 'image_data'.
    """

    def __init__(self, df, tokenizer, image_transform):
        # __getitem__ looks rows up with .loc (label based) while the
        # DataLoader passes positions 0..len-1; resetting the index makes the
        # two agree even when df was filtered or had rows dropped.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        label = self.df.loc[idx, 'labels']
        image = self.df.loc[idx, 'image_data']

        # return_tensors='pt' keeps a leading size-1 dim per sample; the
        # training loop removes it with squeeze(1) after batching.
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

        image = Image.fromarray(image)
        image = self.image_transform(image)

        # int() fails loudly on a NaN label, and the explicit long dtype is
        # what nn.CrossEntropyLoss expects for class-index targets even when
        # the column was stored as float.
        return encoding, image, torch.tensor(int(label), dtype=torch.long)


# Load Chinese BERT tokenizer and model
# (both come from the 'hfl/chinese-bert-wwm' checkpoint via from_pretrained)
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
bert_model = BertModel.from_pretrained('hfl/chinese-bert-wwm')

# Load ResNet-50 model
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour
# of the `weights=` argument -- confirm against the installed version.
resnet_model = models.resnet50(pretrained=True)
# With fc replaced by Identity the network returns the features that used to
# feed the classifier; MultiModal below assumes there are 2048 of them.
resnet_model.fc = nn.Identity()  # Remove classification head

# Image transformation
# Resize -> tensor -> per-channel normalisation (presumably the ImageNet
# statistics the pretrained backbone expects -- TODO confirm).
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


class MultiModal(nn.Module):
    """Late-fusion classifier: pooled BERT text features + ResNet image features.

    Args:
        bert_model: Text encoder whose output exposes ``last_hidden_state``
            with 768 features per token.
        resnet_model: Image encoder returning 2048 features per image.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model, resnet_model, num_classes):
        super(MultiModal, self).__init__()
        self.bert_model = bert_model
        self.resnet_model = resnet_model
        self.fc = nn.Linear(768 + 2048, num_classes)  # 768 from BERT, 2048 from ResNet-50

    def forward(self, input_ids, attention_mask, images):
        """Return class logits of shape (batch, num_classes).

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch, seq_len).
            images: Image batch accepted by ``resnet_model``.
        """
        bert_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Masked mean pooling: average only over real tokens. A plain mean over
        # the sequence axis would also average the padding positions, which
        # dominate when inputs are padded to max_length.
        mask = attention_mask.unsqueeze(-1).to(bert_output.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 for an all-padding row
        bert_pool = (bert_output * mask).sum(dim=1) / token_counts

        resnet_output = self.resnet_model(images)

        combined_features = torch.cat((bert_pool, resnet_output), dim=1)
        output = self.fc(combined_features)
        return output


# Wrap the merged DataFrames; only the training loader is shuffled.
train_dataset = MultiModalDataset(train, tokenizer, image_transform)
dev_dataset = MultiModalDataset(dev, tokenizer, image_transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16)

# One output logit per distinct label value present in the training frame.
num_classes = len(train['labels'].unique())
model = MultiModal(bert_model, resnet_model, num_classes)

# A single Adam optimizer updates every parameter of the combined model
# (both encoders and the linear head) with the same learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(20):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # batch = (encoding, images, labels) as returned by MultiModalDataset.
        # squeeze(1) removes the size-1 dimension at position 1 of the token
        # tensors (presumably left by return_tensors='pt' in the dataset).
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean training loss over the batches of this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
# Accuracy on the dev split after the final epoch (no gradients needed).
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in dev_loader:
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask, images)
        # Index of the largest logit along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total}%")


In [None]:
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Recompute dev-set predictions and report accuracy, macro F1 and the
# per-class classification report.
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []

    for batch in dev_loader:
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask, images)
        _, predicted = torch.max(outputs, 1)

        # Move to CPU / numpy so sklearn can consume the values.
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    # Share of exact matches, as a percentage.
    accuracy = 100 * np.mean(np.array(y_true) == np.array(y_pred))
    f1 = f1_score(y_true, y_pred, average='macro')
    class_report = classification_report(y_true, y_pred)

    print(f"Accuracy: {accuracy:.2f}%")
    print(f"F1-score: {f1:.4f}")
    print("Classification Report:")
    print(class_report)

In [None]:
test

In [None]:
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import pandas as pd
import torch
import numpy as np


class TestDataset(Dataset):
    """Unlabelled dataset yielding (tokenized transcription, transformed image).

    Args:
        df: DataFrame with 'transcriptions' and 'image_data' columns.
        tokenizer: Callable tokenizer applied to the transcription.
        image_transform: Transform applied to the PIL image built from 'image_data'.
    """

    def __init__(self, df, tokenizer, image_transform):
        # .loc in __getitem__ is label based while the DataLoader passes
        # positions; a fresh 0..n-1 index keeps them equal even after rows
        # were dropped from df.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        image = self.df.loc[idx, 'image_data']

        encoding = self.tokenizer(
            text, padding='max_length', truncation=True, max_length=512, return_tensors='pt'
        )

        image = Image.fromarray(image)
        image = self.image_transform(image)

        return encoding, image

# Run the trained model over the test split in its stored row order
# (the loader is not shuffled) and collect the predicted class per row.
test_dataset = TestDataset(test, tokenizer, image_transform)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()

test_predictions = []

with torch.no_grad():
    for batch in test_loader:

        # batch = (encoding, images); there are no labels for the test split.
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        pixel_values = batch[1].to(device)

        outputs = model(input_ids, attention_mask, pixel_values)
        _, predicted = torch.max(outputs, 1)


        test_predictions.extend(predicted.cpu().numpy())


test_predictions = np.array(test_predictions)


# Adds a 'predictions' column to the shared `test` frame (in-place side effect).
test['predictions'] = test_predictions


predictions_df = pd.DataFrame({
    'image_name': test['image_name'],
    'predictions': test_predictions
})

# Full predictions table with a header row.
output_file_path = '/kaggle/working/chineseBert_ResNet_predictions.csv'
predictions_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

In [None]:
# Remove everything after the first dot (extension).
# astype(str) keeps this cell re-runnable: after a first run the column is
# numeric and the .str accessor would raise AttributeError.
test['image_name'] = test['image_name'].astype(str).str.split('.').str[0]

# Convert image names to numeric values (non-numeric names become NaN)
test['image_name'] = pd.to_numeric(test['image_name'], errors='coerce')

# Create a DataFrame for predictions
predictions_df = pd.DataFrame({
    'image_name': test['image_name'],
    'predictions': test_predictions
})

# Sort by 'image_name' numerically
predictions_df = predictions_df.sort_values(by='image_name')

# Remove the headers (columns) from the CSV
predictions_df.to_csv('/kaggle/working/chineseBert_ResNet_predictions_submission.csv', header=False, index=False)

print("Predictions saved without headers and sorted numerically by image_name")

DenseNet+ ChineseBERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from PIL import Image

class MultiModalDataset(Dataset):
    """Dataset yielding (tokenized transcription, transformed image, label).

    Args:
        df: DataFrame with 'transcriptions', 'labels' and 'image_data' columns.
        tokenizer: Callable tokenizer applied to the transcription.
        image_transform: Transform applied to the PIL image built from 'image_data'.
    """

    def __init__(self, df, tokenizer, image_transform):
        # Rows are fetched with .loc (label based) using the positional index
        # supplied by the DataLoader, so make labels equal positions.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        label = self.df.loc[idx, 'labels']
        image = self.df.loc[idx, 'image_data']

        # Tokenize text (Chinese BERT)
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Process image
        image = Image.fromarray(image)
        image = self.image_transform(image)

        # int() fails loudly on a NaN label; the explicit long dtype is what
        # nn.CrossEntropyLoss expects for class-index targets.
        return encoding, image, torch.tensor(int(label), dtype=torch.long)

# Load Chinese BERT tokenizer and model
# from_pretrained builds new objects here, re-binding the `tokenizer` and
# `bert_model` names used by the previous experiment.
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
bert_model = BertModel.from_pretrained('hfl/chinese-bert-wwm')

# Load DenseNet-121
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour
# of the `weights=` argument -- confirm against the installed version.
densenet_model = models.densenet121(pretrained=True)
# With the classifier replaced by Identity the network returns the features
# that used to feed it; MultiModalModel below assumes there are 1024 of them.
densenet_model.classifier = nn.Identity()  # Remove classification head

# Image transformation (same as ResNet)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class MultiModalModel(nn.Module):
    """Late-fusion classifier: pooled BERT text features + DenseNet image features.

    Args:
        bert_model: Text encoder whose output exposes ``last_hidden_state``
            with 768 features per token.
        densenet_model: Image encoder returning 1024 features per image.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model, densenet_model, num_classes):
        super(MultiModalModel, self).__init__()
        self.bert_model = bert_model
        self.densenet_model = densenet_model

        # BERT hidden size = 768, DenseNet-121 output = 1024
        self.fc = nn.Linear(768 + 1024, num_classes)

    def forward(self, input_ids, attention_mask, images):
        """Return class logits of shape (batch, num_classes).

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch, seq_len).
            images: Image batch accepted by ``densenet_model``.
        """
        # Text features (BERT)
        bert_output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state

        # Masked mean pooling: average only over real tokens. A plain mean over
        # the sequence axis would also average the padding positions, which
        # dominate when inputs are padded to max_length.
        mask = attention_mask.unsqueeze(-1).to(bert_output.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 for an all-padding row
        bert_pooled = (bert_output * mask).sum(dim=1) / token_counts

        # Image features (DenseNet)
        densenet_features = self.densenet_model(images)

        # Combine features
        combined = torch.cat((bert_pooled, densenet_features), dim=1)
        output = self.fc(combined)
        return output

# Example usage (assuming `train` and `dev` DataFrames exist)
# One output logit per distinct label value present in the training frame.
num_classes = len(train['labels'].unique())
model = MultiModalModel(bert_model, densenet_model, num_classes)

# Training setup
# A single Adam optimizer updates every parameter of the combined model
# (both encoders and the linear head) with the same learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# DataLoaders
# Only the training loader is shuffled.
train_dataset = MultiModalDataset(train, tokenizer, image_transform)
dev_dataset = MultiModalDataset(dev, tokenizer, image_transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16)

# Training loop
for epoch in range(20):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # batch = (encoding, images, labels) as returned by MultiModalDataset.
        # squeeze(1) removes the size-1 dimension at position 1 of the token
        # tensors (presumably left by return_tensors='pt' in the dataset).
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean training loss over the batches of this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
# Accuracy on the dev split after the final epoch (no gradients needed).
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in dev_loader:
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask, images)
        # Index of the largest logit along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total}%")

In [None]:
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Recompute dev-set predictions and report accuracy, macro / weighted F1 and
# the per-class classification report.
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []

    for batch in dev_loader:
        # Drop the size-1 dimension at position 1 of the token tensors and
        # move every tensor of the batch to the model's device.
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask, images)
        _, predicted = torch.max(outputs, 1)  # Get class indices

        # Ensure tensors are CPU-side for sklearn
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred) * 100  # Fraction of exact matches, as a percentage
    f1_macro = f1_score(y_true, y_pred, average='macro')  # Unweighted mean of the per-class F1 scores
    f1_weighted = f1_score(y_true, y_pred, average='weighted')  # Per-class F1 weighted by class support
    class_report = classification_report(y_true, y_pred, digits=4)  # 4 decimal places

    print(f"Accuracy: {accuracy:.2f}%")
    print(f"F1-score (Macro): {f1_macro:.4f}")
    print(f"F1-score (Weighted): {f1_weighted:.4f}")
    print("Detailed Classification Report:")
    print(class_report)

In [None]:
test

In [None]:
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import pandas as pd
import torch
import numpy as np
import os  # For path handling

class TestDataset(Dataset):
    def __init__(self, df, tokenizer, image_transform):
        self.df = df.reset_index(drop=True)  # Ensure consistent indexing
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        image = self.df.loc[idx, 'image_data']
--int(f"Predictions saved to {output_path}")

In [None]:
# NOTE(review): `test_predictions` must hold the DenseNet model's predictions;
# the inference cell above appears truncated in this notebook -- confirm.

# First ensure the column contains strings by converting it
test['image_name'] = test['image_name'].astype(str)

# Now safely remove everything after the first dot
test['image_name'] = test['image_name'].str.split('.').str[0]

# Convert to numeric (non-numeric names become NaN)
test['image_name'] = pd.to_numeric(test['image_name'], errors='coerce')

# Pair ids with predictions BEFORE dropping anything: `test_predictions` has
# one entry per current row of `test`, so dropping rows from `test` first
# would leave the two with different lengths.
predictions_df = pd.DataFrame({
    'image_name': test['image_name'],
    'predictions': test_predictions
})

# Drop any rows that couldn't be converted to numbers
predictions_df = predictions_df.dropna(subset=['image_name'])
predictions_df['image_name'] = predictions_df['image_name'].astype(int)

# Keep `test` consistent with the rows that were written; the fresh index lets
# later datasets address its rows by position.
test = test.dropna(subset=['image_name']).reset_index(drop=True)

# Sort numerically
predictions_df = predictions_df.sort_values('image_name')

# Save without headers
predictions_df.to_csv('/kaggle/working/chineseBert_DenseNet_predictions_submission.csv',
                     header=False,
                     index=False)

DenseNet+XLM-R

In [None]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import f1_score


class MultiModalDataset(Dataset):
    """Dataset yielding (tokenized transcription, transformed image, label).

    Args:
        df: DataFrame with 'transcriptions', 'labels' and 'image_data' columns.
        tokenizer: Callable tokenizer applied to the transcription.
        image_transform: Transform applied to the PIL image built from 'image_data'.
    """

    def __init__(self, df, tokenizer, image_transform):
        # Rows are fetched with .loc (label based) using the positional index
        # supplied by the DataLoader, so make labels equal positions.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        label = self.df.loc[idx, 'labels']
        image = self.df.loc[idx, 'image_data']

        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        image = Image.fromarray(image)
        image = self.image_transform(image)

        # int() fails loudly on a NaN label; the explicit long dtype is what
        # nn.CrossEntropyLoss expects for class-index targets.
        return encoding, image, torch.tensor(int(label), dtype=torch.long)


# Load XLM-R tokenizer and model
# Re-binds `tokenizer`, so datasets built from here on tokenize with the
# 'xlm-roberta-base' vocabulary instead of the Chinese BERT one.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')

# Load DenseNet model
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour
# of the `weights=` argument -- confirm against the installed version.
densenet_model = models.densenet121(pretrained=True)
# With the classifier replaced by Identity the network returns the features
# that used to feed it; MultiModal below assumes there are 1024 of them.
densenet_model.classifier = nn.Identity()  # Remove classification head

# Image transformation
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


class MultiModal(nn.Module):
    """Late-fusion classifier: pooled XLM-R text features + DenseNet image features.

    Args:
        xlmr_model: Text encoder whose output exposes ``last_hidden_state``
            with 768 features per token.
        densenet_model: Image encoder returning 1024 features per image.
        num_classes: Number of output logits.
    """

    def __init__(self, xlmr_model, densenet_model, num_classes):
        super(MultiModal, self).__init__()
        self.xlmr_model = xlmr_model
        self.densenet_model = densenet_model
        self.fc = nn.Linear(768 + 1024, num_classes)  # 768 from XLM-R, 1024 from DenseNet-121

    def forward(self, input_ids, attention_mask, images):
        """Return class logits of shape (batch, num_classes).

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch, seq_len).
            images: Image batch accepted by ``densenet_model``.
        """
        xlmr_output = self.xlmr_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Masked mean pooling: average only over real tokens. A plain mean over
        # the sequence axis would also average the padding positions, which
        # dominate when inputs are padded to max_length.
        mask = attention_mask.unsqueeze(-1).to(xlmr_output.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 for an all-padding row
        xlmr_pool = (xlmr_output * mask).sum(dim=1) / token_counts

        densenet_output = self.densenet_model(images)

        combined_features = torch.cat((xlmr_pool, densenet_output), dim=1)
        output = self.fc(combined_features)
        return output
# Wrap the merged DataFrames; only the training loader is shuffled.
train_dataset = MultiModalDataset(train, tokenizer, image_transform)
dev_dataset = MultiModalDataset(dev, tokenizer, image_transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16)

# One output logit per distinct label value present in the training frame.
num_classes = len(train['labels'].unique())
model = MultiModal(xlmr_model, densenet_model, num_classes)

# A single Adam optimizer updates every parameter of the combined model
# (both encoders and the linear head) with the same learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(20):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # batch = (encoding, images, labels) as returned by MultiModalDataset.
        # squeeze(1) removes the size-1 dimension at position 1 of the token
        # tensors (presumably left by return_tensors='pt' in the dataset).
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean training loss over the batches of this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
# Accuracy and macro F1 on the dev split after the final epoch.
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    correct = 0
    total = 0
    for batch in dev_loader:
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask, images)
        # Index of the largest logit along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Keep CPU copies for the sklearn metric below.
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    accuracy = 100 * correct / total
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy}%")
    print(f"Macro F1 Score: {f1}")

In [None]:
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Recompute dev-set predictions and report accuracy, macro F1 and the
# per-class classification report.
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []

    for batch in dev_loader:
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        images = batch[1].to(device)
        labels = batch[2].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
        # Index of the largest logit along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, dim=1)

        # Store true and predicted labels
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    # Compute accuracy and F1-score
    accuracy = 100 * np.mean(np.array(y_true) == np.array(y_pred))
    f1 = f1_score(y_true, y_pred, average='macro')
    class_report = classification_report(y_true, y_pred)

    # Print results
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"F1-score: {f1:.4f}")
    print("Classification Report:")
    print(class_report)

In [None]:
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import pandas as pd
import torch
import numpy as np


class TestDataset(Dataset):
    """Unlabelled dataset yielding (tokenized transcription, transformed image).

    Args:
        df: DataFrame with 'transcriptions' and 'image_data' columns.
        tokenizer: Callable tokenizer applied to the transcription.
        image_transform: Transform applied to the PIL image built from 'image_data'.
    """

    def __init__(self, df, tokenizer, image_transform):
        # .loc in __getitem__ is label based while the DataLoader passes
        # positions; an earlier cell may have dropped rows from `test`, which
        # leaves gaps in its index, so rebuild a 0..n-1 index here.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, 'transcriptions']
        image = self.df.loc[idx, 'image_data']

        encoding = self.tokenizer(
            text, padding='max_length', truncation=True, max_length=512, return_tensors='pt'
        )

        image = Image.fromarray(image)
        image = self.image_transform(image)

        return encoding, image

# Run the trained model over the test split in its stored row order
# (the loader is not shuffled) and collect the predicted class per row.
test_dataset = TestDataset(test, tokenizer, image_transform)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()

test_predictions = []

with torch.no_grad():
    for batch in test_loader:

        # batch = (encoding, images); there are no labels for the test split.
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        pixel_values = batch[1].to(device)

        outputs = model(input_ids, attention_mask, pixel_values)
        _, predicted = torch.max(outputs, 1)


        test_predictions.extend(predicted.cpu().numpy())


test_predictions = np.array(test_predictions)


# Overwrites/creates the 'predictions' column of the shared `test` frame.
test['predictions'] = test_predictions


predictions_df = pd.DataFrame({
    'image_name': test['image_name'],
    'predictions': test_predictions
})

# Full predictions table with a header row.
output_file_path = '/kaggle/working/xlmr_denseNet.csv'
predictions_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

In [None]:
# Remove everything after the first dot (extension).
# Earlier submission cells already converted this column to numbers, so cast
# to str first; the .str accessor raises AttributeError on numeric data.
test['image_name'] = test['image_name'].astype(str).str.split('.').str[0]

# Convert image names to numeric values (non-numeric names become NaN)
test['image_name'] = pd.to_numeric(test['image_name'], errors='coerce')

# Create a DataFrame for predictions
predictions_df = pd.DataFrame({
    'image_name': test['image_name'],
    'predictions': test_predictions
})

# Sort by 'image_name' numerically
predictions_df = predictions_df.sort_values(by='image_name')

# Remove the headers (columns) from the CSV.
# Written to its own file: the previous path reused the ChineseBERT+ResNet
# submission name and silently overwrote that model's results.
predictions_df.to_csv('/kaggle/working/xlmr_denseNet_predictions_submission.csv', header=False, index=False)

print("Predictions saved without headers and sorted numerically by image_name")