<a href="https://colab.research.google.com/github/harshalDharpure/Multimodality_Hateful_Meme/blob/main/roberat_dataset_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Adapted from the PromptHate code to work with Hindi datasets.

In [None]:
import json
import os
import pickle
import random

import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

import config

class RobertaDataset(Dataset):
    """Dataset of caption + sentence text tokenized with a RoBERTa tokenizer.

    Each entry is built from one row of ``<DATA>/domain_splits/<DATASET>_<mode>.json``
    (keys ``label``, ``img`` and ``clean_sent``) joined with the caption stored
    for that image in the captions pickle.  Unless ``opt.UNIMODAL`` is set,
    ``__getitem__`` also loads the image's precomputed features from a ``.npy``
    file.

    ``opt`` is read for: ``MODEL_NAME``, ``NUM_LABELS``, ``UNIMODAL``,
    ``FEW_SHOT``, ``NUM_SHOTS`` (only when ``FEW_SHOT`` is truthy), ``LENGTH``,
    ``DEBUG``, ``DATA``, ``DATASET``, ``CAPTION_PATH``, ``PRETRAIN_DATA`` and
    ``IMG_VERSION``.
    """

    def __init__(self, opt, dataset, mode='train', few_shot_index=0):
        """Load the tokenizer and all entries of the requested split.

        Args:
            opt: Configuration object (see the class docstring for the
                attributes that are read).
            dataset: Dataset name; ``'mem'`` selects the hateful-memes feature
                directory, any other value selects the ``harm`` one.
            mode: Split name used to build the split file name.
            few_shot_index: Few-shot iteration index; only stored and printed
                when ``opt.FEW_SHOT`` is truthy.
        """
        self.opt = opt
        self.tokenizer = RobertaTokenizer.from_pretrained(opt.MODEL_NAME)
        self.mode = mode
        self.dataset = dataset
        self.num_ans = opt.NUM_LABELS
        self.unimodal = opt.UNIMODAL

        if opt.FEW_SHOT:
            # NOTE(review): these values are recorded and printed, but nothing
            # in this class uses them to sub-sample the entries.
            self.few_shot_index = str(few_shot_index)
            self.num_shots = opt.NUM_SHOTS
            print('Few shot learning setting for Iteration:', self.few_shot_index)
            print('Number of shots:', self.num_shots)

        self.length = opt.LENGTH

        self.entries = self.load_entries(mode)
        if opt.DEBUG:
            # Keep only a small prefix so debug runs are quick.
            self.entries = self.entries[:128]

    def load_entries(self, mode):
        """Read the split file and captions and build the list of entries.

        Args:
            mode: Split name; the file read is
                ``<DATA>/domain_splits/<DATASET>_<mode>.json``.

        Returns:
            A list of dicts with keys ``cap`` (caption and sentence joined by
            ``' . '``, stripped), ``label`` and ``img``.
        """
        path = os.path.join(self.opt.DATA, 'domain_splits', f'{self.opt.DATASET}_{mode}.json')
        # Binary mode lets ``json.load`` detect the encoding itself; the
        # ``with`` block guarantees the handle is closed.
        with open(path, 'rb') as f:
            data = json.load(f)

        captions_path = os.path.join(
            self.opt.CAPTION_PATH,
            f'{self.opt.DATASET}_{self.opt.PRETRAIN_DATA}_{self.opt.IMG_VERSION}_captions.pkl',
        )
        captions = self.load_pkl(captions_path)

        # In unimodal mode the text gets an extra trailing ' . '.
        suffix = ' . ' if self.unimodal else ''

        entries = []
        for row in data:
            img = row['img']
            # Captions are keyed by the image name up to its first '.'.
            # The last character is dropped (the caption's final punctuation);
            # a missing caption yields an empty string.
            cap = captions.get(img.split('.')[0], '')[:-1]
            text = cap + ' . ' + row['clean_sent'] + suffix

            entries.append({
                'cap': text.strip(),
                'label': row['label'],
                'img': img,
            })

        return entries

    def process_tokens(self, sent):
        """Encode ``sent`` into token ids truncated/padded to ``self.length``.

        Returns:
            The value returned by ``tokenizer.encode`` (a list of input ids).
            ``encode`` returns only the ids, so ``return_attention_mask`` does
            not add a mask to the result.
        """
        tokens = self.tokenizer.encode(
            sent,
            add_special_tokens=True,
            max_length=self.length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
        )
        return tokens

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict.

        Keys: ``img`` (image file name), ``cap_tokens`` (int64 tensor of token
        ids), ``label`` (0-dim tensor), ``target`` (one-hot float32 tensor of
        length ``num_ans``) and, when not unimodal, ``feat`` (tensor built from
        the ``'features'`` array of the image's feature file).
        """
        entry = self.entries[index]
        vid = entry['img']

        label = torch.tensor(entry['label'])
        target = torch.zeros(self.num_ans, dtype=torch.float32)
        target[label] = 1.0

        # Token ids are integer indices.  ``torch.Tensor(...)`` would cast them
        # to float32, which embedding layers do not accept, so build an
        # explicit int64 tensor instead.
        tokens = torch.tensor(self.process_tokens(entry['cap']), dtype=torch.long)

        batch = {
            'img': vid,
            'cap_tokens': tokens,
            'label': label,
            'target': target,
        }

        if not self.unimodal:
            info = self.load_img_features(vid)
            batch['feat'] = torch.from_numpy(info['features'])

        return batch

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.entries)

    def load_pkl(self, path):
        """Unpickle and return the object stored at ``path``.

        NOTE: ``pickle.load`` can execute arbitrary code; only use it on
        trusted files.
        """
        # Local import keeps this method working even if the module-level
        # import list lacks ``pickle``.
        import pickle

        with open(path, 'rb') as f:
            data = pickle.load(f)
        return data

    def load_img_features(self, vid):
        """Load the pickled feature object saved for image ``vid``.

        The file name is ``vid`` up to its first '.' plus ``.npy``; the
        directory depends on whether ``self.dataset`` is ``'mem'``.

        Returns:
            The Python object stored in the ``.npy`` file (``__getitem__``
            reads its ``'features'`` entry).

        NOTE: ``allow_pickle=True`` can execute arbitrary code; only load
        trusted feature files.
        """
        stem = vid.split('.')[0]
        if self.dataset == 'mem':
            path = os.path.join(self.opt.DATA, 'multimodal-hate', 'mem', 'faster_hatefulmem_clean_36', stem + '.npy')
        else:
            path = os.path.join(self.opt.DATA, 'multimodal-hate', 'harm', 'clean_features', stem + '.npy')
        # The file holds a 0-d object array; ``.item()`` unwraps the object.
        info = np.load(path, allow_pickle=True).item()
        return info
