In [10]:
import torch
from torch.utils.data import Dataset
from torch_geometric.loader import DataLoader
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
import sys

sys.path.append('../src/')

from GNNs.GCNClassifier import GCNClassifier
from GNNs.GRNClassifier import GRNClassifier

from embeddings.word2vec import Word2VecEmbedding
from embeddings.sbert import SBERTEmbedding

from utils.graph_of_words import GraphOfWords
from utils.graph_to_data import GraphToData
from utils.dataset_wrapper import DatasetWrapper


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from datasets import load_dataset

# TODO: come up with different method to load the dataset; as this one doesn't work
dataset = load_dataset("Hate-speech-CNERG/hatexplain", trust_remote_code=True)


In [None]:
from collections import Counter

def get_majority_label(annotators):
    if isinstance(annotators, list) and all(isinstance(ann, dict) for ann in annotators):
        labels = [ann['label'] for ann in annotators if 'label' in ann]
        most_common = Counter(labels).most_common(1)
        return most_common[0][0] if most_common else None
    return None

# Process the dataset to extract texts and labels
# def process_dataset(split):
#     texts = []
#     labels = []
#     for example in dataset[split]:
#         label = get_majority_label(example['annotators'])
#         if label is not None:
#             text = ' '.join(example['post_tokens'])
#             texts.append(text)
#             labels.append(label)
#     return texts, labels

def process_dataset(split):
    texts, labels = [], []
    for example in dataset[split]:
        label = example.get('label')  # already an int 0, 1, or 2
        if label is not None:
            text = ' '.join(example['post_tokens'])
            texts.append(text)
            labels.append(label)
    return texts, labels

train_texts, train_labels = process_dataset('train')
val_texts, val_labels = process_dataset('validation')
test_texts, test_labels = process_dataset('test')

print(f"Train size: {len(train_texts)}")
print(f"Val size: {len(val_texts)}")
print(f"Test size: {len(test_texts)}")


Train size: 0
Val size: 0
Test size: 0


In [13]:
w2v_embedder = Word2VecEmbedding("../models/google/GoogleNews-vectors-negative300.kv", device=device)
sbert_embedder = SBERTEmbedding(device=device)
gow = GraphOfWords(embedding_model=w2v_embedder, window_size=2)
text_to_graph = GraphToData(gow)

In [14]:
from torch_geometric.loader import DataLoader

train_dataset = DatasetWrapper(train_texts, train_labels, text_to_graph)
val_dataset = DatasetWrapper(val_texts, val_labels, text_to_graph)
test_dataset = DatasetWrapper(test_texts, test_labels, text_to_graph)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

ValueError: num_samples should be a positive integer value, but got num_samples=0