In [32]:
import numpy as np
import pandas as pd
import os
from datasets import load_dataset

import torch
import torch.nn as nn
# from modules.lstm_encoder import LSTMEncoder

from collections import Counter



In [3]:
# Load the GoEmotions dataset through the Hugging Face `datasets` library.
# No configuration name is passed, so the library falls back to its default
# config (the logged output below reports `go_emotions/simplified`).
dataset_name = "go_emotions"
dataset = load_dataset(dataset_name)

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (/Users/heaven/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Display the loaded splits, their feature names and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [6]:
# Extract train, dev, and test sets.
# X_* holds the 'text' column and y_* the 'labels' column of each split;
# the 'validation' split is used as the dev set. The 'id' column is not used.
X_train, y_train = dataset['train']['text'], dataset['train']['labels']
X_dev, y_dev = dataset['validation']['text'], dataset['validation']['labels']
X_test, y_test = dataset['test']['text'], dataset['test']['labels']

## Cleaning the Dataset

In [36]:
# import string
# from nltk.corpus import stopwords

# X_train = [x.lower() for x in X_train]

# X_train = [''.join(ch for ch in x if ch not in string.punctuation) for x in X_train]

## Creating embeddings using GloVe

1. Load GloVe Embeddings
2. Build Vocabulary from Your Dataset
3. Create a Mapping of Words to Unique IDs

In [30]:
class GloveTokenizer:
    """Whitespace tokenizer whose vocabulary is backed by pretrained GloVe vectors.

    Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``. After ``build_vocab``
    has run, ``embeddings`` is a 2-D array in which row ``i`` is the vector of
    the word with id ``i``.

    Attributes:
        word2id: Mapping from token string to integer id.
        id2word: Inverse mapping from integer id to token string.
        word2vec: Every word found in the GloVe file mapped to its vector.
        embedding_dim: Length of the vectors read from the GloVe file.
        embeddings: Per-id vectors; a list until ``build_vocab`` is called,
            a NumPy array afterwards.
    """

    def __init__(self, glove_file_path):
        """Read the GloVe vectors and reserve the ``<pad>`` / ``<unk>`` rows.

        Args:
            glove_file_path: Path to a UTF-8 text file with one entry per line,
                formatted as ``word v1 v2 ... vN`` separated by whitespace.

        Raises:
            ValueError: If the file contains no vectors.
        """
        self.word2id = {"<pad>": 0, "<unk>": 1}
        self.id2word = {0: "<pad>", 1: "<unk>"}
        self.word2vec = {}
        self.embeddings = []

        # Load GloVe vectors
        with open(glove_file_path, 'r', encoding='utf8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Blank line (e.g. a trailing newline): nothing to parse.
                    continue
                word = values[0]
                vector = np.asarray(values[1:], dtype=np.float32)
                self.word2vec[word] = vector

        if not self.word2vec:
            raise ValueError(f"No GloVe vectors found in {glove_file_path!r}")

        # Kept on the instance because build_vocab needs it to create vectors
        # for vocabulary words that have no GloVe entry.
        self.embedding_dim = len(next(iter(self.word2vec.values())))

        # Prepopulate embeddings with pad (all zeros) and unk (random) tokens.
        # The random vector is drawn from NumPy's global RNG, so it is only
        # reproducible if the caller seeds that RNG beforehand.
        self.embeddings.append(np.zeros(self.embedding_dim))
        self.embeddings.append(np.random.rand(self.embedding_dim))

    def build_vocab(self, data, vocab_size):
        """Add the most frequent words of ``data`` to the vocabulary.

        Words are obtained with ``str.split()`` and are not lower-cased or
        otherwise normalized. A word with a GloVe vector reuses that vector;
        any other word gets a random vector of the same dimension.

        Args:
            data: Iterable of text strings.
            vocab_size: Upper bound on the total vocabulary size, counting the
                tokens already registered (including ``<pad>`` and ``<unk>``).
        """
        # Count words
        word_counts = Counter()
        for text in data:
            word_counts.update(text.split())

        # Work on a list copy so this method also works after a previous call
        # has already turned self.embeddings into an array.
        embeddings = list(self.embeddings)

        # Sort by frequency and take as many words as there are free slots.
        free_slots = max(0, vocab_size - len(self.word2id))
        for word, _ in word_counts.most_common(free_slots):
            if word in self.word2id:
                continue
            new_id = len(self.word2id)
            self.word2id[word] = new_id
            self.id2word[new_id] = word
            if word in self.word2vec:
                embeddings.append(self.word2vec[word])
            else:
                embeddings.append(np.random.rand(self.embedding_dim))

        self.embeddings = np.array(embeddings)

    def tokenize(self, text):
        """Return the ids of the whitespace-separated words in ``text``.

        Words that are not in the vocabulary map to the ``<unk>`` id.
        """
        unk_id = self.word2id["<unk>"]
        return [self.word2id.get(word, unk_id) for word in text.split()]

    def decode(self, tokens):
        """Return the words for ``tokens`` joined by single spaces.

        Raises:
            KeyError: If a token id is not in the vocabulary.
        """
        return " ".join([self.id2word[token] for token in tokens])

In [None]:
# NOTE(review): this looks like a placeholder path — point it at the actual
# GloVe text file before running this cell.
glove_file_path = "path_to_glove_vectors.txt"
tokenizer = GloveTokenizer(glove_file_path)
# The vocabulary is built from the training texts only; vocab_size counts the
# <pad> and <unk> entries as well.
tokenizer.build_vocab(X_train, vocab_size=50000)

# To tokenize: every split is converted to lists of ids using the training
# vocabulary; words outside it map to the <unk> id.
X_train_tokenized = [tokenizer.tokenize(text) for text in X_train]
X_dev_tokenized = [tokenizer.tokenize(text) for text in X_dev]
X_test_tokenized = [tokenizer.tokenize(text) for text in X_test]

In [None]:
# To decode: map the ids of the first tokenized training example back to
# words. The tokenized training data is bound to `X_train_tokenized`; the
# previously used name `tokenized_train` is not defined in this notebook.
decoded_text = tokenizer.decode(X_train_tokenized[0])