In [1]:
import pathlib
import pandas as pd

from torchtext.data.utils import get_tokenizer
from datasets import load_dataset, DatasetDict

In [2]:
class Tokenizer:
    def __init__(self, hash_size, n_hashes):
        self.hash_size = hash_size 
        self.n_hashes = n_hashes
        self.splitter = get_tokenizer('basic_english')
        self.vocab_size = hash_size * n_hashes
    
    def __call__(self, text):
        integers = []
        words = self.splitter(text)
        for h in range(self.n_hashes):
            integers += [hash(w + 'a'*h) % self.hash_size + h*self.hash_size for w in words]
        return integers
    
tok = Tokenizer(hash_size=200, n_hashes=4)
tok("hello there")

[47, 57, 264, 290, 539, 499, 723, 606]

In [3]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, name='silicone', subset='dyda_da', split='train'):
        self.dataset = load_dataset(name, subset)
        if isinstance(self.dataset, DatasetDict):
            self.dataset = self.dataset[split]
        self.labels = list(set(i['Label'] for i in self.dataset))
        self.name = f"{name}-{subset}-{split}"

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return tok(item['Utterance']), item['Label']

In [4]:
import torch 
from torch.utils.data import DataLoader, Dataset

torch.set_num_threads(6)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offset_list = [], [], [0]
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(_text, dtype=torch.int64)
        text_list.append(processed_text)
        offset_list.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)
    offset_list = torch.tensor(offset_list[:-1]).cumsum(dim=0)
    return label_list.to(device), text_list.to(device), offset_list.to(device)

In [65]:
from torch import nn

class TextEmbedder(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TextEmbedder, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.embed_dim = embed_dim
        self.fc1 = nn.Linear(embed_dim, embed_dim)
        self.relu1 = nn.Tanh()
        self.fc2 = nn.Linear(embed_dim, embed_dim)
        self.relu2 = nn.Tanh()
        self.init_weights()

    def init_weights(self):
        initrange = 0.20
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.relu2(self.fc2(self.relu1(self.fc1(embedded))))

class TextClassifier(nn.Module):
    def __init__(self, embedder, n_classes):
        super(TextClassifier, self).__init__()
        self.emb = embedder
        self.fc = nn.Linear(embedder.embed_dim, n_classes)
        self.act = nn.LogSoftmax()
        self.init_weights()

    def init_weights(self):
        initrange = 0.02
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.emb(text, offsets)
        return self.act(self.fc(embedded))

In [66]:
my_datasets = [
    TextDataset('silicone', 'dyda_da'),
#     TextDataset('silicone', 'dyda_e'),
#     TextDataset('silicone', 'meld_e')
]

Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


In [67]:
[d.dataset for d in my_datasets]

[Dataset({
     features: ['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'Label', 'Idx'],
     num_rows: 87170
 })]

In [68]:
models = []

embedding_model = TextEmbedder(vocab_size=tok.vocab_size, embed_dim=256)

for dataset in my_datasets:
    loader = DataLoader(dataset, batch_size=8192, shuffle=True, collate_fn=collate_batch)
    clf = TextClassifier(embedder=embedding_model, n_classes=len(dataset.labels))
    opt = torch.optim.SGD(clf.parameters(), lr=0.01)
    models.append({
        'dataset': dataset, 'loader': loader, 'clf': clf, 'optimizer': opt, 'name': dataset.name
    })

In [69]:
{model['name']: model['dataset'].labels for model in models}

{'silicone-dyda_da-train': [0, 1, 2, 3]}

In [70]:
# len([i for i in models[0]['loader']])

In [72]:
import datetime as dt 
import time 
import torch 
import numpy as np
from rich.console import Console 

console = Console()
criterion = torch.nn.CrossEntropyLoss()

def evaluate(model):
    model['clf'].eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(model['loader']):
            pred = model['clf'](text, offsets)
            total_acc += (pred.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

for epoch in range(50):
    for model in models:
        # indices = np.random.randint(len(model['loader']), size=5)
        for idx, (label, text, offsets) in enumerate(model['loader']):
            #if idx in indices:
            model['optimizer'].zero_grad()
            pred = model['clf'](text, offsets)
            loss = criterion(pred, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model['clf'].parameters(), 0.1)
            model['optimizer'].step()
        print(pred)
    
    console.log({model["name"]: evaluate(model) for model in models})



tensor([[-1.4163, -1.3874, -1.3551, -1.3873],
        [-1.4046, -1.4013, -1.3671, -1.3727],
        [-1.3989, -1.4115, -1.3588, -1.3768],
        ...,
        [-1.4008, -1.3987, -1.3706, -1.3755],
        [-1.4209, -1.4311, -1.3497, -1.3465],
        [-1.3963, -1.3881, -1.3657, -1.3953]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4074, -1.4054, -1.3461, -1.3876],
        [-1.4236, -1.3967, -1.3524, -1.3739],
        [-1.3947, -1.4025, -1.3657, -1.3827],
        ...,
        [-1.4106, -1.4171, -1.3445, -1.3746],
        [-1.4126, -1.4118, -1.3488, -1.3734],
        [-1.4092, -1.3945, -1.3657, -1.3763]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4016, -1.4186, -1.3320, -1.3953],
        [-1.4076, -1.4024, -1.3473, -1.3890],
        [-1.4131, -1.4076, -1.3425, -1.3837],
        ...,
        [-1.4113, -1.3974, -1.3543, -1.3830],
        [-1.4049, -1.4146, -1.3386, -1.3889],
        [-1.4267, -1.4014, -1.3610, -1.3577]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4204, -1.4165, -1.3302, -1.3808],
        [-1.4228, -1.4003, -1.3458, -1.3779],
        [-1.3987, -1.4123, -1.3389, -1.3969],
        ...,
        [-1.4195, -1.4002, -1.3442, -1.3829],
        [-1.4278, -1.4055, -1.3402, -1.3738],
        [-1.4132, -1.4512, -1.3018, -1.3852]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4277, -1.4056, -1.3417, -1.3722],
        [-1.4316, -1.4487, -1.3032, -1.3683],
        [-1.4438, -1.4042, -1.3463, -1.3539],
        ...,
        [-1.4084, -1.4087, -1.3439, -1.3856],
        [-1.4259, -1.4111, -1.3329, -1.3779],
        [-1.4225, -1.4168, -1.3259, -1.3830]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4386, -1.4070, -1.3146, -1.3892],
        [-1.4297, -1.4429, -1.3046, -1.3741],
        [-1.4442, -1.4090, -1.3195, -1.3768],
        ...,
        [-1.4420, -1.4282, -1.3168, -1.3633],
        [-1.4436, -1.4445, -1.2808, -1.3853],
        [-1.4348, -1.4296, -1.3235, -1.3616]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4627, -1.4332, -1.3053, -1.3518],
        [-1.4470, -1.4103, -1.3165, -1.3760],
        [-1.4351, -1.4121, -1.3298, -1.3715],
        ...,
        [-1.4374, -1.4340, -1.2883, -1.3929],
        [-1.4610, -1.4194, -1.3243, -1.3465],
        [-1.4394, -1.4325, -1.3069, -1.3722]], grad_fn=<LogSoftmaxBackward>)


tensor([[-1.4502, -1.4182, -1.3148, -1.3673],
        [-1.4377, -1.4193, -1.3364, -1.3554],
        [-1.4296, -1.4386, -1.2923, -1.3914],
        ...,
        [-1.4424, -1.4219, -1.3377, -1.3473],
        [-1.4406, -1.4256, -1.3164, -1.3675],
        [-1.4536, -1.4238, -1.3401, -1.3331]], grad_fn=<LogSoftmaxBackward>)


KeyboardInterrupt: 