In [1]:
import pathlib
import pandas as pd

In [2]:
from datasets import load_dataset, DatasetDict

In [3]:
from torchtext.data.utils import get_tokenizer

In [4]:
class Tokenizer:
    def __init__(self, hash_size, n_hashes):
        self.hash_size = hash_size 
        self.n_hashes = n_hashes
        self.splitter = get_tokenizer('basic_english')
        self.vocab_size = hash_size * n_hashes
    
    def __call__(self, text):
        integers = []
        words = self.splitter(text)
        for h in range(self.n_hashes):
            integers += [hash(w + 'a'*h) % self.hash_size + h*self.hash_size for w in words]
        return integers
    
tok = Tokenizer(hash_size=200, n_hashes=4)
tok("hello there")

[184, 110, 333, 343, 424, 437, 721, 790]

In [29]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, name='silicone', subset='dyda_da', split='train'):
        self.dataset = load_dataset(name, subset)
        if isinstance(self.dataset, DatasetDict):
            self.dataset = self.dataset[split]
        self.labels = list(set(i['Label'] for i in self.dataset))
        self.name = f"{name}-{subset}-{split}"

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return tok(item['Utterance']), item['Label']

In [30]:
import torch 
from torch.utils.data import DataLoader, Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offset_list = [], [], [0]
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(_text, dtype=torch.int64)
        text_list.append(processed_text)
        offset_list.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)
    offset_list = torch.tensor(offset_list[:-1]).cumsum(dim=0)
    return label_list.to(device), text_list.to(device), offset_list.to(device)

In [31]:
from torch import nn

class TextEmbedder(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TextEmbedder, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.embed_dim = embed_dim
        self.fc1 = nn.Linear(embed_dim, embed_dim)
        self.fc2 = nn.Linear(embed_dim, embed_dim)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc1(self.fc1(embedded))

class TextClassifier(nn.Module):
    def __init__(self, embedder, n_classes):
        super(TextClassifier, self).__init__()
        self.emb = embedder
        self.fc = nn.Linear(embedder.embed_dim, n_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.emb(text, offsets)
        return self.fc(embedded)

In [32]:
my_datasets = [
    TextDataset('silicone', 'dyda_da'),
    TextDataset('silicone', 'dyda_e'),
    TextDataset('silicone', 'meld_e')
]

Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/dyda_e/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/meld_e/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


In [33]:
[d.dataset for d in my_datasets]

[Dataset({
     features: ['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'Label', 'Idx'],
     num_rows: 87170
 }),
 Dataset({
     features: ['Utterance', 'Emotion', 'Dialogue_ID', 'Label', 'Idx'],
     num_rows: 87170
 }),
 Dataset({
     features: ['Utterance', 'Speaker', 'Emotion', 'Dialogue_ID', 'Utterance_ID', 'Label', 'Idx'],
     num_rows: 9989
 })]

In [38]:
models = []

embedding_model = TextEmbedder(vocab_size=tok.vocab_size, embed_dim=25)

for dataset in my_datasets:
    loader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=collate_batch)
    clf = TextClassifier(embedder=embedding_model, n_classes=len(dataset.labels))
    opt = torch.optim.SGD(clf.parameters(), lr=0.001)
    models.append({
        'dataset': dataset, 'loader': loader, 'clf': clf, 'optimizer': opt, 'name': dataset.name
    })

In [39]:
[m['name'] for m in models]

['silicone-dyda_da-train', 'silicone-dyda_e-train', 'silicone-meld_e-train']

In [45]:
len([len(i[0]) for i in model['loader']])

341

In [36]:
import datetime as dt 
import time 
import torch 

criterion = torch.nn.CrossEntropyLoss()

def evaluate(model):
    model['clf'].eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(model['loader']):
            pred = model['clf'](text, offsets)
            loss = criterion(pred, label)
            total_acc += (pred.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


for epoch in range(50):
    start_time = time.time()
    log_interval = 10
    
    for model in models:
        for idx, (label, text, offsets) in enumerate(model['loader']):
            model['optimizer'].zero_grad()
            pred = model['clf'](text, offsets)
            loss = criterion(pred, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model['clf'].parameters(), 0.1)
            model['optimizer'].step()
        
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(f"{dt.datetime.now()} - {round(elapsed)}s - {model['dataset'].name} - {epoch}")
            start_time = time.time()

2021-08-27 17:54:01.364481 - 9.611907005310059 - silicone-dyda_da-train
2021-08-27 17:54:10.959883 - 9.59531044960022 - silicone-dyda_e-train
2021-08-27 17:54:21.635354 - 9.559451580047607 - silicone-dyda_da-train
2021-08-27 17:54:31.148694 - 9.513250350952148 - silicone-dyda_e-train
2021-08-27 17:54:41.841879 - 9.570300579071045 - silicone-dyda_da-train
2021-08-27 17:54:51.379153 - 9.537185430526733 - silicone-dyda_e-train
2021-08-27 17:55:02.484405 - 9.988139629364014 - silicone-dyda_da-train
2021-08-27 17:55:12.435085 - 9.950585126876831 - silicone-dyda_e-train
2021-08-27 17:55:23.256850 - 9.645428895950317 - silicone-dyda_da-train
2021-08-27 17:55:33.031660 - 9.77453088760376 - silicone-dyda_e-train
2021-08-27 17:55:43.756710 - 9.611250638961792 - silicone-dyda_da-train
2021-08-27 17:55:53.373357 - 9.616554260253906 - silicone-dyda_e-train
2021-08-27 17:56:04.159429 - 9.660074949264526 - silicone-dyda_da-train


KeyboardInterrupt: 