In [3]:
import numpy as np 
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import OneHotEncoder
from tokenwiser.pipeline import make_partial_union

In [4]:
# Load the 'dyda_da' subset of the 'silicone' dataset; `ds` is indexed by split name below.
ds = load_dataset('silicone', 'dyda_da')

Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


In [19]:
import pandas as pd 

# Stack the train and validation splits into a single frame with `text`, `label`
# and `split` columns. The bare expression is what the cell displays.
pd.concat([
    ds[hf_split].to_pandas()[['Utterance', 'Label']].assign(split=tag)
    for hf_split, tag in (('train', 'train'), ('validation', 'valid'))
]).rename(columns={'Utterance': 'text', 'Label': 'label'})

Unnamed: 0,text,label,split
0,"say , jim , how about going for a few beers af...",1,train
1,you know that is tempting but is really not go...,0,train
2,what do you mean ? it will help us to relax .,3,train
3,do you really think so ? i don't . it will jus...,3,train
4,i guess you are right.but what shall we do ? i...,3,train
...,...,...,...
8064,"oh , it must be very precious . is it breakable ?",3,valid
8065,"no , if you take some care when you use them .",2,valid
8066,how much is it ?,3,valid
8067,two thousand .,2,valid


In [50]:
class ClassificationDataset:
    """Text-classification data read from one CSV file that carries a `split` column.

    Rows whose `split` value is 'train' / 'valid' are kept as `self.train` /
    `self.valid`. Every distinct value found in the label column (across the
    whole file) is recorded in `self.labels`.
    """

    def __init__(self, path, text_col='text', label_col='label'):
        """Read `path` with pandas and partition it on the `split` column.

        Args:
            path: CSV location handed to `pd.read_csv`; also stored as `self.name`.
            text_col: name of the column holding the raw text.
            label_col: name of the column holding the class label.
        """
        frame = pd.read_csv(path)
        is_train = frame['split'] == 'train'
        is_valid = frame['split'] == 'valid'
        # reset_index() without drop keeps the original row number as an `index` column.
        self.train = frame[is_train].reset_index()
        self.valid = frame[is_valid].reset_index()
        self.labels = [*frame[label_col].unique()]
        self.text_col = text_col
        self.label_col = label_col
        self.name = path

    def _text_and_labels(self, rows):
        """Split a frame into its `(text, label)` column pair."""
        return rows[self.text_col], rows[self.label_col]

    def batch(self, n):
        """Draw `n` training rows at random (with replacement).

        Returns:
            Tuple `(texts, labels)` of pandas Series.
        """
        picks = np.random.randint(len(self.train), size=n)
        return self._text_and_labels(self.train.iloc[picks])

    def full(self, split="train"):
        """Return every row of one split as `(texts, labels)`.

        `split="train"` selects the training rows; any other value selects the
        validation rows.
        """
        if split == "train":
            return self._text_and_labels(self.train)
        return self._text_and_labels(self.valid)
    

class Batcher:
    """Turns a dataset's raw text and labels into model-ready matrices.

    Text goes through `tokeniser.transform`; labels are one-hot encoded with an
    encoder fitted on `dataset.labels`.
    """

    def __init__(self, dataset, tokeniser):
        """
        Args:
            dataset: object exposing `labels`, `batch(n=...)` and `full(split=...)`;
                the two methods return a `(texts, labels)` pair.
            tokeniser: object exposing `transform(texts)`.
        """
        self.dataset = dataset
        self.tokeniser = tokeniser
        label_arr = np.array(self.dataset.labels).reshape(-1, 1)
        self.label_enc = self._dense_one_hot_encoder().fit(label_arr)

    @staticmethod
    def _dense_one_hot_encoder():
        """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the flag `sparse_output`; `sparse` was removed in 1.4.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn only knows the original `sparse` keyword.
            return OneHotEncoder(sparse=False)

    def _encode(self, text, labs):
        """Vectorise `text` and one-hot encode `labs`, returning `(X, y)`."""
        # The encoder expects a 2-D column, hence the reshape.
        label_arr = np.array(labs).reshape(-1, 1)
        return self.tokeniser.transform(text), self.label_enc.transform(label_arr)

    def batch(self, n):
        """Return `(X, y)` for `n` examples sampled by the underlying dataset."""
        text, labs = self.dataset.batch(n=n)
        return self._encode(text, labs)

    def full(self, split="train"):
        """Return `(X, y)` for every example of `split` in the underlying dataset."""
        text, labs = self.dataset.full(split=split)
        return self._encode(text, labs)

    def transform(self, X):
        """Vectorise raw texts `X` with the tokeniser (no labels involved)."""
        # The original body referenced an undefined name `text` and raised NameError.
        return self.tokeniser.transform(X)

In [52]:
# `n_features` handed to each hashing vectoriser below.
n_feat = 20_000

# Tokeniser shared by the later cells: two hashing vectorisers combined by
# `make_partial_union`, the first with default settings and the second limited
# to 2-grams via ngram_range=(2, 2).
tok = make_partial_union(
    HashingVectorizer(n_features=n_feat), 
    HashingVectorizer(n_features=n_feat, ngram_range=(2, 2))
)

# A single Batcher over one CSV-backed dataset, using the shared tokeniser.
batcher = Batcher(dataset=ClassificationDataset("data/silicone-dyda_da.csv"), tokeniser=tok)
# batcher.batch(100)

In [53]:
class TextDataset(Dataset):
    """One split of a `load_dataset` corpus bundled with its featurisers.

    Rows are read from the `Utterance` / `Label` fields. Text is vectorised by
    two hashing vectorisers combined with `make_partial_union`; labels are
    one-hot encoded.
    """

    def __init__(self, name='silicone', subset='dyda_da', split='train', n_feat=20_000):
        """Load the data and build the text and label encoders.

        Args:
            name: dataset name forwarded to `load_dataset`.
            subset: second positional argument forwarded to `load_dataset`.
            split: key selected when `load_dataset` returns a `DatasetDict`;
                not used for selection otherwise.
            n_feat: `n_features` given to each of the two `HashingVectorizer`s.
        """
        self.dataset = load_dataset(name, subset)
        if isinstance(self.dataset, DatasetDict):
            self.dataset = self.dataset[split]
        self.labels = list({row['Label'] for row in self.dataset})
        self.name = f"{name}-{subset}-{split}"
        self.tfm = make_partial_union(
            HashingVectorizer(n_features=n_feat),
            HashingVectorizer(n_features=n_feat, ngram_range=(2, 2))
        )
        self.label_enc = self._dense_one_hot_encoder().fit(np.array(self.labels).reshape(-1, 1))

    @staticmethod
    def _dense_one_hot_encoder():
        """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the flag `sparse_output`; `sparse` was removed in 1.4.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn only knows the original `sparse` keyword.
            return OneHotEncoder(sparse=False)

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the `(utterance, label)` pair stored at row `idx`."""
        item = self.dataset[idx]
        return item['Utterance'], item['Label']

    def __repr__(self):
        return f"<TextDataset {self.name}>"

    def _featurise(self, indices):
        """Fetch the rows at `indices` and return them as an `(X, y)` matrix pair."""
        # int(...) turns numpy integers into plain Python ints before row lookup.
        texts, labels = zip(*[self[int(i)] for i in indices])
        X = self.tfm.transform(texts)
        y = self.label_enc.transform(np.array(labels).reshape(-1, 1))
        return X, y

    def batch(self, n):
        """Samples a random batch of `n` datapoints (with replacement)."""
        return self._featurise(np.random.randint(len(self), size=n))

    def full(self):
        """Returns the full set in matrix form."""
        return self._featurise(range(len(self)))

    def transform(self, texts):
        """Vectorise raw `texts` with this dataset's text featuriser."""
        return self.tfm.transform(texts)

In [72]:
# One Batcher per CSV-backed task, keyed by the CSV path (which
# ClassificationDataset stores as `.name`). Every task shares the tokeniser `tok`.
my_datasets = {
    task.name: {'dataset': Batcher(task, tokeniser=tok)}
    for task in [
        ClassificationDataset(csv_path)
        for csv_path in (
            "data/silicone-dyda_da.csv",
            "data/silicone-dyda_e.csv",
            "data/silicone-meld_e.csv",
            "data/tweet_eval-emoji.csv",
            "data/tweet_eval-emotion.csv",
        )
    ]
}

```python
fuse = (
    FUSE(tokeniser, n_tok_feat)
      .add_task(name, subset)
      .add_task(name, subset)
      .add_task(name, subset)
)
```

In [73]:
# Display the task registry: CSV path -> {'dataset': Batcher}.
my_datasets

{'data/silicone-dyda_da.csv': {'dataset': <__main__.Batcher at 0x7f3412b45750>},
 'data/silicone-dyda_e.csv': {'dataset': <__main__.Batcher at 0x7f3413222a10>},
 'data/silicone-meld_e.csv': {'dataset': <__main__.Batcher at 0x7f33781f9450>},
 'data/tweet_eval-emoji.csv': {'dataset': <__main__.Batcher at 0x7f33781f9750>},
 'data/tweet_eval-emotion.csv': {'dataset': <__main__.Batcher at 0x7f33d432bd90>}}

In [74]:
from keras.layers import Dense, Input
from keras.models import Model
import scipy
import numpy as np

# Transform one dummy string just to read off the tokeniser's output width.
X = tok.transform(["hello"])
# Shared trunk: a sparse input followed by two 256-unit ReLU layers.
inputs = Input(shape=(X.shape[1],), sparse=True)
emb1 = Dense(256, activation='relu')(inputs)
emb2 = Dense(256, activation='relu')(emb1)

# Give every task its own softmax head on top of the shared `emb2` output.
for dataset in my_datasets.values():
    # A small batch is drawn only to learn the number of label columns in `y`.
    X, y = dataset['dataset'].batch(8)
    dataset['outputs'] = Dense(y.shape[1], activation='softmax')(emb2)
    # Each per-task Model starts at the same `inputs`, so emb1/emb2 are the same
    # layer objects in every model; only the final Dense layer is task-specific.
    dataset['model'] = Model(inputs=inputs, outputs=dataset['outputs'])
    dataset['model'].compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [77]:
# The validation matrices come from the fixed "valid" split, so build them once
# up front instead of re-tokenising and re-encoding the whole split on every
# pass of the loop below.
# NOTE(review): assumes `full(split="valid")` is deterministic (no sampling) - confirm.
valid_sets = {
    task_name: task['dataset'].full(split="valid")
    for task_name, task in my_datasets.items()
}

# Round-robin training: on each outer round every task fits its own model on a
# freshly sampled batch of 2048 training rows.
for epoch in range(100):
    for task_name, task in my_datasets.items():
        X, y = task['dataset'].batch(2048)
        print(task_name)
        task['model'].fit(X, y, batch_size=128, validation_data=valid_sets[task_name], epochs=5)

data/silicone-dyda_da.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-dyda_e.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-meld_e.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/tweet_eval-emoji.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/tweet_eval-emotion.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-dyda_da.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-dyda_e.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-meld_e.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/tweet_eval-emoji.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/tweet_eval-emotion.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-dyda_da.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-dyda_e.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
data/silicone-meld_e.csv
Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [71]:
# Model that stops at `emb2`, exposing the second hidden layer's activations.
emb_model = Model(inputs=inputs, outputs=emb2)
# predict() gives one row of activations per word; transposing puts each word in
# its own column so .corr() yields the word-by-word correlation matrix.
pd.DataFrame(emb_model.predict(tok.transform(["bad", "evil", "good", "joy", "happy"]))).T.corr()

Unnamed: 0,0,1,2,3,4
0,1.0,0.608054,0.482035,0.293303,0.231563
1,0.608054,1.0,0.798869,0.687975,0.634797
2,0.482035,0.798869,1.0,0.707324,0.822916
3,0.293303,0.687975,0.707324,1.0,0.816903
4,0.231563,0.634797,0.822916,0.816903,1.0
