In [1]:
import numpy as np 
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import OneHotEncoder
from tokenwiser.pipeline import make_partial_union

In [2]:
class TextDataset(Dataset):
    """Random-batch access to one split of a dataset loaded via ``load_dataset``.

    Each row is expected to expose an ``'Utterance'`` text field and a
    ``'Label'`` field. Texts are featurised by two ``HashingVectorizer``
    instances (one with default settings, one with ``ngram_range=(2, 2)``)
    combined through ``make_partial_union``; labels are one-hot encoded
    against the label values found in the selected split.

    NOTE(review): the initializer of the ``Dataset`` base class is never
    called, so only the methods defined below are known to work on an
    instance -- confirm whether inheriting from ``Dataset`` is needed.

    Parameters
    ----------
    name, subset : str
        Forwarded to ``load_dataset(name, subset)``.
    split : str
        Key used to select a split when ``load_dataset`` returns a
        ``DatasetDict``; ignored otherwise.
    n_feat : int
        ``n_features`` given to each ``HashingVectorizer``.
    """

    def __init__(self, name='silicone', subset='dyda_da', split='train', n_feat=20_000):
        loaded = load_dataset(name, subset)
        # load_dataset may return all splits at once; keep only the requested one.
        self.dataset = loaded[split] if isinstance(loaded, DatasetDict) else loaded
        # Sorted so that the order of ``labels`` is reproducible between runs
        # (iteration order of a plain set is not specified).
        self.labels = sorted(set(row['Label'] for row in self.dataset))
        self.name = f"{name}-{subset}-{split}"
        # No fit call is made on this transformer; ``batch`` only calls transform.
        self.tfm = make_partial_union(
            HashingVectorizer(n_features=n_feat),
            HashingVectorizer(n_features=n_feat, ngram_range=(2, 2))
        )
        try:
            # Keyword name used by recent scikit-learn releases.
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only accept the previous keyword name.
            encoder = OneHotEncoder(sparse=False)
        self.label_enc = encoder.fit(np.array(self.labels).reshape(-1, 1))

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the ``(utterance, label)`` pair stored at row ``idx``."""
        item = self.dataset[idx]
        return item['Utterance'], item['Label']

    def batch(self, n):
        """Sample ``n`` rows uniformly at random and return them as ``(X, y)``.

        Row indices are drawn independently with ``np.random.randint``, so the
        same row may appear more than once in a batch.

        Parameters
        ----------
        n : int
            Number of rows to draw; must be at least 1.

        Returns
        -------
        X
            Output of the text transformer for the sampled utterances
            (presumably a sparse matrix -- verify against ``make_partial_union``).
        y
            One-hot encoded labels, one row per sampled utterance.

        Raises
        ------
        ValueError
            If ``n`` is smaller than 1.
        """
        if n < 1:
            # Fail with a clear message instead of an opaque unpacking error below.
            raise ValueError(f"n must be a positive integer, got {n!r}")
        indices = np.random.randint(len(self), size=n)
        texts, labels = zip(*(self[int(i)] for i in indices))
        X = self.tfm.transform(texts)
        y = self.label_enc.transform(np.array(labels).reshape(-1, 1))
        return X, y

In [3]:
# Build the training view; the split is left at the class default.
data = TextDataset(name='silicone', subset='dyda_da')

Reusing dataset silicone (/home/vincent/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


In [4]:
# Number of rows available in the selected split.
len(data)

87170

```python
fuse = (
    FUSE(tokeniser, n_tok_feat)
      .add_task(name, subset)
      .add_task(name, subset)
      .add_task(name, subset)
)
```

In [8]:
from keras.layers import Dense, Input
from keras.models import Model
import scipy
import numpy as np

# Draw a single example purely to read off the feature and label widths.
X, y = data.batch(1)
n_features, n_classes = X.shape[1], y.shape[1]

# Sparse input -> two 256-unit ReLU layers -> softmax over the label columns.
inputs = Input(shape=(n_features,), sparse=True)
emb1 = Dense(units=256, activation='relu')(inputs)
emb2 = Dense(units=256, activation='relu')(emb1)
outputs = Dense(units=n_classes, activation='softmax')(emb2)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Each round draws a fresh random sample of 2048 rows and fits on it
# in mini-batches of 64.
for round_idx in range(100):
    X, y = data.batch(2048)
    model.fit(x=X, y=y, batch_size=64)

  "shape. This may consume a large amount of memory." % value)


 1/32 [..............................] - ETA: 14s - loss: 1.3882 - accuracy: 0.3281

2021-09-03 22:56:46.558418: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


