In [10]:
import pandas as pd
import pyarrow as pa
import datasets as ds
from datasets import load_metric
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer


from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim


In [3]:
# Pin the CUDA device ordering and the set of GPUs visible to this process.
# Adjust the device list to match the GPUs available on the machine in use.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2,3,4,5,6,7",
})

In [4]:
# Root directory of the on-disk MASSIVE dataset. The MASSIVE_DATASET_PATH
# environment variable overrides the default, so the notebook can run on
# machines where this scratch location does not exist.
dataset_path = os.environ.get('MASSIVE_DATASET_PATH', '/scratch/hle/git/MASSIVE')

In [5]:
# Load the dataset stored on disk under `<dataset_path>/.train`; it is used as
# the training set in the cells below.
train_dataset = ds.load_from_disk(os.path.join(dataset_path, '.train'))


In [6]:
# Collect the distinct domain labels in first-seen order.
# Only the 'domain' column is read instead of iterating over complete rows,
# and dict.fromkeys de-duplicates while preserving insertion order.
train_domains = list(dict.fromkeys(train_dataset['domain']))

In [16]:
def label_mapping(domains):
    """Build lookup tables between integer labels and domain names.

    Labels are assigned by position in the sorted sequence of distinct domain
    names, so the same set of domains always yields the same labels.

    Args:
        domains: Iterable of domain names. It may contain duplicates and is
            left unmodified.

    Returns:
        Tuple ``(key_to_val, val_to_key)`` where ``key_to_val`` maps
        integer label -> domain name and ``val_to_key`` maps
        domain name -> integer label.
    """
    # sorted(set(...)) builds a new list: the caller's sequence is not
    # reordered, and duplicates cannot yield two labels for one domain.
    ordered = sorted(set(domains))
    key_to_val = dict(enumerate(ordered))
    val_to_key = {name: idx for idx, name in key_to_val.items()}
    return key_to_val, val_to_key

In [17]:
# key_to_val maps integer label -> domain name; val_to_key is the inverse
# (domain name -> integer label).
key_to_val, val_to_key = label_mapping(train_domains)

In [19]:
# Inspect the integer-label -> domain-name mapping.
key_to_val

{0: 'alarm',
 1: 'audio',
 2: 'calendar',
 3: 'cooking',
 4: 'datetime',
 5: 'email',
 6: 'general',
 7: 'iot',
 8: 'lists',
 9: 'music',
 10: 'news',
 11: 'play',
 12: 'qa',
 13: 'recommendation',
 14: 'social',
 15: 'takeaway',
 16: 'transport',
 17: 'weather'}

### Getting sentence embeddings with Sentence Transformers

In [11]:
# Sentence-embedding model used for every `model.encode` call below.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [12]:
# Sanity check: embed two paraphrases and print their cosine similarity.
embd_a, embd_b = (
    model.encode(text) for text in ("What is your age?", "How old are you?")
)
sim_score = cos_sim(embd_a, embd_b)
print(sim_score)

tensor([[0.9356]])


In [13]:
# Shape of a single sentence embedding.
embd_a.shape

(384,)

In [15]:
# NOTE(review): `embeddings` was not defined anywhere in this notebook (the
# cell that created it appears to have been removed), so this cell raised
# NameError on a fresh kernel. It is rebuilt here by encoding the two example
# sentences in one call -- confirm this matches the original intent.
embeddings = model.encode(["What is your age?", "How old are you?"])
embeddings[0].shape

(384,)

In [53]:
def embedding_function(examples):
    """Compute sentence embeddings for a batch of examples.

    Written for ``Dataset.map(..., batched=True)``: ``examples['utt']`` holds
    one utterance per example in the batch.

    Args:
        examples: Batch mapping with a 'utt' entry. Each utterance may be a
            sequence of tokens (joined with single spaces) or an already
            assembled string (used as is).

    Returns:
        dict with the single key 'embeddings', holding the result of
        ``model.encode`` for the batch's sentences.
    """
    sentences = [
        # A plain string must bypass ' '.join, which would otherwise insert a
        # space between every character of the utterance.
        utt if isinstance(utt, str) else ' '.join(utt)
        for utt in examples['utt']
    ]
    return {'embeddings': model.encode(sentences)}


In [54]:
# Run embedding_function over the training set in batches of 1000 examples;
# the 'embeddings' entry it returns is added to the mapped dataset.
train_dataset_with_embedding = train_dataset.map(embedding_function, batched=True, batch_size=1000)


  0%|          | 0/588 [00:00<?, ?ba/s]

### Save the dataset with embeddings to disk

In [73]:
# Persist the embedding-augmented dataset to disk.
# NOTE(review): this writes directly into '/scratch/hle/git/' -- the directory
# that contains the MASSIVE dataset folder -- rather than into a dedicated
# subdirectory; confirm that this target is intended.
train_dataset_with_embedding.save_to_disk('/scratch/hle/git/')