In [1]:
from transformers import pipeline
#https://huggingface.co/diwank/silicone-deberta-pair
# Hugging Face Hub id of the dialogue-act checkpoint; the tokenizer ships under the same id.
model = "diwank/silicone-deberta-pair"
tokenizer = "diwank/silicone-deberta-pair"
# Pass the tokenizer explicitly: `tokenizer` was previously assigned but never used,
# leaving the pipeline to pick its tokenizer implicitly from `model`.
pip = pipeline('text-classification', model=model, tokenizer=tokenizer)

In [2]:
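# Label ids of diwank/silicone-deberta-pair and their dialogue-act names
# (the pipeline reports predictions as 'LABEL_<id>'):
# (0, 'acknowledge')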
# (1, 'answer')
# (2, 'backchannel')
# (3, 'reply_yes')
# (4, 'exclaim')
# (5, 'say')
# (6, 'reply_no')
# (7, 'hold')
# (8, 'ask')
# (9, 'intent')
# (10, 'ask_yes_no')

In [3]:
# Small hand-written sample: a statement, a question, and two short replies.
sentences = [
    "I love cats",
    "Do you love cats?",
    "Yes, I do",
    "No, dogs",
]

In [4]:
# Classify all sample sentences in a single pipeline call. In the recorded output
# each sentence yields one dict with a 'label' ('LABEL_<id>') and a 'score'.
response = pip(sentences)
print(response)

[{'label': 'LABEL_5', 'score': 0.7644516229629517}, {'label': 'LABEL_8', 'score': 0.9776815176010132}, {'label': 'LABEL_3', 'score': 0.5020651817321777}, {'label': 'LABEL_5', 'score': 0.5964227318763733}]


In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

import numpy as np
from tqdm import tqdm
import pickle

In [None]:
class DialogTag:
    """Dialogue-act tagger for pairs of turns, built on ``roberta-base``.

    A pair ``(turn0, turn1)`` is encoded as the single string
    ``turn0 + sep_token + turn1`` and classified by a
    ``RobertaForSequenceClassification`` head. Label strings are mapped to
    integer ids lazily, in the order they are first seen during ``fit``.
    """

    def __init__(self, num_labels=23):
        """Load the pretrained tokenizer and model and move the model to the device.

        Args:
            num_labels: Number of output classes of the classification head.
        """
        # Set up GPU if available
        self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self._tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self._model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
        self._model.to(self._device)

        # Bidirectional label index, filled by _encode_labels().
        self._label2id = dict()
        self._id2label = dict()

    def _tokenize(self, strings):
        """Tokenize a list of strings into a padded batch on ``self._device``.

        Inputs longer than the tokenizer's maximum length are truncated instead
        of overflowing the model's position embeddings.
        """
        return self._tokenizer(strings, padding=True, truncation=True, return_tensors='pt').to(self._device)

    def _encode_labels(self, labels):
        """Convert label values to a ``LongTensor`` of ids on ``self._device``.

        Labels not seen before are assigned the next free id and recorded in
        both ``_label2id`` and ``_id2label``.
        """
        int_labels = []
        for label in labels:
            if label not in self._label2id:
                new_id = len(self._label2id)
                self._label2id[label] = new_id
                self._id2label[new_id] = label
            int_labels.append(self._label2id[label])
        return torch.LongTensor(int_labels).to(self._device)

    def fit(self, data, epochs=4, batch_size=32, lrate=1e-5):
        """Fine-tune the model on labelled turn pairs.

        Args:
            data: Iterable of ``(turn0, turn1, label)`` triples.
            epochs: Number of passes over the data.
            batch_size: Number of examples per optimisation step.
            lrate: Learning rate for the Adam optimizer.

        Prints the mean training loss after every epoch.
        """
        # `data` is traversed twice below; materialise it so that a generator
        # does not silently yield zero labels (and therefore zero batches).
        data = list(data)

        # Preprocess turns and index labels
        strings = [t0 + self._tokenizer.sep_token + t1 for t0, t1, _ in data]
        labels = [l for _, _, l in data]

        X = [self._tokenize(strings[i:i + batch_size]) for i in range(0, len(strings), batch_size)]
        y = [self._encode_labels(labels[i:i + batch_size]) for i in range(0, len(labels), batch_size)]

        # Setup optimizer and objective function
        optimizer = torch.optim.Adam(self._model.parameters(), lr=lrate)
        criterion = torch.nn.CrossEntropyLoss()

        # from_pretrained() hands the model back in eval mode (dropout off);
        # switch to training mode before optimising.
        self._model.train()

        for epoch in range(epochs):
            losses = []

            # zip() has no length, so give tqdm the batch count for a real progress bar.
            for X_batch, y_batch in tqdm(zip(X, y), total=len(X)):
                y_pred = self._model(**X_batch)
                loss = criterion(y_pred.logits, y_batch)
                losses.append(loss.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(np.mean(losses))

    def predict(self, turn0, turn1):
        """Return the label predicted for the pair ``(turn0, turn1)``.

        Raises:
            KeyError: If the predicted id has no entry in ``_id2label``
                (for example when ``fit`` has not been run).
        """
        string = turn0 + self._tokenizer.sep_token + turn1
        X = self._tokenize([string])

        # Inference: disable dropout and skip building the autograd graph.
        self._model.eval()
        with torch.no_grad():
            logits = self._model(**X).logits

        y = logits.detach().cpu().numpy()
        # Cast the numpy integer to a plain int before the dict lookup.
        return self._id2label[int(np.argmax(y[0]))]

## MIDAS Dialogue Act

In [8]:
%pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Using cached pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer

In [10]:
# Source: https://github.com/DianDYu/MIDAS_dialog_act
# Maps the short annotation codes to dialogue-act names (23 distinct labels).
# Each code appears exactly once; a repeated "sv" entry was removed.

NOTE_DA_MAP = {
    "sd": "statement",
    "b": "back-channeling",
    "sv": "opinion",
    "ny": "pos_answer",
    "%": "abandon",
    "ba": "appreciation",
    "qy": "yes_no_question",
    "fc": "closing",
    "ng": "neg_answer",
    "h": "other_answers",
    "o": "other",
    "ad": "command",
    "^h": "hold",
    "cp": "complaint",
    "fp": "opening",
    "bd": "respond_to_apology",
    "fa": "apology",
    "ft": "thanking",
    "oqf": "open_question_factual",
    "oqo": "open_question_opinion",
    "cm": "comment",
    "ns": "nonsense",
    "dad": "dev_command",
}

In [17]:
# Directory handed to from_pretrained() as its cache location.
model_path = "src/cltl/models/midas-da-bert"

# Lower-casing tokenizer for the uncased BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# NOTE(review): this requests the generic "bert-base-uncased" weights and only uses
# `model_path` as cache_dir — it presumably does not load a fine-tuned MIDAS
# checkpoint from that directory; confirm this is the intent.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    cache_dir=model_path,
    num_labels=23,
)


 56%|████████████████████████████████████████████████████████████████████████████▉                                                             | 227534848/407873900 [01:00<00:39, 4515960.39B/s]

KeyboardInterrupt: 

In [21]:
# Local directory expected to contain the MIDAS dialogue-act model.
model_path = "src/cltl/models/midas-da-bert"
# NOTE(review): in the recorded run this call raised ValueError (the model could not be
# loaded with AutoModelForSequenceClassification / TFAutoModelForSequenceClassification);
# presumably the directory does not hold a transformers-format checkpoint — confirm.
pip_midas = pipeline(task='text-classification', model=model_path)

ValueError: Could not load model src/cltl/models/midas-da-bert with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForSequenceClassification'>).

In [None]:
python run_classifier.py --data_dir da_data/ --bert_model output --task_name da --output_dir output --do_eval --binary_pred