## Connecting to Drive

In [70]:
from google.colab import drive

# Mount Google Drive so that files stored there are reachable from the Colab runtime.
mount_point = '/content/gdrive'
drive.mount(mount_point)

# Drive folder in which the trained dialogue act tagger is stored.
model_dir = '/content/gdrive/MyDrive/Thesis MSc AI VU - Thomas Bellucci/tests/models'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Training Dialogue Act Tagger on MIDAS

In [54]:
import urllib.request

import pandas as pd

url = 'https://raw.githubusercontent.com/DianDYu/MIDAS_dialog_act/throw_exception_on_example_format_error/da_data/train.txt'

# The file is plain text with one example per line, not a CSV. Read it line by
# line instead of using pd.read_csv(sep='\n'): recent pandas releases reject a
# newline separator, and the CSV parser would also apply quote handling to
# lines that happen to contain a '"' character.
with urllib.request.urlopen(url) as response:
    raw_data = [line.decode('utf-8').rstrip('\r\n') for line in response]

# Blank lines carry no example (pd.read_csv skipped them as well).
raw_data = [line for line in raw_data if line.strip()]
raw_data[:3]

['how about another short piece of football news : EMPTY > how can you pick us knows now ## open_question_factual;',
 'do you want to hear some fun facts about cats instead : EMPTY > yes ## pos_answer;command',
 'did you know that : yes > i did ## pos_answer;']

In [58]:
import re

def split_line(line):
    """Parse one line of the MIDAS data file into labelled turn pairs.

    A well-formed line looks like ``turn0 : <field> > turn1 ## act1;act2;``.
    The second field is ignored. A line with several acts yields one tuple
    per act.

    Args:
        line: One raw line of the data file.

    Returns:
        A list of ``(turn0, turn1, act)`` tuples, one per non-empty act.
        The list is empty when the line does not split into exactly four
        fields.
    """
    items = re.split(' : | > | ## ', line.strip())
    if len(items) != 4:
        # Malformed line (missing or extra delimiters): skip it.
        return []
    turn0, _, turn1, acts = items

    # Strip each label before emitting it, so that 'a; b' and 'a;b' map to the
    # same label instead of creating a spurious ' b' class.
    labels = (act.strip() for act in acts.split(';'))
    return [(turn0, turn1, label) for label in labels if label]

# Flatten the parsed lines into a single list of (turn0, turn1, act) examples;
# lines that split_line rejects contribute nothing.
data = [example for line in raw_data for example in split_line(line)]
data[:3]

[('how about another short piece of football news',
  'how can you pick us knows now',
  'open_question_factual'),
 ('do you want to hear some fun facts about cats instead',
  'yes',
  'pos_answer'),
 ('do you want to hear some fun facts about cats instead', 'yes', 'command')]

In [59]:
# Number of (turn0, turn1, act) training examples; a line annotated with
# several acts contributes one example per act.
len(data)

10927

In [60]:
# Distinct dialogue act labels that occur in the training examples.
tags = {act for _, _, act in data}
tags

{'abandon',
 'apology',
 'appreciation',
 'back-channeling',
 'closing',
 'command',
 'comment',
 'complaint',
 'dev_command',
 'hold',
 'neg_answer',
 'nonsense',
 'open_question_factual',
 'open_question_opinion',
 'opening',
 'opinion',
 'other',
 'other_answers',
 'pos_answer',
 'respond_to_apology',
 'statement',
 'thanking',
 'yes_no_question'}

## Training RoBERTa on MIDAS

In [61]:
%%capture
!pip install transformers

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

import numpy as np
from tqdm import tqdm
import pickle

In [62]:
class DialogTag:
    """Dialogue act tagger that fine-tunes `roberta-base` on pairs of turns.

    Each example is a pair of turns joined by the tokenizer's separator token
    and classified into one dialogue act label. The label <-> id mappings are
    built incrementally while fitting.
    """

    def __init__(self, num_labels=23):
        """Load the pretrained tokenizer and sequence classifier.

        Args:
            num_labels: Size of the classification head. Label ids are
                assigned 0, 1, 2, ... in order of first appearance, so this
                should be at least the number of distinct labels given to
                `fit`.
        """
        # Set up GPU if available
        self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self._tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self._model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
        self._model.to(self._device)

        self._label2id = dict()
        self._id2label = dict()

    def _tokenize(self, strings):
        """Tokenize a batch of strings into padded tensors on the model device.

        Inputs longer than the tokenizer's maximum length are truncated
        instead of being passed through at a length the model cannot take.
        """
        encoded = self._tokenizer(strings, padding=True, truncation=True, return_tensors='pt')
        return encoded.to(self._device)

    def _encode_labels(self, labels):
        """Map string labels to integer ids, registering unseen labels.

        Args:
            labels: Iterable of label strings.

        Returns:
            A LongTensor of label ids on the model device.
        """
        int_labels = []
        for label in labels:
            if label not in self._label2id:
                # New label: give it the next free id and record the inverse.
                new_id = len(self._label2id)
                self._label2id[label] = new_id
                self._id2label[new_id] = label
            int_labels.append(self._label2id[label])
        return torch.LongTensor(int_labels).to(self._device)

    def fit(self, data, epochs=4, batch_size=32, lrate=1e-5):
        """Fine-tune the classifier on labelled turn pairs.

        Prints the mean training loss after every epoch.

        Args:
            data: Sequence of `(turn0, turn1, label)` tuples.
            epochs: Number of passes over the data.
            batch_size: Number of examples per optimisation step.
            lrate: Learning rate for the Adam optimizer.
        """
        # Preprocess turns and index labels
        strings = [t0 + self._tokenizer.sep_token + t1 for t0, t1, _ in data]
        labels = [l for _, _, l in data]

        # Batches are tokenized once up front and reused in every epoch.
        X = [self._tokenize(strings[i:i + batch_size]) for i in range(0, len(strings), batch_size)]
        y = [self._encode_labels(labels[i:i + batch_size]) for i in range(0, len(labels), batch_size)]

        # Setup optimizer and objective function
        optimizer = torch.optim.Adam(self._model.parameters(), lr=lrate)
        criterion = torch.nn.CrossEntropyLoss()

        # from_pretrained returns the model in evaluation mode; switch to
        # training mode so dropout is active while fine-tuning.
        self._model.train()

        for _ in range(epochs):
            losses = []

            # total= lets tqdm render a real progress bar for the zip iterator.
            for X_batch, y_batch in tqdm(zip(X, y), total=len(X)):
                y_pred = self._model(**X_batch)
                loss = criterion(y_pred.logits, y_batch)
                losses.append(loss.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(np.mean(losses))

        # Leave the model ready for inference.
        self._model.eval()

    def predict(self, turn0, turn1):
        """Predict the dialogue act label for a pair of turns.

        Args:
            turn0: First turn of the pair.
            turn1: Second turn of the pair.

        Returns:
            The label string whose logit is highest.

        Raises:
            KeyError: If the winning class id has no registered label (for
                example when `fit` has not been called).
        """
        string = turn0 + self._tokenizer.sep_token + turn1
        X = self._tokenize([string])

        # Inference only: disable dropout and skip building the autograd graph.
        self._model.eval()
        with torch.no_grad():
            logits = self._model(**X).logits

        scores = logits.cpu().numpy()
        return self._id2label[int(np.argmax(scores[0]))]


In [None]:
# Fine-tune a fresh tagger on the MIDAS training examples.
dt = DialogTag()
dt.fit(data)

# Persist the whole tagger object (tokenizer, model and label mappings).
# NOTE(review): the pickle captures the model on the device it was trained on;
# loading it on a machine without that device may need extra handling - confirm.
model_path = model_dir + '/DialogTag.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(dt, file)

342it [02:00,  2.84it/s]


1.4480896588654546


342it [02:00,  2.83it/s]


0.8076535619316045


121it [00:42,  3.40it/s]

In [64]:
# Sanity check: tag the turn pair ('hello', 'hi') with the trained model.
dt.predict('hello', 'hi')

'opening'

## Evaluation

In [67]:
import urllib.request

url_test = 'https://raw.githubusercontent.com/DianDYu/MIDAS_dialog_act/throw_exception_on_example_format_error/da_data/dev.txt'

# The file is plain text with one example per line, not a CSV. Read it line by
# line instead of using pd.read_csv(sep='\n'): recent pandas releases reject a
# newline separator, and the CSV parser would also apply quote handling to
# lines that happen to contain a '"' character.
with urllib.request.urlopen(url_test) as response:
    raw_test_data = [line.decode('utf-8').rstrip('\r\n') for line in response]

# Blank lines carry no example (pd.read_csv skipped them as well).
raw_test_data = [line for line in raw_test_data if line.strip()]

# Expand every line into its (turn0, turn1, act) examples.
test_data = []
for line in raw_test_data:
    test_data.extend(split_line(line))
test_data[:3]

[('i think i heard wrong', 'top gun', 'statement'),
 ("don't give up on me", "i'm not giving up on you", 'statement'),
 ('do you mind tell me one more time',
  'who is the piano',
  'open_question_factual')]

In [68]:
# Gold labels and model predictions, aligned by position in test_data.
# A turn pair annotated with several acts appears once per act, so each of
# its gold acts is scored as a separate example.
true_labels = [example[2] for example in test_data]
pred_labels = [dt.predict(t0, t1) for t0, t1, _ in tqdm(test_data)]

# Accuracy: fraction of examples whose predicted act equals the gold act.
matches = [int(gold == pred) for gold, pred in zip(true_labels, pred_labels)]
acc = np.mean(matches)
acc

100%|██████████| 2529/2529 [00:39<00:00, 63.68it/s]


0.7030446816923686