In [56]:
from pathlib import Path
import json
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
%matplotlib inline
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter

from torch_geometric.nn import FastRGCNConv, RGCNConv
from torch_geometric.utils import k_hop_subgraph


# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [57]:
def flatten(list_of_list):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the items keep their original order.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

path_to_training = Path("training")
path_to_test = Path("test")

# Meeting ids; every meeting is split into four sessions suffixed a, b, c, d.
_training_meetings = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
_test_meetings = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']

# Sessions left out of the training set.
_excluded_training_sessions = {'IS1002a', 'IS1005d', 'TS3012c'}

# Training and test sets of transcription ids
training_set = [
    meeting_id + session_id
    for meeting_id in _training_meetings
    for session_id in 'abcd'
    if meeting_id + session_id not in _excluded_training_sessions
]
test_set = [meeting_id + session_id for meeting_id in _test_meetings for session_id in 'abcd']

possible_speakers = ["ID", "ME", "PM", "UI"]

# Relation types; 'Beginning' is an extra type added on top of the annotated ones.
possible_types = ['Beginning', 'Question-answer_pair', 'Clarification_question', 'Continuation', 'Elaboration', 'Explanation', 'Narration', 'Comment', 'Contrast', 'Conditional', 'Result', 'Acknowledgement', 'Parallel', 'Q-Elab', 'Correction', 'Alternation', 'Background']


with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

# Per-dialog label lists, filled in the same order as dialogs_dataset.
y_training = []

# The useful data is collected independently for each dialog.
dialogs_dataset = []

# Largest dialog size seen so far, so that a whole dialog always fits in one batch.
max_batch_size = 0

for transcription_id in training_set:
    utterances = []
    speakers = []
    index = []

    # Relation edges: one "<source> <relation> <target>" record per line of the .txt file.
    # A synthetic "0 Beginning 0" record is put first (presumably so that there is one
    # relation per utterance -- TODO confirm against the data format).
    edge_sources = [0]
    edge_type_names = ['Beginning']
    edge_targets = [0]
    with open(path_to_training / f"{transcription_id}.txt", 'r') as file:
        for line in file:
            # split() without arguments ignores the trailing newline, so a last line
            # that lacks one no longer loses the final digit of its target index.
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            edge_sources.append(int(fields[0]))
            edge_type_names.append(fields[1])
            edge_targets.append(int(fields[2]))
    edges = [edge_sources, edge_targets]

    # Relation type of every edge, both one-hot encoded and as an index into possible_types.
    types_ori = [[1*(t==type_name) for t in possible_types] for type_name in edge_type_names]
    types = [possible_types.index(type_name) for type_name in edge_type_names]

    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        # one-hot encoding of the speaker role
        speakers.append([1*(speaker==utterance["speaker"]) for speaker in possible_speakers])
        utterances.append(utterance["text"])
        index.append([utterance["index"]])

    max_batch_size = max(max_batch_size, len(utterances))
    # utterance index divided by the number of utterances in the dialog
    index = np.array(index)/len(index)

    y_training.append(training_labels[transcription_id])

    dialogs_dataset.append([utterances, types_ori, types, speakers, index, edges])

In [58]:
# Fixed seed: the train/validation partition (and the vocabulary order and checkpoint that
# are derived from it) is then identical on every Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(dialogs_dataset, y_training, test_size=0.2, random_state=42)

In [67]:
tokenizer = get_tokenizer("basic_english")

# Size cap for an optional "most frequent words only" vocabulary; the variant that
# uses it, dict(Counter(words).most_common(num_words)), is currently disabled.
num_words = 5000

# Build the vocabulary from every token of the training dialogs, then the validation
# dialogs, then the test transcriptions (in that order).
# Dropping punctuation and one-letter tokens was tried, without success.
words = []
for split_dialogs in (X_train, X_val):
    for dialog in split_dialogs:
        for utterance in dialog[0]:
            words.extend(tokenizer(utterance))

for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    for utterance in transcription:
        words.extend(tokenizer(utterance["text"]))

# token -> number of occurrences, keys in order of first appearance
tops = dict(Counter(words))

print(tops)
print(len(tops))

vocab = torchtext.vocab.vocab(tops, specials = ['<unk>', '<pad>'])
# index returned for any token that is not in the vocabulary
vocab.set_default_index(vocab['<unk>'])

# Example: the first training utterance and its sequence of numerical ids.
first_utterance = X_train[0][0][0]
print(first_utterance)
print(vocab.forward(tokenizer(first_utterance)))

10310
<vocalsound> Okay .
[2, 3, 4]


In [68]:
# Number of token ids kept per utterance (longer ones are cut, shorter ones padded).
max_len = 60

def vectorize_sentences(dialog, max_len):
    """Turn every utterance of a dialog into a fixed-length row of vocabulary ids.

    Each utterance is tokenized with the module-level ``tokenizer`` and mapped through
    the module-level ``vocab``. Rows longer than ``max_len`` are truncated; shorter rows
    are right-padded with the id of ``'<pad>'``.

    Args:
        dialog: iterable of utterance strings.
        max_len: length of every output row.

    Returns:
        A numpy array holding one row per utterance.
    """
    pad_id = vocab['<pad>']
    rows = []
    for sentence in dialog:
        ids = vocab.forward(tokenizer(sentence))[:max_len]
        if len(ids) < max_len:
            padded = np.full(max_len, pad_id)
            padded[:len(ids)] = ids
            ids = padded
        rows.append(np.array(ids))
    return np.array(rows)


# Every dialog record is [utterances, types_ori, types, speakers, index, edges].
# Field 0 is vectorized into token ids; fields 1..5 are converted to numpy arrays as they are.
trains_X = [vectorize_sentences(utterances, max_len) for utterances, *_ in X_train]
trains_X_types_ori, trains_X_types, trains_X_speakers, trains_X_indices, edges_train = (
    [np.array(record[field]) for record in X_train] for field in range(1, 6)
)

vals_X = [vectorize_sentences(utterances, max_len) for utterances, *_ in X_val]
vals_X_types_ori, vals_X_types, vals_X_speakers, vals_X_indices, edges_val = (
    [np.array(record[field]) for record in X_val] for field in range(1, 6)
)

# Edge index arrays as tensors on the compute device, one per dialog.
edges_train_set = [torch.from_numpy(edge_array).to(device) for edge_array in edges_train]
edges_val_set = [torch.from_numpy(edge_array).to(device) for edge_array in edges_val]

# Labels reshaped to one column per dialog.
trains_y = [np.array(dialog_labels).reshape(-1, 1) for dialog_labels in y_train]
vals_y = [np.array(dialog_labels).reshape(-1, 1) for dialog_labels in y_val]

In [69]:
# A batch is as large as the largest dialog, so each dialog is served as a single batch.
batch_size = max_batch_size


def _dialog_tensor_dataset(tokens, types_ori, types, speakers, indices, labels):
    """Wrap the numpy arrays of one dialog in a TensorDataset living on ``device``.

    The index and label arrays are cast to float; the others keep their dtype.
    """
    return TensorDataset(
        torch.from_numpy(tokens).to(device),
        torch.from_numpy(types_ori).to(device),
        torch.from_numpy(types).to(device),
        torch.from_numpy(speakers).to(device),
        torch.from_numpy(indices).float().to(device),
        torch.from_numpy(labels).float().to(device),
    )


# One tensor dataset per dialog.
trainsets = [
    _dialog_tensor_dataset(*dialog_arrays)
    for dialog_arrays in zip(trains_X, trains_X_types_ori, trains_X_types,
                             trains_X_speakers, trains_X_indices, trains_y)
]
valsets = [
    _dialog_tensor_dataset(*dialog_arrays)
    for dialog_arrays in zip(vals_X, vals_X_types_ori, vals_X_types,
                             vals_X_speakers, vals_X_indices, vals_y)
]

# One loader per dialog; the utterance order is kept (no shuffling).
train_loaders = [DataLoader(dialog_set, shuffle=False, batch_size=batch_size) for dialog_set in trainsets]
val_loaders = [DataLoader(dialog_set, shuffle=False, batch_size=batch_size) for dialog_set in valsets]

In [70]:
def train_model(model, optimizer, loss_criterion, epochs, threshold):
    """Train ``model`` on the module-level ``train_loaders`` and validate every epoch.

    After each epoch the model is evaluated on ``val_loaders``; its outputs are turned
    into 0/1 predictions with ``threshold`` and scored with F1. Whenever the average
    validation loss improves, the weights are written to 'best_model_graph3.1.pth'.

    Args:
        model: network called as ``model(samples, types_ori, types, speakers, index, edges)``.
        optimizer: optimizer over ``model``'s parameters.
        loss_criterion: callable ``(outputs, labels) -> scalar loss tensor``.
        epochs: number of passes over the training loaders.
        threshold: an output strictly above this value counts as a positive prediction.

    Returns:
        List with the validation F1 score of every epoch.
    """
    history_val_f1 = []
    # Start from +inf so the first epoch always produces a checkpoint, whatever the loss scale.
    best_val_loss = float("inf")
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0
        train_batches = 0
        for dialog_idx, train_loader in enumerate(train_loaders):
            for (samples, types_ori, types, speakers, index, labels) in train_loader:
                # clear gradients w.r.t. parameters
                optimizer.zero_grad()

                # forward pass; the edges belong to the dialog this loader was built from
                outputs = model(samples, types_ori, types, speakers, index, edges_train_set[dialog_idx])

                loss = loss_criterion(outputs, labels)
                train_loss_sum += loss.item()
                train_batches += 1

                # backward pass and parameter update
                loss.backward()
                optimizer.step()

        avg_loss = train_loss_sum / train_batches if train_batches else float("nan")

        # ---- validation pass ----
        model.eval()
        true_labels = []
        predicted_scores = []
        val_loss_sum = 0.0
        val_batches = 0
        with torch.no_grad():
            for dialog_idx, val_loader in enumerate(val_loaders):
                for (samples, types_ori, types, speakers, index, labels) in val_loader:
                    outputs = model(samples, types_ori, types, speakers, index, edges_val_set[dialog_idx])
                    val_loss_sum += loss_criterion(outputs, labels).item()
                    val_batches += 1
                    # .cpu() is required before .numpy() when the tensors live on a GPU
                    predicted_scores.extend(outputs.detach().cpu().numpy().reshape(-1))
                    true_labels.extend(labels.detach().cpu().numpy().reshape(-1).astype(int))
        # NaN never compares as "smaller", so an empty validation set cannot trigger a save
        val_loss = val_loss_sum / val_batches if val_batches else float("nan")

        # convert predicted scores to binary predictions
        predicted_labels = [1 if score > threshold else 0 for score in predicted_scores]

        # calculate F1 score
        f1 = f1_score(true_labels, predicted_labels)
        print(f'Validation Set Evaluation - F1 Score: {f1}, Average Train Loss: {avg_loss}, Average Val Loss: {val_loss}')

        # append to history
        history_val_f1.append(f1)

        # save model when validation loss is minimal
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # this best model can be reloaded later
            torch.save(model.state_dict(), 'best_model_graph3.1.pth')
    return (history_val_f1)



In [71]:
def plot_f1s(f1_values):
    """Plot the validation F1 score of every epoch as a line chart.

    Args:
        f1_values: sequence of F1 scores, one per epoch, in training order.
    """
    # Set plotting style
    plt.style.use('bmh')
    plt.rc('axes', facecolor='none')
    plt.rc('figure', figsize=(16, 4))

    # The plotted values are F1 scores, so title and axes say so (the figure used to be
    # mislabelled as a loss graph).
    plt.plot(f1_values, label='Validation')
    plt.title('Validation F1 Score')
    plt.xlabel('Epoch')
    plt.ylabel('F1 score')
    plt.legend()
    plt.show()


def look_parameters(mdl):
    """Draw a histogram of all parameter values of ``mdl`` over the range [-0.5, 0.5].

    Args:
        mdl: module whose ``parameters()`` are inspected.
    """
    # Flatten every parameter on the CPU and concatenate once. This works wherever the
    # model lives and avoids re-copying the accumulated tensor for every parameter.
    flat_params = [param.detach().reshape(-1).cpu() for param in mdl.parameters()]
    weights = torch.cat(flat_params) if flat_params else torch.empty(0)
    ws = weights.numpy()
    plt.hist(ws.reshape(-1), range=(-.5, .5), bins=501)

In [72]:
class DialogGNN3(nn.Module):
    """Utterance classifier combining LSTM text encoders with relational graph convolutions.

    Each utterance is encoded token by token (word embedding concatenated with learned
    relation-type and speaker embeddings and the index feature) by a first LSTM. Its last
    output step feeds two RGCN layers over the dialog graph, while a second LSTM re-reads
    the full output sequence. Both branches are concatenated and mapped through a linear
    layer followed by a sigmoid.

    Args:
        input_dim: size of the token vocabulary.
        embed_dim: width of the token embedding.
        output_dim: number of outputs per utterance.
        lstm_dim: input width of the first LSTM; must equal embed_dim + 10 + 3 + the
            width of the index feature.
        node_dim: hidden size of the first LSTM and input width of the first RGCN layer.
        dropout: dropout probability of the two dropout modules (they are instantiated
            but not applied in ``forward``).
        hidden_dim_1: output width of the first RGCN layer.
        hidden_dim_2: output width of the second RGCN layer.
        hidden_dim_3: hidden size of the second LSTM.
    """

    def __init__(self, input_dim, embed_dim, output_dim, lstm_dim, node_dim, dropout, hidden_dim_1, hidden_dim_2, hidden_dim_3):
        super().__init__()

        self.hdim = hidden_dim_1

        # NOTE: attribute names and creation order are kept unchanged so that saved
        # state_dicts remain loadable.
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.embedding_type = nn.Embedding(len(possible_types), 10)
        self.embedding_speaker = nn.Embedding(len(possible_speakers), 3)
        self.lstm1 = nn.LSTM(lstm_dim, node_dim, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)

        # one relation per entry of possible_types
        self.conv1 = RGCNConv(node_dim, hidden_dim_1, len(possible_types),
                          num_bases=30)
        self.conv2 = RGCNConv(hidden_dim_1, hidden_dim_2, len(possible_types),
                          num_bases=30)

        self.lstm2 = nn.LSTM(node_dim, hidden_dim_3, batch_first=True)
        self.dropout2 = nn.Dropout(dropout)

        self.fc = nn.Linear(hidden_dim_3 + hidden_dim_2, output_dim)

    def forward(self, text, type_ori, type, speaker, index, edges):
        """Compute one sigmoid output row per utterance of a dialog.

        Args:
            text: 2-D tensor of token ids, one row per utterance.
            type_ori: one-hot relation types, one row per utterance.
            type: relation-type indices handed to the RGCN layers as edge types.
            speaker: one-hot speakers, one row per utterance.
            index: per-utterance index feature (assumed 2-D with one column -- TODO confirm).
            edges: edge index tensor handed to the RGCN layers.

        Returns:
            Tensor of sigmoid activations with ``output_dim`` columns.
        """
        # Use the actual sequence length of the batch rather than a module-level constant.
        seq_len = text.size(1)

        embedded = self.embedding(text)
        embedded = F.relu(embedded)

        # recover class indices from the one-hot encodings
        indices_speaker = torch.argmax(speaker, dim=1)
        indices_type_ori = torch.argmax(type_ori, dim=1)
        embedded_speaker = self.embedding_speaker(indices_speaker)
        embedded_speaker = F.relu(embedded_speaker)
        embedded_type_ori = self.embedding_type(indices_type_ori)
        embedded_type_ori = F.relu(embedded_type_ori)

        # repeat the per-utterance features along the token axis
        speaker_2 = embedded_speaker.unsqueeze(1).expand(-1, seq_len, -1)
        type_ori_2 = embedded_type_ori.unsqueeze(1).expand(-1, seq_len, -1)
        index_2 = index.unsqueeze(1).expand(-1, seq_len, -1)

        x = torch.cat((embedded, type_ori_2, speaker_2, index_2), dim=2)
        lstm_out, (hidden, cell) = self.lstm1(x)

        # graph branch: last LSTM output step of every utterance is its node feature
        x1 = lstm_out[:, -1, :]
        x1 = self.conv1(x1, edges, type)
        x1 = F.relu(x1)
        x1 = self.conv2(x1, edges, type)
        x1 = F.relu(x1)

        # sequence branch: second LSTM over the full output sequence of the first
        x2, (hidden, cell) = self.lstm2(lstm_out)
        x2 = x2[:, -1, :]
        x2 = F.relu(x2)

        xf = torch.cat((x1, x2), dim=1)
        xf = self.fc(xf)

        output = torch.sigmoid(xf)

        return output

In [75]:
# --- input / embedding sizes ---
input_dimension = len(tops) + 2  # the two extra slots are the <unk> and <pad> symbols
embedding_dimension = 114
# word embedding + type embedding (10) + speaker embedding (3) + index feature (1)
lstm_dimension = embedding_dimension + 10 + 3 + 1

# --- layer widths ---
node_dimension = 100
hidden_dimension_1 = 128
hidden_dimension_2 = 128
hidden_dimension_3 = 64
output_dimension = 1
dropout_rate = 0.2

# --- optimisation and decision settings ---
num_epochs = 1000
l_rate = 0.0001
thresh = 0.3

gnn = DialogGNN3(
    input_dimension,
    embedding_dimension,
    output_dimension,
    lstm_dimension,
    node_dimension,
    dropout_rate,
    hidden_dimension_1,
    hidden_dimension_2,
    hidden_dimension_3,
)
gnn.to(device)

criterion_fx = nn.BCELoss()

# Adam with L2 weight decay (a variant without weight decay was also considered).
optimizer_fx = torch.optim.Adam(gnn.parameters(), lr=l_rate, weight_decay=5e-4)

val_f1 = train_model(gnn, optimizer_fx, criterion_fx, num_epochs, thresh)

plot_f1s(val_f1)

Validation Set Evaluation - F1 Score: 0.3910054425665998, Average Train Loss: 0.5020439930550464, Average Val Loss: 0.46062080562114716
Validation Set Evaluation - F1 Score: 0.3988629563135847, Average Train Loss: 0.4279401685510363, Average Val Loss: 0.4573647066950798
Validation Set Evaluation - F1 Score: 0.40327533265097243, Average Train Loss: 0.4260742261812284, Average Val Loss: 0.45660448223352434
Validation Set Evaluation - F1 Score: 0.4015645371577575, Average Train Loss: 0.42525726556777954, Average Val Loss: 0.4563148692250252
Validation Set Evaluation - F1 Score: 0.4012692917928746, Average Train Loss: 0.4248114015374865, Average Val Loss: 0.45610710680484773
Validation Set Evaluation - F1 Score: 0.39947780678851175, Average Train Loss: 0.4245259448305353, Average Val Loss: 0.4559881046414375
Validation Set Evaluation - F1 Score: 0.39930151338766007, Average Train Loss: 0.4242931298621289, Average Val Loss: 0.4559276461601257
Validation Set Evaluation - F1 Score: 0.39953474

In [23]:
best_model = DialogGNN3(input_dimension, embedding_dimension, output_dimension, lstm_dimension, node_dimension, dropout_rate, hidden_dimension_1, 
                        hidden_dimension_2, hidden_dimension_3)
# map_location lets a checkpoint written on one device be loaded on another.
best_model.load_state_dict(torch.load('best_model_graph3.1.pth', map_location=device))
# The inputs below are moved to `device`, so the model has to live there too.
best_model.to(device)
best_model.eval()

test_labels = {}

for transcription_id in test_set:
    utterances = []
    speakers = []
    index = []

    # Relation edges: one "<source> <relation> <target>" record per line, preceded by the
    # same synthetic "0 Beginning 0" record that is used for the training dialogs.
    edge_sources = [0]
    edge_type_names = ['Beginning']
    edge_targets = [0]
    with open(path_to_test / f"{transcription_id}.txt", 'r') as file:
        for line in file:
            # split() without arguments ignores the trailing newline, so a last line
            # that lacks one no longer loses the final digit of its target index.
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            edge_sources.append(int(fields[0]))
            edge_type_names.append(fields[1])
            edge_targets.append(int(fields[2]))
    edges = [edge_sources, edge_targets]

    types_ori = [[1*(t==type_name) for t in possible_types] for type_name in edge_type_names]
    types = [possible_types.index(type_name) for type_name in edge_type_names]

    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        speakers.append([1*(speaker==utterance["speaker"]) for speaker in possible_speakers])
        utterances.append(utterance["text"])
        index.append([utterance["index"]])

    index = np.array(index)/len(index)
    # keep the batch size large enough for the whole dialog to be served as one batch
    max_batch_size = max(max_batch_size, len(utterances))
    dialog_dataset = [utterances, types_ori, types, speakers, index, edges]

    tests_data_X = vectorize_sentences(dialog_dataset[0], max_len)
    tests_data_X_types_ori = np.array(dialog_dataset[1])
    tests_data_X_types = np.array(dialog_dataset[2])
    tests_data_X_speakers = np.array(dialog_dataset[3])
    tests_data_X_indices = np.array(dialog_dataset[4])

    edges_test_test = np.array(dialog_dataset[5])
    edges_test_set_test = torch.from_numpy(edges_test_test).to(device)

    test_dataset = TensorDataset(torch.from_numpy(tests_data_X).to(device), 
                                 torch.from_numpy(tests_data_X_types_ori).to(device), 
                                 torch.from_numpy(tests_data_X_types).to(device), 
                                 torch.from_numpy(tests_data_X_speakers).to(device), 
                                 torch.from_numpy(tests_data_X_indices).float().to(device))

    test_data_loader = DataLoader(test_dataset, shuffle=False, batch_size=max_batch_size)

    predicted_scores = []
    with torch.no_grad():
        # batch_* names keep the batch tensors from overwriting the per-dialog lists above
        for (batch_samples, batch_types_ori, batch_types, batch_speakers, batch_index) in test_data_loader:
            outputs = best_model(batch_samples, batch_types_ori, batch_types, batch_speakers, batch_index, edges_test_set_test)
            # .cpu() is required before .numpy() when the tensors live on a GPU
            predicted_scores.extend(outputs.detach().cpu().numpy().reshape(-1))

    predicted_labels = [1 if score > thresh else 0 for score in predicted_scores]
    test_labels[transcription_id] = predicted_labels

with open("test_labels_graph3_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

