In [57]:
from gensim.models import Doc2Vec
from pathlib import Path
import json
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument
from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

import collections
import random
import pandas as pd

import torchtext
from torchtext.data import get_tokenizer
from collections import Counter

import scipy.sparse as sp


# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [58]:
def flatten(list_of_list):
    """Concatenate the items of every sub-sequence into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

path_to_training = Path("training")
path_to_test = Path("test")

# Training and test sets of transcription ids
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

possible_speakers = ["ID", "ME", "PM", "UI"]

# 'Beginning' is the relation carried by the "-1 Beginning 0" edge that
# load_dialog() prepends to every dialog, so that the first utterance also
# gets a relation type.
possible_types = ['Beginning', 'Question-answer_pair', 'Clarification_question', 'Continuation', 'Elaboration', 'Explanation', 'Narration', 'Comment', 'Contrast', 'Conditional', 'Result', 'Acknowledgement', 'Parallel', 'Q-Elab', 'Correction', 'Alternation', 'Background']


def load_dialog(directory, transcription_id):
    """Read one transcription and return its per-utterance features.

    Arguments:
        directory: Path of the folder holding `<id>.txt` (edges) and `<id>.json` (utterances).
        transcription_id: id of the transcription to read, e.g. 'ES2002a'.

    Returns:
        A list `[utterances, types, speakers, index]` where
        - utterances is the list of utterance texts,
        - types is one one-hot row over `possible_types` per edge line
          (after the prepended 'Beginning' edge),
        - speakers is one one-hot row over `possible_speakers` per utterance,
        - index is an array of shape (n_utterances, 1) holding each utterance's
          "index" field divided by the number of utterances.
    """
    transcription_edges = ["-1 Beginning 0"]
    with open(directory / f"{transcription_id}.txt", 'r') as file:
        # Skip blank lines: they carry no edge and would break the split below.
        transcription_edges.extend(line for line in file if line.strip())

    # Each edge line is space-separated; the relation type is the middle field.
    edge_types = [edge.split(" ")[1] for edge in transcription_edges]
    # NOTE(review): a relation type missing from possible_types yields an all-zero row.
    types = [[int(t == edge_type) for t in possible_types] for edge_type in edge_types]

    with open(directory / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    utterances = [utterance["text"] for utterance in transcription]
    speakers = [[int(speaker == utterance["speaker"]) for speaker in possible_speakers]
                for utterance in transcription]
    index = [[utterance["index"]] for utterance in transcription]
    # Express the position relative to the dialog length.
    index = np.array(index)/len(index)

    return [utterances, types, speakers, index]


with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

y_training = []
dialogs_dataset = []

for transcription_id in training_set:
    dialogs_dataset.append(load_dialog(path_to_training, transcription_id))
    y_training.append(training_labels[transcription_id])


In [59]:
# Hold out 20% of the dialogs for validation. The seed is fixed so the split,
# and everything derived from X_train afterwards, is the same on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    dialogs_dataset, y_training, test_size=0.2, random_state=SPLIT_SEED
)

# Sanity checks: number of dialogs per split, then the per-dialog and
# per-utterance sizes of the four feature lists of the first training dialog.
print(len(dialogs_dataset), len(X_train), len(X_test))
print(len(X_train[0][0]), len(X_train[0][1]), len(X_train[0][2]), len(X_train[0][3]))

print(len(X_train[0][0][0]), len(X_train[0][1][0]), len(X_train[0][2][0]), len(X_train[0][3][0]))

97 77 20
806 806 806 806
28 17 4 1


In [60]:
tokenizer = get_tokenizer("basic_english")

# Build the vocabulary from the training dialogs only; tokens that are not in
# it are mapped to '<unk>' through the default index set below.
words = []
for dialog in X_train:
    for utterance in dialog[0]:
        words.extend(tokenizer(utterance))

# Token -> frequency, in first-seen order.
# To cap the vocabulary at the `num_words` most frequent tokens (e.g. 2000),
# build it with dict(word_counts.most_common(num_words)) instead.
word_counts = Counter(words)
tops = dict(word_counts)

# Show a bounded preview instead of dumping the whole dictionary.
print(word_counts.most_common(10))
print(len(tops))

vocab = torchtext.vocab.vocab(tops, specials = ['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>']) # default index used when an unknown word is found

print(X_train[0][0][0])
print(vocab.forward(tokenizer(X_train[0][0][0]))) # example of how a sentence is transformed into a sequence of numerical IDs

7823
All hooked up . <vocalsound>
[2, 3, 4, 5, 6]


In [61]:
max_len = 80

def vectorize_sentences(dialog, max_len):
    """Turn the utterances of one dialog into a fixed-width matrix of token ids.

    Each utterance is tokenized with the module-level `tokenizer`, mapped to
    ids with the module-level `vocab`, truncated to `max_len` tokens and
    right-padded with the '<pad>' id.

    Arguments:
        dialog: iterable of utterance strings.
        max_len: (int) number of token ids kept per utterance.

    Returns:
        int64 array of shape (number of utterances, max_len); an empty dialog
        gives shape (0, max_len).
    """
    dialog = list(dialog)
    # Start from an all-padding matrix and overwrite the leading positions.
    vectors = np.full((len(dialog), max_len), vocab['<pad>'], dtype=np.int64)
    for row, utterance in enumerate(dialog):
        ids = vocab.forward(tokenizer(utterance))[:max_len]
        if ids:
            vectors[row, :len(ids)] = ids
    return vectors


# Every dialog is a 4-item list: [utterances, types, speakers, index].

# Training split: padded token ids, then the per-utterance side features.
trains_X = [vectorize_sentences(utterances, max_len) for utterances, _, _, _ in X_train]
trains_X_types = [np.array(kinds) for _, kinds, _, _ in X_train]
trains_X_speakers = [np.array(who) for _, _, who, _ in X_train]
trains_X_indices = [np.array(position) for _, _, _, position in X_train]

# Held-out split, processed the same way.
tests_X = [vectorize_sentences(utterances, max_len) for utterances, _, _, _ in X_test]
tests_X_types = [np.array(kinds) for _, kinds, _, _ in X_test]
tests_X_speakers = [np.array(who) for _, _, who, _ in X_test]
tests_X_indices = [np.array(position) for _, _, _, position in X_test]

# Labels as column vectors, one row per utterance.
trains_y = [np.array(labels).reshape(-1, 1) for labels in y_train]
tests_y = [np.array(labels).reshape(-1, 1) for labels in y_test]
print(trains_X[0])
# print(trains_y[0])

[[  2   3   4 ...   1   1   1]
 [  7   8   9 ...   1   1   1]
 [ 10  11  12 ...   1   1   1]
 ...
 [  9  39 126 ...   1   1   1]
 [ 22 354  11 ...   1   1   1]
 [  7   8  64 ...   1   1   1]]


In [62]:
# Number of utterances per mini-batch.
batch_size = 64


def _dialog_to_dataset(tokens, types, speakers, indices, labels):
    """Wrap the numpy arrays of one dialog in a TensorDataset stored on `device`.

    Positions and labels are converted to float; the other arrays keep their dtype.
    """
    return TensorDataset(
        torch.from_numpy(tokens).to(device),
        torch.from_numpy(types).to(device),
        torch.from_numpy(speakers).to(device),
        torch.from_numpy(indices).float().to(device),
        torch.from_numpy(labels).float().to(device),
    )


# One dataset per dialog.
trainsets = [
    _dialog_to_dataset(*arrays)
    for arrays in zip(trains_X, trains_X_types, trains_X_speakers, trains_X_indices, trains_y)
]
testsets = [
    _dialog_to_dataset(*arrays)
    for arrays in zip(tests_X, tests_X_types, tests_X_speakers, tests_X_indices, tests_y)
]

# One loader per dialog; utterances are served in their original order.
train_loaders = []
for trainset in trainsets:
    train_loaders.append(DataLoader(trainset, shuffle=False, batch_size=batch_size))

test_loaders = []
for testset in testsets:
    test_loaders.append(DataLoader(testset, shuffle=False, batch_size=batch_size))

In [63]:
def train_model(model, optimizer, loss_criterion, epochs, threshold, checkpoint_path='best_model.pth'):
    """Train `model` and evaluate it on the held-out dialogs after every epoch.

    Batches come from the module-level `train_loaders` (training) and
    `test_loaders` (validation). Whenever the validation F1 score improves on
    the best one seen so far, the model's state dict is saved.

    Arguments:
        model: module called as model(samples, types, speakers, index); its
            outputs are compared with `threshold`, so one score per utterance
            is expected.
        optimizer: optimizer updating the parameters of `model`.
        loss_criterion: callable (outputs, labels) -> scalar loss tensor.
        epochs: (int) number of passes over the training loaders.
        threshold: (float) scores strictly above it are predicted as class 1.
        checkpoint_path: where the best state dict is written.

    Returns:
        List with the validation F1 score of every epoch.

    Note:
        Nothing is saved if the validation F1 never rises above 0.
    """
    history_val_f1 = []
    best_f1 = 0
    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        total_loss = 0.0
        num_batches = 0
        for train_loader in train_loaders:
            for (samples, types, speakers, index, labels) in train_loader:
                # Clear gradients w.r.t. parameters
                optimizer.zero_grad()

                # Forward pass, loss, backward pass, parameter update
                outputs = model(samples, types, speakers, index)
                loss = loss_criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                num_batches += 1

        # Guard against empty loaders instead of dividing by zero.
        avg_loss = total_loss / num_batches if num_batches else float('nan')

        # ---- Validation pass ----
        model.eval()
        score_batches = []
        label_batches = []
        with torch.no_grad():
            for test_loader in test_loaders:
                for (samples, types, speakers, index, labels) in test_loader:
                    outputs = model(samples, types, speakers, index)
                    # .cpu() is required before .numpy() when running on a GPU.
                    score_batches.append(outputs.detach().cpu().numpy().reshape(-1))
                    label_batches.append(labels.detach().cpu().numpy().reshape(-1))

        if score_batches:
            scores = np.concatenate(score_batches)
            true_labels = np.concatenate(label_batches).astype(int)
        else:
            scores = np.array([])
            true_labels = np.array([], dtype=int)

        # Convert predicted probabilities to binary predictions
        predicted_labels = (scores > threshold).astype(int)

        # Calculate F1 score
        f1 = f1_score(true_labels, predicted_labels)
        print(f'Validation Set Evaluation - F1 Score: {f1}, Average Loss: {avg_loss}')
        history_val_f1.append(f1)

        # Save the model when the F1 score beats the best one so far
        if f1 > best_f1:
            best_f1 = f1
            # This checkpoint can be reloaded later for inference
            torch.save(model.state_dict(), checkpoint_path)
    print("Best f1: ", best_f1)
    return (history_val_f1)



In [64]:
def plot_f1s(f1_values):
    """Plot the validation F1 score recorded after each epoch.

    Arguments:
        f1_values: sequence of F1 scores, one per epoch, in epoch order.
    """
    # Set plotting style (these calls change matplotlib's global settings)
    plt.style.use('bmh')
    plt.rc('axes', facecolor='none')
    plt.rc('figure', figsize=(16, 4))

    # Plot the F1 curve
    plt.plot(f1_values, label='Validation')
    plt.title('Validation F1 per epoch')
    plt.xlabel('Epoch')
    plt.ylabel('F1 score')
    plt.legend()
    plt.show()

def look_parameters(mdl):
    """Draw a histogram of all parameter values of `mdl`.

    Every parameter tensor is flattened and copied to the CPU, so the model
    may live on any device. Values are binned into 501 bins over [-0.5, 0.5];
    values outside that range are not shown.

    Arguments:
        mdl: module whose `parameters()` are inspected.
    """
    # reshape (not view) also copes with non-contiguous parameters.
    flat_params = [param.detach().reshape(-1).cpu() for param in mdl.parameters()]
    # Concatenate once instead of growing a tensor parameter by parameter.
    ws = torch.cat(flat_params).numpy() if flat_params else np.array([])
    plt.hist(ws, range=(-.5, .5), bins=501)

In [65]:
class LSTMModel(nn.Module):
    """Bidirectional 2-layer LSTM that scores each utterance.

    Every token embedding is concatenated with an embedding of the utterance's
    relation type, an embedding of its speaker, and its position feature; the
    LSTM output at the last time step goes through dropout, a linear layer and
    a sigmoid.
    """

    # Sizes of the learned embeddings for the relation type and the speaker.
    TYPE_EMBED_DIM = 8
    SPEAKER_EMBED_DIM = 2

    def __init__(self, input_dim, embed_dim, lstm_dim, hidden_dim, num_class,
                 num_types=None, num_speakers=None):
        """
        input_dim: (int) size of the vocabulary - required by embeddings
        embed_dim: (int) size of the word embeddings
        lstm_dim: (int) width of the LSTM input, i.e. embed_dim + 8 (type)
            + 2 (speaker) + the width of the index feature
        hidden_dim: (int) number of hidden units per LSTM direction
        num_class: (int) number of output units
        num_types: (int, optional) number of relation types; defaults to
            len(possible_types)
        num_speakers: (int, optional) number of speakers; defaults to
            len(possible_speakers)
        """
        super().__init__()
        if num_types is None:
            num_types = len(possible_types)
        if num_speakers is None:
            num_speakers = len(possible_speakers)

        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.embedding_type = nn.Embedding(num_types, self.TYPE_EMBED_DIM)
        self.embedding_speaker = nn.Embedding(num_speakers, self.SPEAKER_EMBED_DIM)
        self.lstm = nn.LSTM(lstm_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2)
        self.dropout = nn.Dropout(0.2)
        # Both directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(2*hidden_dim, num_class)

    def forward(self, text, type, speaker, index):
        r"""
        Arguments:
            text: tensor of token ids, shape [batch, seq_len]
            type: one-hot relation types, shape [batch, num_types]
            speaker: one-hot speakers, shape [batch, num_speakers]
            index: position feature, shape [batch, 1]

        Returns:
            Sigmoid scores of shape [batch, num_class].
        """
        # Use the actual sequence length of the batch rather than a global constant.
        seq_len = text.size(1)

        embedded = torch.relu(self.embedding(text))

        # Recover the category ids from the one-hot rows.
        indices_type = torch.argmax(type, dim=1)
        indices_speaker = torch.argmax(speaker, dim=1)

        embedded_type = self.embedding_type(indices_type)
        embedded_speaker = self.embedding_speaker(indices_speaker)

        # Repeat the per-utterance features at every time step:
        # [batch, d] -> [batch, seq_len, d]
        speaker_2 = embedded_speaker.unsqueeze(1).expand(-1, seq_len, -1)
        type_2 = embedded_type.unsqueeze(1).expand(-1, seq_len, -1)
        index_2 = index.unsqueeze(1).expand(-1, seq_len, -1)

        x = torch.cat((embedded, speaker_2, type_2, index_2), dim=2)
        lstm_out, _ = self.lstm(x)
        # NOTE(review): this reads the last time step; if `text` is right-padded,
        # that position is padding for short utterances. Packing the sequences
        # or using the final hidden states may work better - to be evaluated.
        lstm_out = lstm_out[:, -1, :]
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = torch.sigmoid(out)
        return out

In [66]:
# Seed PyTorch so weight initialisation and dropout are repeatable across runs
# (as far as the backend allows).
SEED = 42
torch.manual_seed(SEED)

input_dimension = len(tops) + 2 #add 2 for <unk> and <pad> symbols
embedding_dimension = 200
hidden_dimension = 128
output_dimension = 1
num_epochs = 100
l_rate = 0.0001
# LSTM input width = word embedding + type embedding (8) + speaker embedding (2) + index feature (1)
lstm_dimension = embedding_dimension + 8 + 2 + 1
# Scores strictly above this value are predicted as class 1
thresh = 0.4

lstm = LSTMModel(input_dimension, embedding_dimension, lstm_dimension, hidden_dimension, output_dimension)
lstm.to(device)

# The model ends with a sigmoid, so plain binary cross-entropy on probabilities is used
criterion_fx = nn.BCELoss()

optimizer_fx = torch.optim.Adam(lstm.parameters(), lr=l_rate)

val_f1 = train_model(lstm, optimizer_fx, criterion_fx, num_epochs, thresh)

plot_f1s(val_f1)

Validation Set Evaluation - F1 Score: 0.0, Average Loss: 0.43365930177476575
Validation Set Evaluation - F1 Score: 0.5512541684790488, Average Loss: 0.3949943793576479
Validation Set Evaluation - F1 Score: 0.5830289407136836, Average Loss: 0.35463565957961385
Validation Set Evaluation - F1 Score: 0.5935608145294442, Average Loss: 0.342272166774403
Validation Set Evaluation - F1 Score: 0.5969431078403623, Average Loss: 0.3334907419418214
Validation Set Evaluation - F1 Score: 0.6069268829026938, Average Loss: 0.3277851699755756
Validation Set Evaluation - F1 Score: 0.6177707676130388, Average Loss: 0.3247854394302506
Validation Set Evaluation - F1 Score: 0.6193720774883099, Average Loss: 0.32235856593529283
Validation Set Evaluation - F1 Score: 0.6205775721965245, Average Loss: 0.32080818608825457
Validation Set Evaluation - F1 Score: 0.6254434870755196, Average Loss: 0.31843556533888245
Validation Set Evaluation - F1 Score: 0.6289002557544757, Average Loss: 0.31605699005185517
Validatio

KeyboardInterrupt: 

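Hyperparameters to tune:

- `max_len` (number of tokens kept per utterance)
- `num_words` (vocabulary size cap; currently commented out)
- `hidden_dimension`
- `embedding_dimension`
- the dropout probability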

In [69]:
best_model = LSTMModel(input_dimension, embedding_dimension, lstm_dimension, hidden_dimension, output_dimension)
# map_location lets a checkpoint saved on one device be loaded on another.
best_model.load_state_dict(torch.load('best_model.pth', map_location=device))
# The inputs below are moved to `device`, so the model has to live there too.
best_model.to(device)
best_model.eval()

test_labels = {}

for transcription_id in test_set:
    # ---- Read the dialog: relation types from the .txt, utterances from the .json ----
    transcription_edges = ["-1 Beginning 0"]
    with open(path_to_test / f"{transcription_id}.txt", 'r') as file:
        # Skip blank lines: they carry no edge and would break the split below.
        transcription_edges.extend(line for line in file if line.strip())
    tmp_types = [d.split(" ")[1] for d in transcription_edges]

    types = [[1*(t==tmp_t) for t in possible_types] for tmp_t in tmp_types]

    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    utterances = []
    speakers = []
    index = []
    for utterance in transcription:
        speakers.append([1*(speaker==utterance["speaker"]) for speaker in possible_speakers])
        utterances.append(utterance["text"])
        index.append([utterance["index"]])

    # Express the position relative to the dialog length.
    index = np.array(index)/len(index)

    # ---- Build the tensors exactly as for the training data ----
    test_dataset = TensorDataset(torch.from_numpy(vectorize_sentences(utterances, max_len)).to(device),
                                 torch.from_numpy(np.array(types)).to(device),
                                 torch.from_numpy(np.array(speakers)).to(device),
                                 torch.from_numpy(np.array(index)).float().to(device))

    test_data_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

    # ---- Predict ----
    predicted_scores = []
    with torch.no_grad():
        # Distinct batch names avoid clobbering the per-dialog lists above.
        for (samples, batch_types, batch_speakers, batch_index) in test_data_loader:
            outputs = best_model(samples, batch_types, batch_speakers, batch_index)
            # .cpu() is required before .numpy() when running on a GPU.
            predicted_scores.extend(outputs.cpu().numpy().reshape(-1))

    # Plain Python ints so the result is JSON-serializable.
    predicted_labels = [1 if pred > thresh else 0 for pred in predicted_scores]
    test_labels[transcription_id] = predicted_labels

with open("test_labels_lstm_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

