In [4]:
import json
from pathlib import Path
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Feature Selection for training, Validation and testing sets.

We build the lists of training and test transcription ids, as in the provided example.

In [5]:
def flatten(list_of_list):
    """Concatenate the elements of every sub-list into a single flat list, keeping their order."""
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Directories holding the per-transcription files of each split.
path_to_training = Path("training")
path_to_test = Path("test")

# ---------------------------------------------------------------
# Training and test sets of transcription ids.
# Every meeting id is expanded into four session ids by appending
# 'a', 'b', 'c' and 'd'.
# ---------------------------------------------------------------
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = [m_id + s_id for m_id in training_set for s_id in 'abcd']
# Three session ids are dropped from the training list.
for _removed_id in ('IS1002a', 'IS1005d', 'TS3012c'):
    training_set.remove(_removed_id)

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = [m_id + s_id for m_id in test_set for s_id in 'abcd']

We split the training set into `train_set` and `val_set` (the latter is used for validation).

We one-hot encode the speakers and collect the texts for the train, validation, and test sets. We also collect the labels `y` for the train and validation sets.

In [6]:
total_text = []  # every utterance text read in this cell (train, val and test), in reading order

list_speaker = ['PM','ME','ID','UI']


def load_transcription(json_path):
    """Read one transcription JSON file.

    The file is expected to contain a list of utterances, each with a
    ``"text"`` and a ``"speaker"`` entry.

    Returns:
        (texts, speakers): ``texts`` is the list of utterance texts and
        ``speakers`` the matching list of one-hot vectors over ``list_speaker``.

    Raises:
        ValueError: if a speaker is not listed in ``list_speaker``.
    """
    with open(json_path, "r") as file:
        transcription = json.load(file)
    texts = []
    speakers = []
    for utterance in transcription:
        texts.append(utterance["text"])
        speaker_one_hot = [0] * len(list_speaker)
        speaker_one_hot[list_speaker.index(utterance["speaker"])] = 1
        speakers.append(speaker_one_hot)
    return texts, speakers


with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

X_train_text = {}   # transcription id -> list of utterance texts (train split)
X_train_speaker = {}   # transcription id -> list of one-hot speaker vectors
y_train = {}   # transcription id -> labels taken from training_labels.json

X_val_text = {}
X_val_speaker = {}
y_val = {}

# Validation ids: 20 of the transcriptions listed in training_set are held out.
val_set = ['ES2002a', 'ES2005b', 'ES2006c', 'ES2007d', 'ES2009a', 'ES2010b', 'ES2012c', 'ES2013d', 'ES2016a', 'IS1000b', 'IS1001c', 'IS1002d', 'IS1004a', 'IS1005b', 'IS1006c', 'IS1007d', 'TS3008a', 'TS3009b', 'TS3010c', 'TS3011d']
train_set = []  # filled below with every id of training_set that is not in val_set

# A validation id missing from training_set would never be loaded and would
# only surface later as a KeyError; fail early with a clear message instead.
assert set(val_set) <= set(training_set), "val_set contains ids that are not in training_set"

for transcription_id in training_set:
    texts, speakers = load_transcription(path_to_training / f"{transcription_id}.json")
    total_text.extend(texts)
    if transcription_id in val_set:
        y_val[transcription_id] = training_labels[transcription_id]
        X_val_text[transcription_id] = texts
        X_val_speaker[transcription_id] = speakers
    else:
        train_set.append(transcription_id)
        y_train[transcription_id] = training_labels[transcription_id]
        X_train_text[transcription_id] = texts
        X_train_speaker[transcription_id] = speakers

X_test_text = {}
X_test_speaker = {}
for transcription_id in test_set:
    texts, speakers = load_transcription(path_to_test / f"{transcription_id}.json")
    total_text.extend(texts)
    X_test_text[transcription_id] = texts
    X_test_speaker[transcription_id] = speakers


We will now retrieve, for each utterance, the previous text and the following texts. The previous text for the first utterance of each discussion is the placeholder "BEGGINING" (spelled this way in the code). The following texts are concatenated; this is why we stored the texts in a dictionary keyed by discussion rather than in one large array.

We will also retrieve a one-hot encoding of the description preceding the sentence. There are 16 possible descriptions, but we add one extra for the first text, resulting in a vector of size 17. Additionally, we obtain a sum of the one-hot encodings of the descriptions following this text, resulting in a vector of size 16.

Sometimes in the conversation, there are no sentences following another (leaf of the conversation tree), so the text afterward is simply "" (empty), and the description is encoded by [0] * 16.

We give an example at the end of the next cell.

In [7]:
list_description = ["Parallel", "Correction", "Q-Elab", "Conditional", "Alternation", "Narration", "Background","Continuation", "Explanation", "Elaboration" , "Acknowledgement", "Comment", "Result", "Question-answer_pair", "Contrast", "Clarification_question"]


def build_context_features(transcription_ids, relations_dir, texts_by_id):
    """Build the neighbour features of every utterance from the ``<id>.txt`` files.

    Every line of such a file is split on whitespace and read as
    ``<source index> <description name> <target index>``.

    Args:
        transcription_ids: ids of the transcriptions to process.
        relations_dir: directory (``Path``) containing the ``<id>.txt`` files.
        texts_by_id: dict mapping each id to its list of utterance texts.

    Returns:
        Four dicts keyed by transcription id:
        ``text_before`` (list of texts; first entry is "BEGGINING"),
        ``text_after`` (list of concatenated target texts, "" when none),
        ``description_before`` (one-hot vectors of size ``len(list_description) + 1``;
        slot 0 marks the first utterance),
        ``description_after`` (per-utterance counts of outgoing descriptions,
        size ``len(list_description)``).

    Side effect: appends "BEGGINING" to the global ``total_text`` once per
    transcription, so the placeholder is taken into account by the vocabulary.
    """
    num_descriptions = len(list_description)
    text_before = {}
    text_after = {}
    description_before = {}
    description_after = {}

    for transcription_id in transcription_ids:
        with open(relations_dir / f"{transcription_id}.txt", "r") as file:
            relation_lines = file.readlines()
        texts = texts_by_id[transcription_id]
        # The lists are sized "number of lines + 1".
        # NOTE(review): presumably every utterance except the first has exactly
        # one incoming line - confirm against the data format.
        num_entries = len(relation_lines) + 1

        before_texts = ["BEGGINING"]   # placeholder "previous text" of the first utterance
        total_text.append("BEGGINING")
        after_texts = [""] * num_entries
        after_counts = [[0] * num_descriptions for _ in range(num_entries)]

        first_one_hot = [0] * (num_descriptions + 1)
        first_one_hot[0] = 1   # extra slot 0 = "no description before" (first utterance)
        before_one_hots = [first_one_hot]

        for line in relation_lines:
            tab = line.split()
            description_index = list_description.index(tab[1])
            source_index = int(tab[0])
            target_index = int(tab[2])

            # "Before" features are appended in file order.
            # NOTE(review): this assumes the k-th line describes the link
            # arriving at utterance k+1 - confirm.
            one_hot = [0] * (num_descriptions + 1)
            one_hot[description_index + 1] = 1
            before_one_hots.append(one_hot)
            before_texts.append(texts[source_index])

            # "After" features are accumulated on the source utterance.
            after_counts[source_index][description_index] += 1
            # NOTE(review): the target texts are concatenated without any
            # separator, so the last token of one text and the first token of
            # the next can merge. Kept as is because the saved checkpoints
            # were produced with this preprocessing.
            after_texts[source_index] += texts[target_index]

        text_before[transcription_id] = before_texts
        text_after[transcription_id] = after_texts
        description_before[transcription_id] = before_one_hots
        description_after[transcription_id] = after_counts

    return text_before, text_after, description_before, description_after


# Same processing for the three splits (train first, then validation, then test).
(X_train_text_before, X_train_text_after,
 X_train_description_before, X_train_description_after) = build_context_features(
    train_set, path_to_training, X_train_text)

(X_val_text_before, X_val_text_after,
 X_val_description_before, X_val_description_after) = build_context_features(
    val_set, path_to_training, X_val_text)

(X_test_text_before, X_test_text_after,
 X_test_description_before, X_test_description_after) = build_context_features(
    test_set, path_to_test, X_test_text)


# Example: features of the first utterance (index 0) of transcription "TS3012d"

print(X_train_text["TS3012d"][0])  # the utterance itself
print(X_train_speaker["TS3012d"][0])    # one-hot encoding of its speaker
print(X_train_text_before["TS3012d"][0])    # the previous text (here the "BEGGINING" placeholder)
print(X_train_text_after["TS3012d"][0])    # concatenation of the following texts (here a single one)
print(X_train_description_before["TS3012d"][0])   # one-hot encoding of the preceding description (here the special first-utterance slot)
print(X_train_description_after["TS3012d"][0])    # sum of the one-hot encodings of the following descriptions

Can I close this ?
[1, 0, 0, 0]
BEGGINING
Uh we don't have any changes ,
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


Now, we no longer need the conversation segmentation (it was simpler for retrieving sentences afterward). Therefore, we concatenate all conversations into large vectors.

In [8]:
# Flatten the per-transcription dictionaries into one list per feature.
# Transcriptions are visited in the order of train_set / val_set / test_set,
# so the i-th element of every list of a split describes the same utterance.

# --- training split ---
X_train_text_tab = [row for tid in train_set for row in X_train_text[tid]]
X_train_speaker_tab = [row for tid in train_set for row in X_train_speaker[tid]]
X_train_text_before_tab = [row for tid in train_set for row in X_train_text_before[tid]]
X_train_text_after_tab = [row for tid in train_set for row in X_train_text_after[tid]]
X_train_description_before_tab = [row for tid in train_set for row in X_train_description_before[tid]]
X_train_description_after_tab = [row for tid in train_set for row in X_train_description_after[tid]]
y_train_tab = [label for tid in train_set for label in y_train[tid]]

# --- validation split ---
X_val_text_tab = [row for tid in val_set for row in X_val_text[tid]]
X_val_speaker_tab = [row for tid in val_set for row in X_val_speaker[tid]]
X_val_text_before_tab = [row for tid in val_set for row in X_val_text_before[tid]]
X_val_text_after_tab = [row for tid in val_set for row in X_val_text_after[tid]]
X_val_description_before_tab = [row for tid in val_set for row in X_val_description_before[tid]]
X_val_description_after_tab = [row for tid in val_set for row in X_val_description_after[tid]]
y_val_tab = [label for tid in val_set for label in y_val[tid]]

# --- test split (no labels) ---
X_test_text_tab = [row for tid in test_set for row in X_test_text[tid]]
X_test_speaker_tab = [row for tid in test_set for row in X_test_speaker[tid]]
X_test_text_before_tab = [row for tid in test_set for row in X_test_text_before[tid]]
X_test_text_after_tab = [row for tid in test_set for row in X_test_text_after[tid]]
X_test_description_before_tab = [row for tid in test_set for row in X_test_description_before[tid]]
X_test_description_after_tab = [row for tid in test_set for row in X_test_description_after[tid]]

We use the same tokenizer and vectorizer as in the TD (lab session); for us they gave the best results.

In [9]:
tokenizer = get_tokenizer("basic_english")

words=[]
num_words = 1500   # number of most frequent tokens kept in the vocabulary (special tokens excluded)

# NOTE(review): the purpose of this extra "First" entry is not evident from the
# notebook. Also note that it is appended again each time this cell is re-run.
total_text.append("First")

for text in total_text:
    words.extend(tokenizer(text))

# Keep the num_words most frequent tokens. The limit used to be a second
# hard-coded 1500, which would silently disagree with num_words (and with
# input_dim = num_words + 2 used for the embedding layers) if it were changed.
top = dict(Counter(words).most_common(num_words))
vocab = torchtext.vocab.vocab(top, specials = ['<unk>', '<pad>'])

# Tokens outside the vocabulary are mapped to <unk>.
vocab.set_default_index(vocab['<unk>'])

In [10]:
max_len=60   # our first hyperparameter: number of token ids kept per text

def vectorize_sentences(reviews, max_len):
    """Turn texts into fixed-length rows of vocabulary indices.

    Each text is tokenized with the global ``tokenizer`` and mapped to indices
    with the global ``vocab``. Rows longer than ``max_len`` are truncated and
    shorter rows are filled on the right with the ``<pad>`` index.

    Returns a NumPy array with one row per text.
    """
    pad_index = vocab['<pad>']
    rows = []
    for text in reviews:
        token_ids = vocab.forward(tokenizer(text))[:max_len]
        if len(token_ids) < max_len:
            padded = np.full(max_len, pad_index)
            padded[:len(token_ids)] = token_ids
            token_ids = padded
        rows.append(np.array(token_ids))
    return np.array(rows)

We vectorize our data 

In [11]:
def _to_index_rows(texts):
    """Vectorize ``texts`` with the notebook-wide ``max_len``."""
    return vectorize_sentences(texts, max_len)


# --- training split ---
X_tr_text_vector, X_tr_text_before_vector, X_tr_text_after_vector = (
    _to_index_rows(texts)
    for texts in (X_train_text_tab, X_train_text_before_tab, X_train_text_after_tab)
)
X_tr_speaker_vector = np.array(X_train_speaker_tab)
X_tr_description_before_vector = np.array(X_train_description_before_tab)
X_tr_description_after_vector = np.array(X_train_description_after_tab)
y_tr_vector = np.array(y_train_tab).reshape(-1, 1)   # column vector, one label per row

# --- validation split ---
X_va_text_vector, X_va_text_before_vector, X_va_text_after_vector = (
    _to_index_rows(texts)
    for texts in (X_val_text_tab, X_val_text_before_tab, X_val_text_after_tab)
)
X_va_speaker_vector = np.array(X_val_speaker_tab)
X_va_description_before_vector = np.array(X_val_description_before_tab)
X_va_description_after_vector = np.array(X_val_description_after_tab)
y_va_vector = np.array(y_val_tab).reshape(-1, 1)

We convert our data to tensors and split it into batches. Note that all samples are shuffled, so the per-discussion structure is not preserved.

In [12]:
batch_size = 64   # our second hyperparameter


def _index_tensor(array):
    """Tensor of token indices (dtype inferred from the array), moved to ``device``."""
    return torch.tensor(array).to(device)


def _float_tensor(array):
    """float32 tensor moved to ``device``."""
    return torch.tensor(array, dtype=torch.float32).to(device)


# --- training tensors ---
X_tr_text_tensor = _index_tensor(X_tr_text_vector)
X_tr_text_before_tensor = _index_tensor(X_tr_text_before_vector)
X_tr_text_after_tensor = _index_tensor(X_tr_text_after_vector)
X_tr_description_before_tensor = _float_tensor(X_tr_description_before_vector)
X_tr_description_after_tensor = _float_tensor(X_tr_description_after_vector)
X_tr_speaker_tensor = _float_tensor(X_tr_speaker_vector)
y_tr_tensor = _float_tensor(y_tr_vector)

# --- validation tensors ---
X_va_text_tensor = _index_tensor(X_va_text_vector)
X_va_text_before_tensor = _index_tensor(X_va_text_before_vector)
X_va_text_after_tensor = _index_tensor(X_va_text_after_vector)
X_va_description_before_tensor = _float_tensor(X_va_description_before_vector)
X_va_description_after_tensor = _float_tensor(X_va_description_after_vector)
X_va_speaker_tensor = _float_tensor(X_va_speaker_vector)
y_va_tensor = _float_tensor(y_va_vector)

# Field order of every sample:
# text, text_before, text_after, description_before, description_after, speaker, y
trainset = TensorDataset(
    X_tr_text_tensor,
    X_tr_text_before_tensor,
    X_tr_text_after_tensor,
    X_tr_description_before_tensor,
    X_tr_description_after_tensor,
    X_tr_speaker_tensor,
    y_tr_tensor,
)
valset = TensorDataset(
    X_va_text_tensor,
    X_va_text_before_tensor,
    X_va_text_after_tensor,
    X_va_description_before_tensor,
    X_va_description_after_tensor,
    X_va_speaker_tensor,
    y_va_tensor,
)

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(valset, batch_size=batch_size, shuffle=True)

Here are some of our models; we try to show how we made them progressively more complex. Our best model is GRU_LSTM_final. Some models use only the sentence itself, while others use additional information.

In [13]:
class LinearModel(nn.Module):
    """Baseline classifier: embedding -> flatten -> linear -> ReLU -> linear -> sigmoid.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: width of the hidden linear layer.
        max_len: number of tokens per input row; fixes the input width of ``fc1``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, max_len):
        super(LinearModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.fc1 = nn.Linear(max_len * embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Only ``text`` is used; the other arguments exist so that every model of
        the notebook can be called with the same signature.
        """
        embedded = self.embedding(text)

        # Flatten each sample to one row. The row width is derived from the
        # tensor itself instead of the notebook globals `max_len` and
        # `embedding_dim`, which later cells reassign (the model would then
        # reshape with the wrong width or fail).
        embedded = embedded.view(embedded.size(0), -1)

        out = torch.relu(self.fc1(embedded))
        out = self.fc2(out)
        return torch.sigmoid(out)

In [14]:
class ConvModel(nn.Module):
    """Text CNN: three parallel convolutions (window sizes 3, 4 and 5 tokens),
    max-pooling over the sequence, then a linear layer and a sigmoid.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        num_filter: number of filters of each convolution.
        max_len: number of tokens per input row; fixes the pooling window sizes.
    """

    def __init__(self, input_dim, embedding_dim, num_filter, max_len):
        super(ConvModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # Each kernel spans the full embedding width, so it slides over tokens only.
        self.conv_0 = nn.Conv2d(1, num_filter, (3, embedding_dim))
        self.conv_1 = nn.Conv2d(1, num_filter, (4, embedding_dim))
        self.conv_2 = nn.Conv2d(1, num_filter, (5, embedding_dim))

        # Pooling windows cover the whole convolution output (max_len - k + 1 positions).
        self.maxpool_0 = nn.MaxPool2d((max_len - 3 + 1, 1))
        self.maxpool_1 = nn.MaxPool2d((max_len - 4 + 1, 1))
        self.maxpool_2 = nn.MaxPool2d((max_len - 5 + 1, 1))

        self.fc = nn.Linear(num_filter * 3, 1)

        self.dropout = nn.Dropout(0.5)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Only ``text`` is used; the other arguments exist so that every model of
        the notebook can be called with the same signature.
        """
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)   # add the channel dimension expected by Conv2d

        x0 = self.maxpool_0(torch.relu(self.conv_0(embedded)))
        x1 = self.maxpool_1(torch.relu(self.conv_1(embedded)))
        x2 = self.maxpool_2(torch.relu(self.conv_2(embedded)))

        x = torch.cat((x0, x1, x2), dim=1)
        x = x.view(x.size(0), -1)

        # Dropout regularises the pooled features. It used to be applied, after
        # a ReLU, to the single output logit: the ReLU clamped the logit to
        # >= 0, so the sigmoid could never output less than 0.5, and dropout
        # randomly replaced training logits by 0.
        x = self.dropout(x)
        x = self.fc(x)

        return torch.sigmoid(x)

In [15]:
class LSTMModel(nn.Module):
    """Two-layer bidirectional LSTM classifier that uses only the utterance text."""

    def __init__(self, input_dim, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=2,
            dropout=0.4,
            bidirectional=True,
            batch_first=True,
        )
        # The classifier reads two hidden states side by side, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Every argument except ``text`` is ignored; they are accepted so that all
        models of the notebook share one call signature.
        """
        token_vectors = self.dropout(self.embedding(text))
        _, (hidden, _) = self.lstm(token_vectors)
        # The last two entries of the final hidden state are concatenated
        # (the two directions of the top layer, per the nn.LSTM layout).
        summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        summary = self.dropout(summary)
        return torch.sigmoid(self.fc(summary))

In [16]:
class GRU(nn.Module):
    """Three-layer unidirectional GRU classifier that uses only the utterance text."""

    def __init__(self, input_dim, embed_dim, hidden_dim):
        super().__init__()
        # The attribute name (including its spelling) is part of the
        # state_dict keys, so it must not be renamed.
        self.embeding = nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=3, dropout=0.5, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Every argument except ``text`` is ignored; they are accepted so that all
        models of the notebook share one call signature.
        """
        sequence_states, _ = self.gru(self.embeding(text))
        last_state = sequence_states[:, -1, :]   # GRU output at the last position
        return torch.sigmoid(self.linear(last_state))

In [17]:
class MultiInputModel(nn.Module):
    """LSTM classifier fed with token embeddings plus both description vectors.

    The LSTM input width is ``embed_dim + 17 + 16``: the 17-wide
    ``description_before`` and 16-wide ``description_after`` vectors are
    appended to every token embedding.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, hidden_dim_2):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim + 17 + 16, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, hidden_dim_2)
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim_2, 1)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Uses ``text``, ``description_before`` and ``description_after``; the
        remaining arguments are ignored.
        """
        seq_len = text.size(1)
        # Repeat each per-utterance description vector at every token position.
        before_per_token = description_before.unsqueeze(1).expand(-1, seq_len, -1)
        after_per_token = description_after.unsqueeze(1).expand(-1, seq_len, -1)

        features = torch.cat((self.embedding(text), before_per_token, after_per_token), dim=2)
        features = torch.relu(features)

        sequence_states, _ = self.lstm(features)
        last_state = self.dropout(sequence_states[:, -1, :])

        hidden = torch.relu(self.fc(last_state))
        return torch.sigmoid(self.fc2(hidden))

In [18]:
class GRU_LSTM(nn.Module):
    """Classifier combining a 3-layer GRU and a 1-layer LSTM over the same input.

    Both recurrent networks read the token embeddings extended with the 16-wide
    ``description_after`` vector; their last outputs are concatenated and fed
    to a single linear layer.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, hidden_dim_2):
        super().__init__()
        # The attribute name (including its spelling) is part of the
        # state_dict keys used by the saved checkpoints; do not rename it.
        self.embeding = nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim + 16, hidden_dim, num_layers=3, dropout=0.5, batch_first=True)
        self.linear = nn.Linear(hidden_dim + hidden_dim_2, 1)
        self.lstm = nn.LSTM(embed_dim + 16, hidden_dim_2, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.5)

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per row of ``text``, shape (batch, 1).

        Uses ``text`` and ``description_after``; the remaining arguments are ignored.
        """
        # Repeat the description vector at every token position.
        after_per_token = description_after.unsqueeze(1).expand(-1, text.size(1), -1)
        features = torch.relu(torch.cat((self.embeding(text), after_per_token), dim=2))

        gru_states, _ = self.gru(features)
        gru_summary = self.dropout(gru_states[:, -1, :])

        lstm_states, _ = self.lstm(features)
        lstm_summary = self.dropout2(lstm_states[:, -1, :])

        combined = torch.cat([gru_summary, lstm_summary], dim=1)
        return torch.sigmoid(self.linear(combined))

In [19]:
class GRU_LSTM_final(nn.Module):
    """Final model: separate GRU and bidirectional LSTM encoders for the
    utterance, its previous text and its following text, combined with the
    description and speaker vectors.

    Args:
        input_dim: number of rows of the shared embedding table.
        embed_dim: size of each token embedding.
        hidden_dim: hidden size of the three GRUs.
        hidden_dim_2: hidden size (per direction) of the three LSTMs.
        num_layers: number of stacked layers of every recurrent network.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, hidden_dim_2, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)

        gru_options = dict(num_layers=num_layers, dropout=0.5, batch_first=True)
        self.gru = nn.GRU(embed_dim, hidden_dim, **gru_options)    # encodes text_before
        self.gru2 = nn.GRU(embed_dim, hidden_dim, **gru_options)   # encodes text
        self.gru3 = nn.GRU(embed_dim, hidden_dim, **gru_options)   # encodes text_after

        lstm_options = dict(num_layers=num_layers, dropout=0.5, bidirectional=True, batch_first=True)
        self.lstm = nn.LSTM(embed_dim, hidden_dim_2, **lstm_options)    # encodes text_before
        self.lstm2 = nn.LSTM(embed_dim, hidden_dim_2, **lstm_options)   # encodes text
        self.lstm3 = nn.LSTM(embed_dim, hidden_dim_2, **lstm_options)   # encodes text_after

        # Three GRU summaries of width hidden_dim each.
        self.linear_gru = nn.Linear(hidden_dim * 3, 64)

        # Three bidirectional LSTM summaries of width 2 * hidden_dim_2 each.
        self.linear_lstm = nn.Linear(hidden_dim_2 * 6, 64)

        # 64 (GRU branch) + 64 (LSTM branch) + 17 + 16 (descriptions) + 4 (speaker).
        self.linear2 = nn.Linear(64 * 2 + 17 + 16 + 4, 1)

        self.dropout = nn.Dropout(0.3)

    @staticmethod
    def _last_step(sequence_output):
        """Select the recurrent output at the last sequence position."""
        # NOTE(review): the notebook's vectorize_sentences pads on the right, so
        # for short texts this position is padding; for the bidirectional LSTMs
        # the backward half at this position has only read the last token.
        # Kept as is so that the saved checkpoints remain valid.
        return sequence_output[:, -1, :]

    def forward(self, text, text_before, text_after, description_before, description_after, speaker):
        """Return one probability per sample, shape (batch, 1). All six inputs are used."""
        embedded_text = self.dropout(self.embedding(text))
        embedded_before = self.dropout(self.embedding(text_before))
        embedded_after = self.dropout(self.embedding(text_after))

        # GRU branch
        gru_before, _ = self.gru(embedded_before)
        gru_text, _ = self.gru2(embedded_text)
        gru_after, _ = self.gru3(embedded_after)
        gru_summary = torch.cat(
            [self._last_step(gru_text), self._last_step(gru_before), self._last_step(gru_after)],
            dim=1,
        )
        out1 = self.linear_gru(gru_summary)

        # LSTM branch
        lstm_before, _ = self.lstm(embedded_before)
        lstm_text, _ = self.lstm2(embedded_text)
        lstm_after, _ = self.lstm3(embedded_after)
        lstm_summary = torch.cat(
            [self._last_step(lstm_text), self._last_step(lstm_before), self._last_step(lstm_after)],
            dim=1,
        )
        out2 = self.linear_lstm(lstm_summary)

        # Join both branches with the description and speaker vectors.
        combined = torch.cat([out1, out2, description_before, description_after, speaker], dim=1)
        return torch.sigmoid(self.linear2(combined))

We can now create our model. There are many hyperparameters, but some models only take a few of them.

In [20]:
# Our different hyperparameters
input_dim = num_words + 2 # embedding table size: vocabulary words plus the <unk> and <pad> symbols
embedding_dim = 50
num_filter = 25   # not passed to GRU_LSTM_final below
hidden_dim = 32
hidden_dim_2 = 16
num_layers = 2

# Build the model to train and move it to the selected device.
model = GRU_LSTM_final(input_dim, embedding_dim, hidden_dim, hidden_dim_2, num_layers)
model.to(device)

GRU_LSTM_final(
  (embedding): Embedding(1502, 50)
  (gru): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (gru2): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (gru3): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (lstm): LSTM(50, 16, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (lstm2): LSTM(50, 16, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (lstm3): LSTM(50, 16, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear_gru): Linear(in_features=96, out_features=64, bias=True)
  (linear_lstm): Linear(in_features=96, out_features=64, bias=True)
  (linear2): Linear(in_features=165, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

Here is our training function. At each epoch we print the F1 score on the validation set for several thresholds. We keep the best model in terms of F1 score on the validation set.

In [21]:
best_f1 = 0


def train_model(model, lr, best_f1, num_epochs=5):
    """Train ``model`` on the global ``train_loader`` and evaluate it on ``val_loader``.

    After every epoch the validation F1 score is printed for the thresholds
    0.3, 0.20 and 0.4 (in that order), together with the mean training loss.
    Whenever the F1 score at threshold 0.3 exceeds ``best_f1``, the model
    weights are saved to ``best_model.pth``.

    Args:
        model: module called with the six input tensors of a batch.
        lr: learning rate of the Adam optimizer.
        best_f1: best validation F1 reached so far (checkpoint threshold).
        num_epochs: number of passes over the training data (default 5).

    Returns:
        (history_val_f1, best_f1): the per-epoch F1 scores at threshold 0.3 and
        the updated best score.
    """
    loss_criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    history_val_f1 = []

    for epoch in range(num_epochs):
        model.train()
        loss_tot = 0.0
        num_batches = 0
        for text, text_before, text_after, description_before, description_after, speaker, y in train_loader:
            optimizer.zero_grad()
            outputs = model(text, text_before, text_after, description_before, description_after, speaker)
            loss = loss_criterion(outputs, y)
            loss.backward()
            optimizer.step()
            # .item() extracts a plain float; accumulating the tensor itself
            # would keep every batch's autograd graph alive during the epoch.
            loss_tot += loss.item()
            num_batches += 1
        # Guard against an empty training loader.
        loss_tot = loss_tot / max(num_batches, 1)

        model.eval()

        true_labels = []
        predicted_labels = []

        with torch.no_grad():
            for text, text_before, text_after, description_before, description_after, speaker, y in val_loader:
                outputs = model(text, text_before, text_after, description_before, description_after, speaker)
                # Flatten the (batch, 1) tensors into plain Python lists.
                predicted_labels.extend(outputs.cpu().numpy().reshape(-1).tolist())
                true_labels.extend(y.cpu().numpy().reshape(-1).astype(int).tolist())

        predicted_labels1 = [1 if pred > 0.3 else 0 for pred in predicted_labels]
        predicted_labels2 = [1 if pred > 0.20 else 0 for pred in predicted_labels]
        predicted_labels3 = [1 if pred > 0.4 else 0 for pred in predicted_labels]

        f11 = f1_score(true_labels, predicted_labels1)
        f12 = f1_score(true_labels, predicted_labels2)
        f13 = f1_score(true_labels, predicted_labels3)

        print(f'{epoch} val Set Evaluation - F1 Score: {f11}' + f'     loss : {loss_tot}')
        print(f'{epoch} val Set Evaluation - F1 Score: {f12}' + f'     loss : {loss_tot}')
        print(f'{epoch} val Set Evaluation - F1 Score: {f13}' + f'     loss : {loss_tot}')
        print("")

        history_val_f1.append(f11)

        # Model selection uses the F1 score at threshold 0.3.
        if f11 > best_f1:
            best_f1 = f11
            torch.save(model.state_dict(), 'best_model.pth')

    print(best_f1)
    return (history_val_f1, best_f1)

In [22]:
# Train `model` with learning rate 0.003; keeps the per-epoch validation F1 history and the updated best score.
history_val_f1, best_f1 = train_model(model, 0.003, best_f1)

KeyboardInterrupt: 

In [None]:
def plot_losses(history_val_f1):
    """Plot the validation F1 history as a single line labelled 'Validation'.

    Despite its name, the function plots the F1 values it receives, not losses.
    The style and rc settings are applied through pyplot's global state.
    """
    plt.style.use('bmh')
    for group, settings in (('axes', {'facecolor': 'none'}), ('figure', {'figsize': (16, 4)})):
        plt.rc(group, **settings)

    plt.plot(history_val_f1, label='Validation')
    plt.title('F1 Graph')
    plt.legend()
    plt.show()

We have already trained some good models that you can test on the validation set.

In [23]:
# Each checkpoint is loaded into a model built with the hyperparameters it was
# trained with. NOTE: the assignments below overwrite the notebook-wide
# hyperparameter variables (embedding_dim, hidden_dim, hidden_dim_2, num_layers).
#
# map_location=device lets checkpoints load on a machine without the device
# they were saved from, and .to(device) puts every model on the same device as
# the tensors served by the data loaders.

embedding_dim = 70
hidden_dim = 32
hidden_dim_2 = 16

model1 = GRU_LSTM(input_dim, embedding_dim, hidden_dim, hidden_dim_2)
model1.load_state_dict(torch.load('best_modelGL.pth', map_location=device))
model1.to(device)
model1.eval()


embedding_dim = 30
hidden_dim = 32

model2 = GRU(input_dim, embedding_dim, hidden_dim)
model2.load_state_dict(torch.load('best_modelG1.pth', map_location=device))
model2.to(device)
model2.eval()


embedding_dim = 50
hidden_dim = 16
hidden_dim_2 = 32
num_layers = 2

model3 = GRU_LSTM_final(input_dim, embedding_dim, hidden_dim, hidden_dim_2, num_layers)
model3.load_state_dict(torch.load('best_modelGL2.pth', map_location=device))
model3.to(device)
model3.eval()


embedding_dim = 50
hidden_dim = 32
hidden_dim_2 = 32
num_layers = 2

model4 = GRU_LSTM_final(input_dim, embedding_dim, hidden_dim, hidden_dim_2, num_layers)
model4.load_state_dict(torch.load('best_modelGL2_2.pth', map_location=device))
model4.to(device)
model4.eval()

GRU_LSTM_final(
  (embedding): Embedding(1502, 50)
  (gru): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (gru2): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (gru3): GRU(50, 32, num_layers=2, batch_first=True, dropout=0.5)
  (lstm): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (lstm2): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (lstm3): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear_gru): Linear(in_features=96, out_features=64, bias=True)
  (linear_lstm): Linear(in_features=192, out_features=64, bias=True)
  (linear2): Linear(in_features=165, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

We have implemented a function that computes a weighted sum of the outputs of our different models (the outputs are real-valued scores, so they can be summed), using a different weight for each model. It improved our results a lot.

In [24]:
def train_fusion_model(models, alphas, threshold=0.3):
    """Evaluate a weighted average of several models on the global ``val_loader``.

    Despite its name, nothing is trained: every model is put in eval mode, its
    outputs are averaged with the normalised weights ``alphas[i] / sum(alphas)``,
    the result is thresholded and the F1 score is printed.

    Args:
        models: sequence of modules called with the six input tensors of a batch.
        alphas: one weight per model.
        threshold: scores strictly above it are predicted as 1 (default 0.3).

    Returns:
        (alphas, f1): the weights as given and the validation F1 score.

    Raises:
        ValueError: if ``models`` and ``alphas`` do not have the same length.
    """
    if len(models) != len(alphas):
        raise ValueError("models and alphas must have the same length")

    total = sum(alphas)
    weights = [alpha / total for alpha in alphas]
    for model in models:
        model.eval()

    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for text, text_before, text_after, description_before, description_after, speaker, y in val_loader:
            outputs_final = np.zeros(len(text), dtype=np.float64)
            for weight, model in zip(weights, models):
                outputs = model(text, text_before, text_after, description_before, description_after, speaker)
                # Convert each output explicitly: building a NumPy array from a
                # list of tensors fails for tensors on a GPU and is very slow
                # for tensors on the CPU.
                outputs_final += weight * outputs.cpu().numpy().reshape(-1)
            predicted_labels.extend(outputs_final.tolist())
            true_labels.extend(y.cpu().numpy().reshape(-1).astype(int).tolist())

    predicted_labels2 = [1 if pred > threshold else 0 for pred in predicted_labels]
    f1 = f1_score(true_labels, predicted_labels2)
    print(f1)

    return alphas, f1

In [25]:
# Validation F1 of the ensemble, with weight 2 for model1 and weight 1 for each of model2, model3 and model4.
alphas, f1 = train_fusion_model([model1, model2, model3, model4], [2.,1.,1.,1.])

KeyboardInterrupt: 

Now we just have to transform our test data in the same way as the training data, apply the models that gave the best result on the validation set, and create the submission. We keep the same threshold for the test set.

In [None]:
test_labels = {}

# Ensemble used for the submission, as (model, weight) pairs in summation order.
# NOTE(review): these weights (model1: 1, model2: 2, model3: 1, model4: 2)
# differ from the [2, 1, 1, 1] weights passed to train_fusion_model for
# model1..model4 in the validation cell. The behaviour is kept unchanged here;
# confirm which weighting is the intended one.
ensemble = [(model2, 2), (model3, 1), (model1, 1), (model4, 2)]
total_weight = sum(weight for _, weight in ensemble)
submission_threshold = 0.3   # scores strictly above it are labelled 1

for member, _ in ensemble:
    member.to(device)   # the input tensors below are created on `device`
    member.eval()

for transcription_id in test_set:
    utterances = X_test_text[transcription_id]
    if len(utterances) == 0:
        # Nothing to predict for an empty transcription.
        test_labels[transcription_id] = []
        continue

    text = torch.tensor(vectorize_sentences(utterances, max_len)).to(device)
    before = torch.tensor(vectorize_sentences(X_test_text_before[transcription_id], max_len)).to(device)
    after = torch.tensor(vectorize_sentences(X_test_text_after[transcription_id], max_len)).to(device)
    description_before = torch.tensor(np.array(X_test_description_before[transcription_id]), dtype=torch.float32).to(device)
    description_after = torch.tensor(np.array(X_test_description_after[transcription_id]), dtype=torch.float32).to(device)
    speaker = torch.tensor(np.array(X_test_speaker[transcription_id]), dtype=torch.float32).to(device)

    # The whole transcription is scored in a single forward pass per model.
    with torch.no_grad():
        weighted_sum = None
        for member, weight in ensemble:
            scores = member(text, before, after, description_before, description_after, speaker).cpu().numpy()
            term = weight * scores
            weighted_sum = term if weighted_sum is None else weighted_sum + term
    probabilities = (weighted_sum / total_weight).reshape(-1)

    # Convert predicted probabilities to binary predictions
    test_labels[transcription_id] = [1 if pred > submission_threshold else 0 for pred in probabilities]

with open("test_labels.json", "w") as file:
    json.dump(test_labels, file, indent=4)

In [None]:
def make_submission(json_path: Path = Path("test_labels.json"), csv_path: Path = Path("submission.csv")):
    """Convert the predicted labels stored as JSON into the submission CSV.

    The JSON file maps each key to a list of values. The CSV gets the header
    ``id,target_feature`` followed by one row ``<key>_<index>,<value>`` per
    list element, in the order of the JSON file.

    Args:
        json_path: path of the JSON file to read.
        csv_path: path of the CSV file to write (default ``submission.csv``).
    """
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    # The context manager closes the CSV even if writing fails part-way.
    with open(csv_path, "w") as file:
        file.write("id,target_feature\n")
        for key, value in test_labels.items():
            for index, label in enumerate(value):
                file.write(f"{key}_{index},{label}\n")

# Write the submission CSV from the predictions saved in test_labels.json.
make_submission(Path("test_labels.json"))