In [1]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.metrics import confusion_matrix
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

def read_transcription(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def create_dataframe(dialogue_id, transcription):
    rows = []

    # Iterate through all sentences in the transcription
    for index, sentence in enumerate(transcription):
        speaker = sentence['speaker']

        # Get the sentence text
        text = sentence['text']

        # Add a row to the DataFrame
        rows.append({
            'dialogue_id': dialogue_id,
            'index': index,
            'text': text,
            'speaker_text': speaker,
        })

    # Create the DataFrame
    df = pd.DataFrame(rows)

    return df

# Function to get labels for a dialogue
def get_label(dialogue_id, index,labels_data):
    return labels_data.get(dialogue_id, [])[index]

#Fonctions
def compter_mots(phrase):
    mots = phrase.split()  # Divisez la phrase en mots en utilisant les espaces comme délimiteurs
    return len(mots)

In [3]:
path_to_training = Path("data/training")
path_to_test = Path("data/test")

training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

# Créer le DataFrame pour l'ensemble d'entraînement
dfs = []
for dialogue_id in training_set:
    transcription_data = read_transcription(path_to_training / f'{dialogue_id}.json')
    df = create_dataframe(dialogue_id, transcription_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

# Ajouter la colonne 'label' à df en utilisant la fonction get_label
with open("data/training_labels.json", 'r') as file:
    labels_data = json.load(file)

df['label'] = df.apply(lambda row: get_label(row['dialogue_id'], row['index'], labels_data), axis=1)

# Créer le DataFrame pour l'ensemble de test
dfs_test = []
for dialogue_id in test_set:
    transcription_data = read_transcription(path_to_test / f'{dialogue_id}.json')
    df_test = create_dataframe(dialogue_id, transcription_data)
    dfs_test.append(df_test)

df_test = pd.concat(dfs_test, ignore_index=True)

#Ajout features

df['nb_mots'] = df['text'].apply(compter_mots)
df['nb_interrogations'] = df['text'].apply(lambda x: x.count('?'))
df['interjections'] = df['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df['nb_words_more_5'] = df['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df = pd.concat([df, pd.get_dummies(df['speaker_text'], prefix='speaker_text', dtype=int)], axis=1) 


df_test['nb_mots'] = df_test['text'].apply(compter_mots)
df_test['nb_interrogations'] = df_test['text'].apply(lambda x: x.count('?'))
df_test['interjections'] = df_test['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test['nb_words_more_5'] = df_test['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test = pd.concat([df_test, pd.get_dummies(df_test['speaker_text'], prefix='speaker_text', dtype=int)], axis=1) 

import difflib
import networkx as nx

def calculate_similarity(str1, str2):
    seq = difflib.SequenceMatcher(None, str1, str2)
    return seq.ratio()

def calculate_max_similarity(graph, node):
    neighbors = list(graph.neighbors(node))
    if not neighbors:
        return 0.0  # Si le nœud n'a pas de voisins, la similarité est zéro
    similarities = [calculate_similarity(graph.nodes[node]['text'], graph.nodes[neighbor]['text']) for neighbor in neighbors]
    return max(similarities)

for node in graph.nodes:
    graph.nodes[node]['similarities'] = calculate_max_similarity(graph, node)
    
df.head()
df_test.head(25)

In [4]:
def extract_training():

    N_files = len(training_set)
    D_embedding = 384

    graphs = [] 

    # lets got throug all training graphs
    for k, transcription_id in enumerate(training_set):
        #nodes
        bert_array = np.load('training/' + transcription_id +'.npy')
        x = torch.tensor(bert_array.reshape(-1,384), dtype=torch.float)
        #edges
        edges = [[] for _ in range(16)]
        with open(path_to_training / f"{transcription_id}.txt", "r") as graphe:
            for line in graphe:
                l = line.split()
                i = int(l[0])
                j =  int(l[2])
                edge_type = label2int[l[1]] - 1
                edges[edge_type].append([i,j])
        edges = [torch.tensor(edges[k]).t().contiguous() for k in range(16)]
        #labels
        with open("data/training_labels.json", "r") as file:
            training_labels = json.load(file)
        labels = torch.tensor(np.array(training_labels[transcription_id]))
        graph = Data(x=x, edge_index=edges, y=labels)
        graphs.append(graph)
    return graphs 


def extract_test():

    N_files = len(training_set)
    D_embedding = 384

    graphs = [] 

    # lets got throug all training graphs
    for k, transcription_id in enumerate(test_set):
        #nodes
        bert_array = np.load('test/' + transcription_id +'.npy')
        x = torch.tensor(bert_array.reshape(-1,384), dtype=torch.float)
        #edges
        edges = [[] for _ in range(16)]
        with open(path_to_test / f"{transcription_id}.txt", "r") as graphe:
            for line in graphe:
                l = line.split()
                i = int(l[0])
                j =  int(l[2])
                edge_type = label2int[l[1]] - 1
                edges[edge_type].append([i,j])
        edges = [torch.tensor(edges[k]).t().contiguous() for k in range(16)]
        
        graph = Data(x=x, edge_index=edges)
        graphs.append(graph)
    return graphs

def f1_score(y_pred, y_real):
    conf_matrix = confusion_matrix(y_real, y_pred)
    tp, fp, fn, tn = conf_matrix[1, 1], conf_matrix[0, 1], conf_matrix[1, 0], conf_matrix[0, 0]
    if (tp + fp) == 0:
        return 0
    if (tp + fn) == 0:
        return 0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        return 0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [5]:
graph_links_labels= set()
for id in training_set:
    with open(path_to_training / f"{id}.txt", "r") as graphe:
        for line in graphe:
            l = line.split()
            graph_links_labels.add(l[1])
L = list(graph_links_labels)

print(type(graph_links_labels))

int2label = {indice: valeur for indice, valeur in enumerate(L)}
label2int = {valeur: indice for indice, valeur in enumerate(L)}

N_vocab_links = len(L)
print(N_vocab_links)

nb_test = 20

data = extract_training()
data_test_kagle = extract_test()

data_train = data[:-nb_test]
data_test = data[-nb_test:]
train_loader = DataLoader(data_train)
test_loader = DataLoader(data_test)


<class 'set'>
16


In [6]:
import difflib
import networkx as nx

def calculate_similarity(str1, str2):
    seq = difflib.SequenceMatcher(None, str1, str2)
    return seq.ratio()

def calculate_max_similarity(graph, node):
    neighbors = list(graph.neighbors(node))
    if not neighbors:
        return 0.0  # Si le nœud n'a pas de voisins, la similarité est zéro
    similarities = [calculate_similarity(graph.nodes[node]['text'], graph.nodes[neighbor]['text']) for neighbor in neighbors]
    return max(similarities)

def compter_mots(phrase_tensor):
    # Convertir le tensor en une chaîne de caractères si c'est un tensor
    phrase = phrase_tensor.item() if isinstance(phrase_tensor, torch.Tensor) else phrase_tensor

    # Vérifier si la valeur est un nombre et le convertir en chaîne de caractères
    if isinstance(phrase, (int, float)):
        phrase = str(phrase)

    mots = phrase.split()  # Diviser la phrase en mots en utilisant les espaces comme délimiteurs
    return len(mots)

# Ajout de la colonne 'nb_mots' aux nœuds de chaque graphe
for graph in data:
    for node in range(graph.num_nodes):
        texte = graph.x[node, 0].item() if isinstance(graph.x[node, 0], torch.Tensor) else graph.x[node, 0]
        graph.x[node, 0] = compter_mots(texte)

# Ajout de la colonne 'nb_interrogations' aux nœuds de chaque graphe
for graph in data:
    for node in range(graph.num_nodes):
        # Convertir le tensor en une chaîne de caractères si c'est un tensor
        valeur_colonne = graph.x[node, 0]
        texte = valeur_colonne.item() if isinstance(valeur_colonne, torch.Tensor) else valeur_colonne

        # Vérifier si la valeur est un nombre et le convertir en chaîne de caractères
        if isinstance(texte, (int, float)):
            texte = str(texte)

        # Ajouter la colonne 'nb_interrogations' aux nœuds de chaque graphe
        graph.x[node, 1] = texte.count('?')

# Ajout de la colonne 'interjections' aux nœuds de chaque graphe
for graph in data:
    for node in range(graph.num_nodes):
        # Convertir le tensor en une chaîne de caractères si c'est un tensor
        valeur_colonne = graph.x[node, 0]
        texte = valeur_colonne.item() if isinstance(valeur_colonne, torch.Tensor) else valeur_colonne

        # Vérifier si la valeur est un nombre et le convertir en chaîne de caractères
        if isinstance(texte, (int, float)):
            texte = str(texte)

        # Ajouter la colonne 'interjections' aux nœuds de chaque graphe
        graph.x[node, 2] = sum(texte.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh'])

for graph in data:
    for node in range(graph.num_nodes):
        # Convertir le tensor en une chaîne de caractères si c'est un tensor
        valeur_colonne = graph.x[node, 0]
        texte = valeur_colonne.item() if isinstance(valeur_colonne, torch.Tensor) else valeur_colonne

        # Vérifier si la valeur est un nombre et le convertir en chaîne de caractères
        if isinstance(texte, (int, float)):
            texte = str(texte)

        # Ajouter la colonne 'nb_words_more_5' aux nœuds de chaque graphe
        graph.x[node, 3] = sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in texte.split())



In [7]:
class NodeClassifier(torch.nn.Module):
    def __init__(self, channels, input_dim):
        super(NodeClassifier, self).__init__()
        self.channels = channels
        self.sc1 = 100
        self.f1 = 50
        self.sc2 = 30
        self.GCN1 = nn.ModuleList([GCNConv(input_dim, self.sc1) for _ in range(channels)])
        self.dense1 = nn.Linear(self.sc1*channels, self.f1)
        self.GCN2 = nn.ModuleList([GCNConv(self.f1, self.sc2) for _ in range(channels)])
        self.dense2 = nn.Linear(self.sc2*channels, 2)

    def forward(self, data):
        nodes, edges = data.x, data.edge_index
        # Appliquez les couches GCN avec une activation ReLU entre elles
        x1 = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.sc1)
            else:
                x = F.relu(self.GCN1[k](nodes, edges[k]))
            x1.append(x)
        x1_f = torch.cat(x1, dim=1)

        f1 = F.relu(self.dense1(x1_f))

        x2 = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.sc2)
            else:
                x = F.relu(self.GCN2[k](f1, edges[k]))
            x1.append(x)
            x2.append(x)
        x2_f = torch.cat(x2, dim=1)

        x_out = self.dense2(x2_f)

        return F.log_softmax(x_out, dim=1)
    
    def validation_step(self, batch, batch_idx):
        out = self(batch)
        _, predicted = torch.max(out, 1)
        y_true = batch.y.cpu().numpy()
        f1 = f1_score(y_true, predicted.cpu().numpy())
        return {'val_f1': torch.tensor(f1)}
    
    def validation_epoch_end(self, outputs):
        avg_f1 = torch.stack([x['val_f1'] for x in outputs]).mean()
        return {'val_f1': avg_f1.item()}



In [8]:
# Instanciez le modèle
model = NodeClassifier(16,384)

# Définissez la fonction de perte et l'optimiseur
f = 0.3
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor([1-f, f]))
#criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Entraînez le modèle
def train():
    model.train()
    loss_tot = 0
    for data in train_loader:
        optimizer.zero_grad()  # Clear gradients.
        out = model(data)  # Perform a single forward pass.
        loss = criterion(out, data.y.long())  # Compute the loss solely based on the training nodes.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        loss_tot += loss
    return loss_tot

def test_during_training():
    model.eval()
    S = 0
    for data in test_loader:
        out = model(data)
        _, predicted = torch.max(out, 1)
        #print(predicted.numpy())
        f1 = f1_score(predicted.numpy(), data.y.numpy())
        S += f1
    f1_moyen = S / len(test_loader)
    f1_naive = f1_score(np.ones(len(predicted), dtype=int), data.y.numpy())
    print(f'F1-score: {f1_moyen}')

def prediction(graph):
    model.eval()
    out = model(graph)
    _, predicted = torch.max(out, 1)
    return predicted.numpy()

print("-----------------------------")
print('Data')
ones = sum([np.sum(g.y.numpy()) for g in data])
tot_nodes = sum([g.x.numpy().shape[0] for g in data])
print(ones, 'labels 1 sur ',tot_nodes,'noeuds soit', 100*ones/tot_nodes, '%')
print("-----------------------------")
print('Training set :', len(data_train),'graphs')
ones = sum([np.sum(g.y.numpy()) for g in data_train])
tot_nodes = sum([g.x.numpy().shape[0] for g in data_train])
print(ones, 'labels 1 sur ',tot_nodes,'noeuds soit', 100*ones/tot_nodes, '%')
print('Testing set :', len(data_test))
ones_test = sum([np.sum(g.y.numpy()) for g in data_test])
tot_nodes_test = sum([g.x.numpy().shape[0] for g in data_test])
print(ones_test, 'labels 1 sur ',tot_nodes_test,'noeuds soit', 100*ones_test/tot_nodes_test, '%')
print("-----------------------------")

for epoch in range(25):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    test_during_training()

ones_predicted = sum([np.sum(prediction(g)) for g in test_loader])
print("-----------------------------")
print("Test du modèle :")
print(ones_predicted, 'label 1 prédits sur les ',ones_test,'voulus (',100 * ones_predicted/tot_nodes_test,'%)')

-----------------------------
Data
13292 labels 1 sur  72623 noeuds soit 18.30274155570549 %
-----------------------------
Training set : 77 graphs
10580 labels 1 sur  53802 noeuds soit 19.66469647968477 %
Testing set : 20
2712 labels 1 sur  18821 noeuds soit 14.409436267998512 %
-----------------------------
Epoch: 000, Loss: 22.0672
F1-score: 0.0
Epoch: 001, Loss: 18.4379
F1-score: 0.0
Epoch: 002, Loss: 17.8609
F1-score: 0.0
Epoch: 003, Loss: 17.3682
F1-score: 0.2947866624730452
Epoch: 004, Loss: 16.7800
F1-score: 0.40070180028193825
Epoch: 005, Loss: 15.9383
F1-score: 0.46295696556385046
Epoch: 006, Loss: 14.6419
F1-score: 0.45437585577865586
Epoch: 007, Loss: 12.9970
F1-score: 0.4434668553042164
Epoch: 008, Loss: 10.9180
F1-score: 0.44035059732436466
Epoch: 009, Loss: 8.8597
F1-score: 0.442982908740885
Epoch: 010, Loss: 8.2705
F1-score: 0.46259577512230904
Epoch: 011, Loss: 8.5821
F1-score: 0.4771571977848795
Epoch: 012, Loss: 8.4131
F1-score: 0.46899330832570635
Epoch: 013, Loss: 

In [10]:
test_labels = {}

for i, graph in enumerate(data_test_kagle):
    id = test_set[i]
    y_test = prediction(graph)
    test_labels[id] = y_test.tolist()

with open("test_labels_GNN.json", "w") as file:
    json.dump(test_labels, file, indent=4)