In [35]:
import json
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras import backend as K
from keras.optimizers import Adam
from keras.metrics import AUC

In [36]:
# Fonction pour lire les données de transcription
def read_transcription(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Fonction pour lire les données du graphe de discours
def read_discourse_graph(file_path):
    with open(file_path, 'r') as file:
        data = [line.strip().split() for line in file]

    # Convertir les valeurs non numériques en indices numériques
    data = [(int(start), relation, int(end)) if start.isdigit() and end.isdigit() else (start, relation, end) for start, relation, end in data]

    return data

def create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict):
    rows = []

      # Iterate through all edges in the discourse graph
    for edge in discourse_graph:
        index_start, relation_type, index_end = edge

        # Retrieve speaker information
        speaker = transcription[index_start]['speaker']

        # Convert relation type to integer using the dictionary
        speaker_id = speaker_dict.get(speaker, -1)

        # Retrieve the sentence
        text = transcription[index_start]['text']

        # Convert relation type to integer using the dictionary
        relation_type_id = relation_dict.get(relation_type, -1)

        # Add a row to the DataFrame
        rows.append({
            'dialogue_id': dialogue_id,
            'index_start': index_start,
            'text': text,
            'index_end': index_end,
            'speaker_type': speaker_id,
            'speaker_text': speaker,
            'relation_type': relation_type_id,
            'relation_text': relation_type
        })

    # Create the DataFrame
    df = pd.DataFrame(rows)

    return df

# Fonction pour créer le dictionnaire de conversion des relations
def create_relation_dict(discourse_graph):
    relation_set = set()

    # Collecter toutes les relations uniques
    for edge in discourse_graph:
        relation_set.add(edge[1])

    # Créer un dictionnaire de conversion
    relation_dict = {relation: idx for idx, relation in enumerate(relation_set)}

    return relation_dict

# Fonction pour créer le dictionnaire de conversion des speakers
def create_speaker_dict(transcription):
    speaker_set = set()

    # Collecter tous les locuteurs uniques
    for utterance in transcription:
        speaker_set.add(utterance['speaker'])

    # Créer un dictionnaire de conversion
    speaker_dict = {speaker: idx for idx, speaker in enumerate(speaker_set)}

    return speaker_dict

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

# Function to get labels for a dialogue
def get_label(dialogue_id, index,labels_data):
    return labels_data.get(dialogue_id, [])[index]

# Définition de la fonction F1-score
def f1_score_keras(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2 * p * r / (p + r + K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)

    return K.mean(f1)


In [37]:
path_train= Path("data/training")
path_test= Path("data/test")

dialogue_ids = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
dialogue_ids = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in dialogue_ids])
dialogue_ids.remove('IS1002a')
dialogue_ids.remove('IS1005d')
dialogue_ids.remove('TS3012c')

dialogue_ids_test = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
dialogue_ids_test = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in dialogue_ids_test])

# Liste pour stocker les DataFrames de chaque dialogue
dfs = []
dfs_test = []


In [38]:
# Parcourir chaque dialogue
for dialogue_id in dialogue_ids:
    # Lire les données de transcription et de graphe de discours
    transcription = read_transcription(path_train / f'{dialogue_id}.json')
    discourse_graph = read_discourse_graph(path_train / f'{dialogue_id}.txt')
    
    # Créer le dictionnaire de conversion des relations
    relation_dict = create_relation_dict(discourse_graph)
    speaker_dict = create_speaker_dict(transcription)

    # Créer le DataFrame pour le dialogue actuel
    df = create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict)
    
    # Ajouter le DataFrame à la liste
    dfs.append(df)

    # Ajouter la dernière phrase avec NaN pour index_end et 'relation'
    last_utterance = transcription[-1]
    last_speaker = last_utterance['speaker']
    last_text = last_utterance['text']
    last_row = {
        'dialogue_id': dialogue_id,
        'index_start': len(transcription) - 1,
        'text': last_text,
        'index_end': 0,
        'speaker_type': speaker_dict.get(last_speaker, -1),
        'speaker_text': last_speaker,
        'relation_type': 0,
        'relation_text': np.nan
    }
    dfs.append(pd.DataFrame([last_row]))

# Parcourir chaque dialogue
for dialogue_id in dialogue_ids_test:
    # Lire les données de transcription et de graphe de discours
    transcription = read_transcription(path_test / f'{dialogue_id}.json')
    discourse_graph = read_discourse_graph(path_test / f'{dialogue_id}.txt')
    
    # Créer le dictionnaire de conversion des relations
    relation_dict = create_relation_dict(discourse_graph)
    speaker_dict = create_speaker_dict(transcription)

    # Créer le DataFrame pour le dialogue actuel
    df_test = create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict)
    
    # Ajouter le DataFrame à la liste
    dfs_test.append(df_test)

    # Ajouter la dernière phrase avec NaN pour index_end et 'relation'
    last_utterance = transcription[-1]
    last_speaker = last_utterance['speaker']
    last_text = last_utterance['text']
    last_row = {
        'dialogue_id': dialogue_id,
        'index_start': len(transcription) - 1,
        'text': last_text,
        'index_end': 0,
        'speaker_type': speaker_dict.get(last_speaker, -1),
        'speaker_text': last_speaker,
        'relation_type': 0,
        'relation_text': np.nan
    }
    dfs_test.append(pd.DataFrame([last_row]))


In [39]:
# Concaténer tous les DataFrames en un seul
df = pd.concat(dfs, ignore_index=True)
df_test = pd.concat(dfs_test, ignore_index=True)

with open("data/training_labels.json", 'r') as file:
    labels_data = json.load(file)

df['label'] = df.apply(lambda row: get_label(row['dialogue_id'], row['index_start'], labels_data), axis=1)


In [40]:
for transcription_id in dialogue_ids:
    bert_array = np.load('training/' + transcription_id + '.npy')
    
    # Obtenez les indices des lignes correspondant à la transcription_id
    indices = df[df['dialogue_id'] == transcription_id].index
    
    # Remplacez les valeurs de la colonne 'text' par les valeurs de bert_array
    for idx, value in enumerate(bert_array):
        df.at[indices[idx-1], 'text'] = value

for transcription_id in dialogue_ids_test:
    bert_array_test = np.load('test/' + transcription_id + '.npy')
    
    # Obtenez les indices des lignes correspondant à la transcription_id
    indices = df_test[df_test['dialogue_id'] == transcription_id].index
    
    # Remplacez les valeurs de la colonne 'text' par les valeurs de bert_array
    for idx, value in enumerate(bert_array_test):
        df_test.at[indices[idx-1], 'text'] = value


In [41]:
# Nombre d'éléments dans chaque liste
num_elements = len(df['text'].iloc[0])

# Créez de nouvelles colonnes pour chaque élément dans la liste
new_columns = [f'coord_{i}' for i in range(num_elements)]

# Appliquez une fonction qui divise chaque liste en plusieurs colonnes
new_text_columns = df['text'].apply(pd.Series)

# Renommez les nouvelles colonnes avec les noms spécifiques
new_text_columns.columns = new_columns

# Concaténez les nouvelles colonnes avec le DataFrame existant
df = pd.concat([df, new_text_columns], axis=1)

# Supprimez la colonne 'text' originale si nécessaire
df = df.drop(['text','speaker_text','relation_text' ], axis=1)





# Nombre d'éléments dans chaque liste
num_elements = len(df_test['text'].iloc[0])

# Appliquez une fonction qui divise chaque liste en plusieurs colonnes
new_text_columns_test = df_test['text'].apply(pd.Series)

# Renommez les nouvelles colonnes avec les noms spécifiques
new_text_columns_test.columns = new_columns

# Concaténez les nouvelles colonnes avec le DataFrame existant
df_test = pd.concat([df_test, new_text_columns_test], axis=1)

# Supprimez la colonne 'text' originale si nécessaire
df_test = df_test.drop(['text','speaker_text','relation_text' ], axis=1)


In [42]:
df['diff_index'] = df['index_end'] - df['index_start']
df_test['diff_index'] = df_test['index_end'] - df_test['index_start']

df_init = df.copy()
df_init_test = df_test.copy()

In [43]:
#Pour encoder dialogue_id

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['dialogue_id'] = label_encoder.fit_transform(df['dialogue_id'])
df_test['dialogue_id'] = label_encoder.fit_transform(df_test['dialogue_id'])



In [44]:
# Sélectionner les colonnes pour la matrice de corrélation
selected_columns = ['index_start', 'index_end', 'speaker_type', 'relation_type', 'label', 'diff_index']

# Créer un sous-dataframe avec les colonnes sélectionnées
corr_df = df[selected_columns]

# Calculer la matrice de corrélation
correlation_matrix = corr_df.corr()

# Tracer la matrice de corrélation colorée
#plt.figure(figsize=(10, 8))
#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
#plt.title('Matrice de Corrélation')
#plt.show()

In [45]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

X = df.drop(['label'], axis=1)

# Sélectionner la colonne 'label' comme y_train
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
y_train_ohe = to_categorical(y_train, num_classes=2)
y_test_ohe = to_categorical(y_test, num_classes=2)

y_ohe = to_categorical(y, num_classes=2)


In [57]:
lr = 0.001 
nb_epochs = 15
batch_nb = 32
val_split = 0.3
taux_drop = 0.3

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

#Ajout callback checkpoint
checkpoint = ModelCheckpoint('best_model.h5',
                             monitor='val_f1_score_keras',
                             save_best_only=True,
                             mode='max',
                             verbose=1)

#Callback pour réduire le lr en monitorant le f1_score
reduce_lr = ReduceLROnPlateau(monitor='val_f1_score_keras',
                              factor=0.5,
                              patience=2,
                              mode='max',
                              verbose=1)

liste_callbacks = [checkpoint]

In [58]:
#Réseaux de neurones
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, Input, LSTM
from keras.regularizers import l2

model = Sequential()
model.add(Input(390))
model.add(Dense(128, activation='relu'))
model.add(Dropout(taux_drop))
model.add(Dense(2, activation='sigmoid'))

# Utilisation de la fonction F1-score comme métrique
model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=[AUC(), f1_score_keras])

from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)

model.fit(X_train, y_train_ohe, epochs=nb_epochs, batch_size=batch_nb, validation_split=val_split, class_weight=dict(enumerate(class_weights)))

from sklearn.metrics import f1_score

y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred,axis=1)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [63]:
#Réseaux de neurones
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, Input, LSTM
from keras.regularizers import l2

model2 = Sequential()
model2.add(Input(390))
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(taux_drop))
model2.add(Dense(2, activation='sigmoid'))

model2.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=[AUC(), f1_score_keras])

model2.fit(X, y_ohe, epochs=nb_epochs, batch_size=batch_nb, validation_split=val_split, class_weight=dict(enumerate(class_weights)), callbacks=liste_callbacks)
test_pred = model2.predict(df_test)
test_labels = np.argmax(test_pred,axis=1)

Epoch 1/15
Epoch 1: val_f1_score_keras did not improve from 0.45954
Epoch 2/15
Epoch 2: val_f1_score_keras did not improve from 0.45954
Epoch 3/15
Epoch 3: val_f1_score_keras did not improve from 0.45954
Epoch 4/15
Epoch 4: val_f1_score_keras did not improve from 0.45954
Epoch 5/15
Epoch 5: val_f1_score_keras did not improve from 0.45954
Epoch 6/15
Epoch 6: val_f1_score_keras did not improve from 0.45954
Epoch 7/15
Epoch 7: val_f1_score_keras did not improve from 0.45954
Epoch 8/15
Epoch 8: val_f1_score_keras did not improve from 0.45954
Epoch 9/15
Epoch 9: val_f1_score_keras did not improve from 0.45954
Epoch 10/15
Epoch 10: val_f1_score_keras did not improve from 0.45954
Epoch 11/15
Epoch 11: val_f1_score_keras did not improve from 0.45954
Epoch 12/15
Epoch 12: val_f1_score_keras did not improve from 0.45954
Epoch 13/15
Epoch 13: val_f1_score_keras did not improve from 0.45954
Epoch 14/15
Epoch 14: val_f1_score_keras did not improve from 0.45954
Epoch 15/15
Epoch 15: val_f1_score_ker

In [65]:
print("Nb 1 pour la validation:", sum(y_pred_class))
print("Nb theorique validation:", sum(y))
print("F1-score validation:", f1_score(y_test, y_pred_class),'\n')

print("Nb 1 sur test:", sum(test_labels))
print("Dans l'idée, environ 7500")

Nb 1 pour la validation: 12093
Nb theorique validation: 17689
F1-score validation: 0.4027976838846528 

Nb 1 sur test: 6624
Dans l'idée, environ 7500


In [61]:
# Créer un dictionnaire pour stocker les prédictions
test_labels_final = {dialogue_id: [] for dialogue_id in dialogue_ids_test}

# Parcourir les lignes de df_test_init
for dialogue_id in dialogue_ids_test:
    # Obtenez les indices des lignes correspondant au dialogue_id
    indices = df_init_test[df_init_test['dialogue_id'] == dialogue_id].index

    # Ajouter les valeurs de test_label[index] au dictionnaire
    test_labels_final[dialogue_id] = test_labels[indices].tolist()

with open("test_labels_text_mlp.json", "w") as file:
    json.dump(test_labels_final, file, indent=4)


In [62]:
print(test_labels_final)

{'ES2003a': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 'ES2003b': [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,