In [40]:
import json
import pandas as pd
from pathlib import Path
import numpy as np

In [41]:
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

def read_transcription(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def create_dataframe(dialogue_id, transcription):
    rows = []

    # Iterate through all sentences in the transcription
    for index, sentence in enumerate(transcription):
        speaker = sentence['speaker']

        # Get the sentence text
        text = sentence['text']

        # Add a row to the DataFrame
        rows.append({
            'dialogue_id': dialogue_id,
            'index': index,
            'text': text,
            'speaker_text': speaker,
        })

    # Create the DataFrame
    df = pd.DataFrame(rows)

    return df

# Function to get labels for a dialogue
def get_label(dialogue_id, index,labels_data):
    return labels_data.get(dialogue_id, [])[index]

#Fonctions
def compter_mots(phrase):
    mots = phrase.split()  # Divisez la phrase en mots en utilisant les espaces comme délimiteurs
    return len(mots)

In [42]:
# Remplacez 'votre_chemin' par le chemin correct
path_train= Path("data/training")
path_test= Path("data/test")

# Remplacez 'vos_dialogue_ids' par votre liste réelle d'identifiants de dialogue
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

In [43]:
# Créer le DataFrame pour l'ensemble d'entraînement
dfs = []
for dialogue_id in training_set:
    transcription_data = read_transcription(path_train / f'{dialogue_id}.json')
    df = create_dataframe(dialogue_id, transcription_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

# Ajouter la colonne 'label' à df en utilisant la fonction get_label
with open("data/training_labels.json", 'r') as file:
    labels_data = json.load(file)

df['label'] = df.apply(lambda row: get_label(row['dialogue_id'], row['index'], labels_data), axis=1)

# Créer le DataFrame pour l'ensemble de test
dfs_test = []
for dialogue_id in test_set:
    transcription_data = read_transcription(path_test / f'{dialogue_id}.json')
    df_test = create_dataframe(dialogue_id, transcription_data)
    dfs_test.append(df_test)

df_test = pd.concat(dfs_test, ignore_index=True)

In [44]:
#Ajout features

df['nb_mots'] = df['text'].apply(compter_mots)
df['nb_interrogations'] = df['text'].apply(lambda x: x.count('?'))
df['nb_occurences'] = df['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df['nb_words_more_5'] = df['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))

df_test['nb_mots'] = df_test['text'].apply(compter_mots)
df_test['nb_interrogations'] = df_test['text'].apply(lambda x: x.count('?'))
df_test['nb_occurences'] = df_test['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test['nb_words_more_5'] = df_test['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))


In [45]:
df['bert'] = df['text']
for transcription_id in training_set:
    bert_array = np.load('training/' + transcription_id + '.npy')
    
    # Obtenez les indices des lignes correspondant à la transcription_id
    indices = df[df['dialogue_id'] == transcription_id].index
    
    # Remplacez les valeurs de la colonne 'text' par les valeurs de bert_array
    for idx, value in enumerate(bert_array):
        df.at[indices[idx-1], 'bert'] = value

df_test['bert'] = df_test['text']
for transcription_id in test_set:
    bert_array_test = np.load('test/' + transcription_id + '.npy')
    
    # Obtenez les indices des lignes correspondant à la transcription_id
    indices = df_test[df_test['dialogue_id'] == transcription_id].index
    
    # Remplacez les valeurs de la colonne 'text' par les valeurs de bert_array
    for idx, value in enumerate(bert_array_test):
        df_test.at[indices[idx-1], 'bert'] = value


In [46]:
# Nombre d'éléments dans chaque liste
num_elements = len(df['bert'].iloc[0])

# Créez de nouvelles colonnes pour chaque élément dans la liste
new_columns = [f'coord_{i}' for i in range(num_elements)]

# Appliquez une fonction qui divise chaque liste en plusieurs colonnes
new_text_columns = df['bert'].apply(pd.Series)

# Renommez les nouvelles colonnes avec les noms spécifiques
new_text_columns.columns = new_columns

# Concaténez les nouvelles colonnes avec le DataFrame existant
df = pd.concat([df, new_text_columns], axis=1)


# Nombre d'éléments dans chaque liste
num_elements = len(df_test['bert'].iloc[0])

# Appliquez une fonction qui divise chaque liste en plusieurs colonnes
new_text_columns_test = df_test['bert'].apply(pd.Series)

# Renommez les nouvelles colonnes avec les noms spécifiques
new_text_columns_test.columns = new_columns

# Concaténez les nouvelles colonnes avec le DataFrame existant
df_test = pd.concat([df_test, new_text_columns_test], axis=1)


In [48]:
df.head()
df_test.head(25)

Unnamed: 0,dialogue_id,index,text,speaker_text,nb_mots,nb_interrogations,nb_occurences,nb_words_more_5,bert,coord_0,...,coord_374,coord_375,coord_376,coord_377,coord_378,coord_379,coord_380,coord_381,coord_382,coord_383
0,ES2003a,0,"Okay , well",PM,3,0,0,0,"[-0.06684376, -0.10767134, 0.00158493, -0.0377...",-0.066844,...,0.089991,0.01777,0.004204,0.015277,-0.003793,0.035303,0.063118,-0.012957,0.057301,-0.023757
1,ES2003a,1,I think we're ready to begin .,PM,7,0,0,0,"[-0.07298628, 0.052574955, -0.0014349254, -0.0...",-0.072986,...,0.084046,0.017392,-0.011899,0.002536,0.007794,0.037757,0.123899,0.037111,0.105005,0.098142
2,ES2003a,2,"Right ,",PM,2,0,0,0,"[-0.069116786, -0.030909952, 0.07359838, -0.06...",-0.069117,...,0.057107,-0.056366,0.004875,0.024899,-0.031351,-0.027805,0.029868,-0.065823,-0.027464,-0.058381
3,ES2003a,3,"my name's Adam Duguid ,",PM,5,0,0,2,"[-0.08550309, -0.08060705, 0.04556774, 0.04994...",-0.085503,...,0.054002,0.058658,-0.060597,0.02625,-0.028345,0.014901,0.068747,0.050202,0.06457,0.00954
4,ES2003a,4,"we're here because of real reaction ,",PM,7,0,0,2,"[-0.022576354, -0.028672846, -0.011893472, -0....",-0.022576,...,0.027556,0.016066,-0.029471,-0.008377,-0.026504,-0.055742,0.04369,-0.029209,0.006706,-0.045938
5,ES2003a,5,"um , we have in the group",PM,7,0,1,0,"[-0.08335316, -0.004626364, 0.023742221, -0.03...",-0.083353,...,0.10015,0.059034,0.038118,-0.042511,0.032334,0.02794,0.106686,0.012388,0.107615,-0.027225
6,ES2003a,6,"Oh ,",ME,2,0,0,0,"[-0.040407237, 0.060790785, -0.0016114342, 0.0...",-0.040407,...,-0.005783,0.059689,-0.120421,0.038111,0.047442,-0.009899,0.024311,0.060245,-0.020152,-0.093352
7,ES2003a,7,Ebenezer Ademesoye .,ME,3,0,0,2,"[-0.02129279, 0.05836254, -0.03032356, -0.0010...",-0.021293,...,0.081445,0.004677,-0.042932,-0.026396,-0.041985,0.000239,0.147455,0.062617,-0.004108,-0.037169
8,ES2003a,8,Would you like me to spell that ?,ME,8,1,0,0,"[-0.08815014, 0.03891213, -0.062674426, 0.0259...",-0.08815,...,0.143409,-0.031872,0.083132,0.008562,-0.006639,0.058942,0.174812,0.024079,0.063444,0.020977
9,ES2003a,9,S,ME,1,0,0,0,"[-0.078254215, -0.024671271, 0.027886607, -0.0...",-0.078254,...,0.110459,-0.043399,0.031995,-0.007827,-0.018499,0.023819,0.065264,0.005737,0.089214,-0.051413
