In [13]:
import json
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Function to read transcription data
def read_transcription(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to read discourse graph data
def read_discourse_graph(file_path):
    with open(file_path, 'r') as file:
        data = [line.strip().split() for line in file]

    # Convert non-numeric values to numeric indices
    data = [(int(start), relation, int(end)) if start.isdigit() and end.isdigit() else (start, relation, end) for start, relation, end in data]

    return data

def create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict):
    rows = []

    # Iterate through all edges in the discourse graph
    for edge in discourse_graph:
        index_start, relation_type, index_end = edge

        # Retrieve speaker information
        speaker = transcription[index_start]['speaker']

        # Convert relation type to integer using the dictionary
        speaker_id = speaker_dict.get(speaker, -1)

        # Retrieve the sentence
        text = transcription[index_start]['text']

        # Convert relation type to integer using the dictionary
        relation_type_id = relation_dict.get(relation_type, -1)

        # Add a row to the DataFrame
        rows.append({
            'dialogue_id': dialogue_id,
            'index_start': index_start,
            'text': text,
            'index_end': index_end,
            'speaker_type': speaker_id,
            'speaker_text': speaker,
            'relation_type': relation_type_id,
            'relation_text': relation_type
        })

    # Create the DataFrame
    df = pd.DataFrame(rows)

    return df

# Function to create the relation conversion dictionary
def create_relation_dict(discourse_graph):
    relation_set = set()

    # Collect all unique relations
    for edge in discourse_graph:
        relation_set.add(edge[1])

    # Create a conversion dictionary
    relation_dict = {relation: idx for idx, relation in enumerate(relation_set)}

    return relation_dict

# Function to create the speaker conversion dictionary
def create_speaker_dict(transcription):
    speaker_set = set()

    # Collect all unique speakers
    for utterance in transcription:
        speaker_set.add(utterance['speaker'])

    # Create a conversion dictionary
    speaker_dict = {speaker: idx for idx, speaker in enumerate(speaker_set)}

    return speaker_dict

# Function to get labels for a dialogue
def get_label(dialogue_id, index,labels_data):
    return labels_data.get(dialogue_id, [])[index]

# Function to load BERT embeddings from .npy files
def load_bert_embeddings(dialogue_id):
    file_path = f'feature-extraction/bert/training/{dialogue_id}.npy'
    return np.load(file_path)

# Replace 'your_path' with the correct path
path_to_training = Path("data/training")
path_to_test = Path("data/test")

# List to store DataFrames for each dialogue
dfs = []

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

# Replace 'your_dialogue_ids' with your actual list of dialogue identifiers
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

# List to store DataFrames for each dialogue
dfs = []

# Iterate through each dialogue
for dialogue_id in training_set:
    # Read transcription and discourse graph data
    transcription = read_transcription(path_to_training / f'{dialogue_id}.json')
    discourse_graph = read_discourse_graph(path_to_training / f'{dialogue_id}.txt')

    # Create the relation conversion dictionary
    relation_dict = create_relation_dict(discourse_graph)
    speaker_dict = create_speaker_dict(transcription)

    # Create the DataFrame for the current dialogue
    df_dialogue = create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict)

    # Add the DataFrame to the list
    dfs.append(df_dialogue)

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

# Load the training labels data
with open("data/training_labels.json", 'r') as file:
    labels_data = json.load(file)
    
df['label'] = df.apply(lambda row: get_label(row['dialogue_id'], row['index_start'], labels_data), axis=1)

# Display the final DataFrame
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72526 entries, 0 to 72525
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   dialogue_id    72526 non-null  object
 1   index_start    72526 non-null  int64 
 2   text           72526 non-null  object
 3   index_end      72526 non-null  int64 
 4   speaker_type   72526 non-null  int64 
 5   speaker_text   72526 non-null  object
 6   relation_type  72526 non-null  int64 
 7   relation_text  72526 non-null  object
 8   label          72526 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 5.0+ MB


Unnamed: 0,dialogue_id,index_start,text,index_end,speaker_type,speaker_text,relation_type,relation_text,label
0,ES2002a,0,Okay,1,2,PM,10,Continuation,0
1,ES2002a,1,Right,2,2,PM,10,Continuation,0
2,ES2002a,2,<vocalsound> Um well this is the kick-off meet...,3,2,PM,2,Explanation,1
3,ES2002a,3,Um <vocalsound> and um,4,2,PM,7,Elaboration,0
4,ES2002a,4,this is just what we're gonna be doing over th...,5,2,PM,10,Continuation,0


In [16]:
test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

# List to store DataFrames for each dialogue
dfs_test = []

# Iterate through each dialogue
for dialogue_id in test_set:
    # Read transcription and discourse graph data
    transcription = read_transcription(path_to_test / f'{dialogue_id}.json')
    discourse_graph = read_discourse_graph(path_to_test / f'{dialogue_id}.txt')

    # Create the relation conversion dictionary
    relation_dict = create_relation_dict(discourse_graph)
    speaker_dict = create_speaker_dict(transcription)

    # Create the DataFrame for the current dialogue
    df_dialogue = create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict)

    # Add the DataFrame to the list
    dfs_test.append(df_dialogue)

# Concatenate all DataFrames into one
df_test = pd.concat(dfs_test, ignore_index=True)

# Display the final DataFrame
df_test.info()
df_test.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30986 entries, 0 to 30985
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   dialogue_id    30986 non-null  object
 1   index_start    30986 non-null  int64 
 2   text           30986 non-null  object
 3   index_end      30986 non-null  int64 
 4   speaker_type   30986 non-null  int64 
 5   speaker_text   30986 non-null  object
 6   relation_type  30986 non-null  int64 
 7   relation_text  30986 non-null  object
dtypes: int64(4), object(4)
memory usage: 1.9+ MB


Unnamed: 0,dialogue_id,index_start,text,index_end,speaker_type,speaker_text,relation_type,relation_text
0,ES2003a,0,"Okay , well",1,2,PM,11,Continuation
1,ES2003a,0,"Okay , well",2,2,PM,11,Continuation
2,ES2003a,2,"Right ,",3,2,PM,0,Comment
3,ES2003a,3,"my name's Adam Duguid ,",4,2,PM,2,Explanation
4,ES2003a,4,"we're here because of real reaction ,",5,2,PM,2,Explanation


In [2]:
df['dif_start_end'] = df['index_end']-df['index_start']
df['word_count'] = df['text'].apply(lambda x: len(x.split(' ')))
df['nb_long_words'] = df['text'].apply(lambda x: sum(len(word) > 4 for word in x.split()))

In [12]:
X_train = df[['word_count','nb_long_words']]
y_train = df[['label']]

In [22]:
df_test['dif_start_end'] = df_test['index_end']-df_test['index_start']
df_test['word_count'] = df_test['text'].apply(lambda x: len(x.split(' ')))
df_test['nb_long_words'] = df_test['text'].apply(lambda x: sum(len(word) > 4 for word in x.split()))

In [23]:
X_test = df_test[['word_count','nb_long_words']]

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pandas as pd

# Model k-Nearest Neighbors (kNN)
knn_classifier = KNeighborsClassifier()

# Ajuster le modèle
knn_classifier.fit(X_train, y_train)

# Prédictions
y_pred = knn_classifier.predict(X_test)

ValueError: Found input variables with inconsistent numbers of samples: [29010, 58020]

In [17]:
test_labels = {}
for transcription_id in test_set:
    test_labels[transcription_id] = y_pred.tolist()

with open("test_labels_text_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)
