In [17]:
import json
import pandas as pd
from pathlib import Path

# Function to read transcription data
def read_transcription(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to read discourse graph data
def read_discourse_graph(file_path):
    with open(file_path, 'r') as file:
        data = [line.strip().split() for line in file]

    # Convert non-numeric values to numeric indices
    data = [(int(start), relation, int(end)) if start.isdigit() and end.isdigit() else (start, relation, end) for start, relation, end in data]

    return data

def create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict):
    rows = []

    # Iterate through all edges in the discourse graph
    for edge in discourse_graph:
        index_start, relation_type, index_end = edge

        # Retrieve speaker information
        speaker = transcription[index_start]['speaker']

        # Convert relation type to integer using the dictionary
        speaker_id = speaker_dict.get(speaker, -1)

        # Retrieve the sentence
        text = transcription[index_start]['text']

        # Convert relation type to integer using the dictionary
        relation_type_id = relation_dict.get(relation_type, -1)

        # Add a row to the DataFrame
        rows.append({
            'dialogue_id': dialogue_id,
            'index_start': index_start,
            'text': text,
            'index_end': index_end,
            'speaker_type': speaker_id,
            'speaker_text': speaker,
            'relation_type': relation_type_id,
            'relation_text': relation_type
        })

    # Create the DataFrame
    df = pd.DataFrame(rows)

    return df

# Function to create the relation conversion dictionary
def create_relation_dict(discourse_graph):
    relation_set = set()

    # Collect all unique relations
    for edge in discourse_graph:
        relation_set.add(edge[1])

    # Create a conversion dictionary
    relation_dict = {relation: idx for idx, relation in enumerate(relation_set)}

    return relation_dict

# Function to create the speaker conversion dictionary
def create_speaker_dict(transcription):
    speaker_set = set()

    # Collect all unique speakers
    for utterance in transcription:
        speaker_set.add(utterance['speaker'])

    # Create a conversion dictionary
    speaker_dict = {speaker: idx for idx, speaker in enumerate(speaker_set)}

    return speaker_dict

# Replace 'your_path' with the correct path
path_to_training = Path("data/training")

# List to store DataFrames for each dialogue
dfs = []

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

# Replace 'your_dialogue_ids' with your actual list of dialogue identifiers
dialogue_ids = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
dialogue_ids = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in dialogue_ids])
dialogue_ids.remove('IS1002a')
dialogue_ids.remove('IS1005d')
dialogue_ids.remove('TS3012c')

# List to store DataFrames for each dialogue
dfs = []

# Iterate through each dialogue
for dialogue_id in dialogue_ids:
    # Read transcription and discourse graph data
    transcription = read_transcription(path_to_training / f'{dialogue_id}.json')
    discourse_graph = read_discourse_graph(path_to_training / f'{dialogue_id}.txt')

    # Create the relation conversion dictionary
    relation_dict = create_relation_dict(discourse_graph)
    speaker_dict = create_speaker_dict(transcription)

    # Create the DataFrame for the current dialogue
    df_dialogue = create_dataframe(dialogue_id, transcription, discourse_graph, relation_dict, speaker_dict)
    
    # Add the DataFrame to the list
    dfs.append(df_dialogue)

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

# Display the final DataFrame
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72526 entries, 0 to 72525
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   dialogue_id    72526 non-null  object
 1   index_start    72526 non-null  int64 
 2   text           72526 non-null  object
 3   index_end      72526 non-null  int64 
 4   speaker_type   72526 non-null  int64 
 5   speaker_text   72526 non-null  object
 6   relation_type  72526 non-null  int64 
 7   relation_text  72526 non-null  object
dtypes: int64(4), object(4)
memory usage: 4.4+ MB


Unnamed: 0,dialogue_id,index_start,text,index_end,speaker_type,speaker_text,relation_type,relation_text
0,ES2002a,0,Okay,1,3,PM,7,Continuation
1,ES2002a,1,Right,2,3,PM,7,Continuation
2,ES2002a,2,<vocalsound> Um well this is the kick-off meet...,3,3,PM,9,Explanation
3,ES2002a,3,Um <vocalsound> and um,4,3,PM,5,Elaboration
4,ES2002a,4,this is just what we're gonna be doing over th...,5,3,PM,7,Continuation
5,ES2002a,5,"Um so first of all , just to kind of make sure...",6,3,PM,5,Elaboration
6,ES2002a,6,I'm Laura and I'm the project manager .,7,3,PM,7,Continuation
7,ES2002a,7,<vocalsound> Do you want to introduce yourself...,8,3,PM,6,Acknowledgement
8,ES2002a,5,"Um so first of all , just to kind of make sure...",9,3,PM,7,Continuation
9,ES2002a,9,"Hi , I'm David and I'm supposed to be an indus...",10,0,ID,6,Acknowledgement


In [29]:
# Function to get labels for a dialogue
def get_label(dialogue_id, index,labels_data):
    return labels_data.get(dialogue_id, [])[index]

# Load the training labels data
with open("data/training_labels.json", 'r') as file:
    labels_data = json.load(file)

df['label'] = df.apply(lambda row: get_label(row['dialogue_id'], row['index_start'], labels_data), axis=1)

In [30]:
df.head(25)

Unnamed: 0,dialogue_id,index_start,text,index_end,speaker_type,speaker_text,relation_type,relation_text,label
0,ES2002a,0,Okay,1,3,PM,7,Continuation,0
1,ES2002a,1,Right,2,3,PM,7,Continuation,0
2,ES2002a,2,<vocalsound> Um well this is the kick-off meet...,3,3,PM,9,Explanation,1
3,ES2002a,3,Um <vocalsound> and um,4,3,PM,5,Elaboration,0
4,ES2002a,4,this is just what we're gonna be doing over th...,5,3,PM,7,Continuation,0
5,ES2002a,5,"Um so first of all , just to kind of make sure...",6,3,PM,5,Elaboration,0
6,ES2002a,6,I'm Laura and I'm the project manager .,7,3,PM,7,Continuation,0
7,ES2002a,7,<vocalsound> Do you want to introduce yourself...,8,3,PM,6,Acknowledgement,0
8,ES2002a,5,"Um so first of all , just to kind of make sure...",9,3,PM,7,Continuation,0
9,ES2002a,9,"Hi , I'm David and I'm supposed to be an indus...",10,0,ID,6,Acknowledgement,0
