In [58]:
import json
from pathlib import Path
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm 
import pandas as pd

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into a single flat list.

    Only one level of nesting is removed: each element of ``list_of_list`` is
    iterated once and its items are appended in order.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Training and test data directories, relative to the current working directory.
path_to_training = Path("data", "training")
path_to_test = Path("data", "test")

In [59]:
#####
# training and test sets of transcription ids
#####
# Each meeting id below is expanded into four transcription ids by appending
# the suffixes 'a'..'d'; three of the resulting ids are then dropped.
training_set = flatten(
    [meeting + part for part in 'abcd']
    for meeting in [
        'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
        'ES2012', 'ES2013', 'ES2015', 'ES2016',
        'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007',
        'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
    ]
)
training_set = [
    t_id for t_id in training_set
    if t_id not in ('IS1002a', 'IS1005d', 'TS3012c')
]

# The test meetings are expanded the same way; no id is dropped.
test_set = flatten(
    [meeting + part for part in 'abcd']
    for meeting in [
        'ES2003', 'ES2004', 'ES2011', 'ES2014',
        'IS1008', 'IS1009',
        'TS3003', 'TS3004', 'TS3006', 'TS3007',
    ]
)

In [86]:
# Build one row per labelled relation found in the training .txt files.
# Each non-blank line is expected to hold three whitespace-separated fields:
# "<start index> <relation label> <end index>".
data = []

# Iterate through each file in the training_set
for file_name in training_set:
    # Load the JSON file: a list of utterance dicts, each read here through its
    # 'index', 'speaker' and 'text' keys.
    json_path = path_to_training / f'{file_name}.json'
    with open(json_path, 'r', encoding='utf-8') as json_file:
        conversation_data = json.load(json_file)

    # Map each utterance's 'index' to the utterance so every relation is resolved
    # with one dict lookup instead of a scan of the whole conversation. Iterating
    # in reverse keeps the first utterance should an index ever repeat.
    utterance_by_index = {item['index']: item for item in reversed(conversation_data)}

    # Load the TXT file
    txt_path = path_to_training / f'{file_name}.txt'
    with open(txt_path, 'r', encoding='utf-8') as txt_file:
        lines = txt_file.readlines()

    # Iterate through the lines of the TXT file
    for line in lines:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. an empty line at the end of the file)
            continue

        # Parse the line once; unpacking raises ValueError on a malformed line
        start_index, middle_word, end_index = fields
        start_index = int(start_index)

        # Speaker and text are both taken from the utterance whose 'index' field
        # equals start_index, so they always describe the same utterance
        # (a KeyError here means the .txt references an index absent from the .json).
        utterance = utterance_by_index[start_index]
        start_sentence = utterance['text']

        # Add data to the list
        data.append({
            'speaker': utterance['speaker'],
            'sentence': start_sentence,
            'index': start_index,
            'link': middle_word,
            'end': int(end_index),
            'dialog_id': file_name
        })

# Create a DataFrame from the data list
df = pd.DataFrame(data)

# Display the DataFrame
df.head(50)

Unnamed: 0,speaker,sentence,index,link,end,dialog_id
0,PM,Okay,0,Continuation,1,ES2002a
1,PM,Right,1,Continuation,2,ES2002a
2,PM,<vocalsound> Um well this is the kick-off meet...,2,Explanation,3,ES2002a
3,PM,Um <vocalsound> and um,3,Elaboration,4,ES2002a
4,PM,this is just what we're gonna be doing over th...,4,Continuation,5,ES2002a
5,PM,"Um so first of all , just to kind of make sure...",5,Elaboration,6,ES2002a
6,PM,I'm Laura and I'm the project manager .,6,Continuation,7,ES2002a
7,PM,<vocalsound> Do you want to introduce yourself...,7,Acknowledgement,8,ES2002a
8,PM,"Um so first of all , just to kind of make sure...",5,Continuation,9,ES2002a
9,ID,"Hi , I'm David and I'm supposed to be an indus...",9,Acknowledgement,10,ES2002a


In [87]:
# Print a summary of df: row count, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72526 entries, 0 to 72525
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   speaker    72526 non-null  object
 1   sentence   72526 non-null  object
 2   index      72526 non-null  int64 
 3   link       72526 non-null  object
 4   end        72526 non-null  int64 
 5   dialog_id  72526 non-null  object
dtypes: int64(2), object(4)
memory usage: 3.3+ MB
