In [19]:
import json
from pathlib import Path
import numpy as np

def flatten(list_of_list):
    """Concatenate the sub-lists of ``list_of_list`` into a single flat list.

    Only one level of nesting is removed and the item order is preserved.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

# Directories that hold the per-transcription files of each split.
# The paths are relative, so they resolve against the current working directory.
path_to_training = Path("data/training")
path_to_test = Path("data/test")

In [20]:
#####
# training and test sets of transcription ids
#####
# Each meeting id below is expanded into four transcription ids by appending
# the suffixes 'a', 'b', 'c' and 'd'.
training_set = [
    'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
    'ES2012', 'ES2013', 'ES2015', 'ES2016',
    'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007',
    'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
]
training_set = flatten([[meeting + part for part in 'abcd'] for meeting in training_set])
# Three of the expanded ids are left out of the training set.
training_set = [
    t_id for t_id in training_set
    if t_id not in ('IS1002a', 'IS1005d', 'TS3012c')
]

test_set = [
    'ES2003', 'ES2004', 'ES2011', 'ES2014',
    'IS1008', 'IS1009',
    'TS3003', 'TS3004', 'TS3006', 'TS3007',
]
test_set = flatten([[meeting + part for part in 'abcd'] for meeting in test_set])

In [30]:
def get_transcript_and_graph(transcription_id, directory=None):
    """Load one transcription and its conversation graph as object arrays.

    Parameters
    ----------
    transcription_id : str
        Base name of the files to read: ``<id>.json`` and ``<id>.txt`` are
        opened inside ``directory``.
    directory : path-like, optional
        Folder containing both files. Defaults to ``path_to_training`` so
        existing single-argument calls keep working.

    Returns
    -------
    transcription : np.ndarray of shape (N, 2), dtype object
        Row ``i`` holds ``[speaker, text]`` of the utterance whose ``"index"``
        field equals ``i``; ``N`` is the number of utterances in the JSON file.
    conv_graphe : np.ndarray of shape (N, N), dtype object
        ``conv_graphe[i, j]`` is the label read from a line ``"i label j"`` of
        the text file. Entries never mentioned in the file stay ``None``.
    """
    if directory is None:
        directory = path_to_training
    directory = Path(directory)

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(directory / f"{transcription_id}.json", "r", encoding="utf-8") as file:
        json_transcription = json.load(file)

    N = len(json_transcription)
    transcription = np.empty((N, 2), dtype='object')
    conv_graphe = np.empty((N, N), dtype='object')

    for utterance in json_transcription:
        row = int(utterance["index"])
        transcription[row, 0] = utterance["speaker"]
        transcription[row, 1] = utterance["text"]

    with open(directory / f"{transcription_id}.txt", "r", encoding="utf-8") as graphe:
        for line in graphe:
            # `fields` instead of `list`, so the builtin is not shadowed.
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            source, label, target = fields[0], fields[1], fields[2]
            conv_graphe[int(source), int(target)] = label

    return transcription, conv_graphe

In [41]:
# Load the first training transcription and print both arrays, each under its
# own heading and separated by an empty line.
t, g = get_transcript_and_graph(training_set[0])
print("transcription", t, "", "graphe", g, sep="\n")

transcription
[['PM' 'Okay']
 ['PM' 'Right']
 ['PM'
  '<vocalsound> Um well this is the kick-off meeting for our our project .']
 ['PM' 'Um <vocalsound> and um']
 ['PM'
  "this is just what we're gonna be doing over the next twenty five minutes ."]
 ['PM'
  'Um so first of all , just to kind of make sure that we all know each other ,']
 ['PM' "I'm Laura and I'm the project manager ."]
 ['PM' '<vocalsound> Do you want to introduce yourself again ?']
 ['ME' 'Great .']
 ['ID' "Hi , I'm David and I'm supposed to be an industrial designer ."]
 ['PM' 'Okay .']
 ['ME' "And I'm Andrew and I'm uh our marketing"]
 ['UI' "Um I'm Craig and I'm User Interface ."]
 ['ME' 'expert .']
 ['PM' 'Great .']
 ['PM' 'Okay . <vocalsound> Um']
 ['PM' "so we're designing a new remote control and um <disfmarker>"]
 ['PM' "Oh I have to record who's here actually ."]
 ['PM' "So that's David , Andrew and Craig , isn't it ?"]
 ['PM' 'And you all arrived on time .']
 ['PM' 'Um yeah so des uh <vocalsound> design a new

In [42]:
# WARNING: don't run this cell unless you are ready to wait -- the original
# author reports that it can cost about 15 minutes.
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by the cells below to encode utterances.
bert = SentenceTransformer('all-MiniLM-L6-v2')


In [49]:
# Sanity check: encoding a single sentence returns a 1-D vector
# (the recorded output of this cell is (384,)).
bert.encode("hello how are you and you").shape

(384,)

In [None]:

# Build the raw training texts in the same "<speaker>: <text>" format that the
# prediction cell uses for X_test, so train and test features are comparable.
# Kept under its own name so re-running this cell never re-encodes embeddings.
X_training_text = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    for utterance in transcription:
        X_training_text.append(utterance["speaker"] + ": " + utterance["text"])

# NOTE(review): y_training is not defined in any visible cell; it must list the
# labels in this same order (training_set order, then utterance order) -- confirm.
X_training = bert.encode(X_training_text, show_progress_bar=True)

In [None]:

# Fit the decision-tree baseline on the training embeddings.
clf = DecisionTreeClassifier(random_state=0).fit(X_training, y_training)

# Predict one list of labels per test transcription, keyed by transcription id.
test_labels = {}
for transcription_id in test_set:
    json_path = path_to_test / f"{transcription_id}.json"
    with open(json_path, "r") as file:
        transcription = json.load(file)

    # Each utterance is encoded as "<speaker>: <text>".
    X_test = bert.encode(
        [utterance["speaker"] + ": " + utterance["text"] for utterance in transcription]
    )

    y_test = clf.predict(X_test)
    test_labels[transcription_id] = y_test.tolist()

# Save the predictions as indented JSON.
with open("test_labels_text_baseline.json", "w") as file:
    file.write(json.dumps(test_labels, indent=4))
