In [None]:
import json
from pathlib import Path
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm 

def flatten(list_of_list):
    """Concatenate the items of every sub-sequence into one flat list.

    Only one level of nesting is removed; the items themselves are left
    untouched and keep their original order.
    """
    flat_items = []
    for sublist in list_of_list:
        flat_items.extend(sublist)
    return flat_items

# Folders holding the per-transcription files of the training and test splits
# (relative to the notebook's working directory).
path_to_training = Path("data", "training")
path_to_test = Path("data", "test")

In [None]:
#####
# training and test sets of transcription ids
#####
# Every meeting id is expanded into its four sessions (suffixes 'a' to 'd').
# Three training sessions are left out of the training list.
training_set = [
    meeting_id + session_id
    for meeting_id in (
        'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
        'ES2012', 'ES2013', 'ES2015', 'ES2016',
        'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006',
        'IS1007',
        'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
    )
    for session_id in 'abcd'
    if meeting_id + session_id not in ('IS1002a', 'IS1005d', 'TS3012c')
]

test_set = [
    meeting_id + session_id
    for meeting_id in (
        'ES2003', 'ES2004', 'ES2011', 'ES2014',
        'IS1008', 'IS1009',
        'TS3003', 'TS3004', 'TS3006', 'TS3007',
    )
    for session_id in 'abcd'
]

In [None]:
# Build the vocabulary of edge (relation) labels found in the training graphs.
# Each non-empty line of a graph file is split on whitespace and its second
# field is taken as the relation label.
graph_links_labels = set()
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.txt", "r") as graphe:
        for line in graphe:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file): nothing to read.
                continue
            graph_links_labels.add(fields[1])

# Sort before numbering: the iteration order of a set of strings changes from
# one interpreter run to the next (string hash randomisation), which would give
# a different label <-> integer mapping after every kernel restart.
L = sorted(graph_links_labels)
int2label = dict(enumerate(L))
label2int = {valeur: indice for indice, valeur in enumerate(L)}
N_vocab_links = len(L)
print(N_vocab_links)

16


In [None]:
def feature_extract_train(transcription_id):
    """Load the node embeddings and the typed adjacency tensor of one training transcription.

    Parameters
    ----------
    transcription_id : str
        Identifier used to build both file names:
        ``feature-extraction/bert/training/<id>.npy`` and ``data/training/<id>.txt``.

    Returns
    -------
    bert_array : numpy.ndarray
        The array stored in the ``.npy`` file; the length of its first axis is
        used as the number of nodes.
    edges : numpy.ndarray of bool, shape (2 * N_vocab_links, N_nodes, N_nodes)
        ``edges[t, i, j]`` is True when the graph file contains an edge of type
        ``t`` from ``i`` to ``j``. Channel ``N_vocab_links + t`` holds the same
        edges with source and target swapped.

    Raises
    ------
    KeyError
        If a relation label is not present in ``label2int``.
    """
    bert_array = np.load('feature-extraction/bert/training/' + transcription_id + '.npy')
    N_nodes = bert_array.shape[0]
    edges = np.zeros(shape=(2 * N_vocab_links, N_nodes, N_nodes), dtype=bool)

    with open(Path("data/training") / f"{transcription_id}.txt", "r") as graphe:
        for line in graphe:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on fields[1].
                continue
            source = int(fields[0])
            target = int(fields[2])
            # label2int is 0-based, so its value is used directly as the channel.
            # Subtracting 1 sent the first label to index -1, which wraps around
            # to the last channel and mixed forward and reverse edges.
            edge_type = label2int[fields[1]]
            edges[edge_type, source, target] = True
            edges[N_vocab_links + edge_type, target, source] = True

    return bert_array, edges

def get_labels(transcription_id):
    """Return the labels of one training transcription as a column vector.

    The labels are read from ``data/training_labels.json`` (a mapping from
    transcription id to a list of labels) on every call, and the selected
    entry is reshaped to ``(-1, 1)``.
    """
    with open("data/training_labels.json", "r") as file:
        labels_by_transcription = json.load(file)
    labels = np.array(labels_by_transcription[transcription_id])
    return labels.reshape(-1, 1)

In [None]:
from scipy.sparse import csr_matrix

def extract_training(N_nodes_max):
    """Build zero-padded node, edge and label arrays for every training transcription.

    Parameters
    ----------
    N_nodes_max : int
        Number of node slots every transcription is padded to.

    Returns
    -------
    X_train_nodes : list of numpy.ndarray, each (N_nodes_max, embedding_dim)
        Embeddings loaded from ``feature-extraction/bert/training/<id>.npy``,
        followed by all-zero rows up to ``N_nodes_max``.
    X_train_edges : list of numpy.ndarray of int8, each (N_vocab_links, N_nodes_max, N_nodes_max)
        ``x[t, i, j] == 1`` when the graph file has an edge of type ``t`` from
        ``i`` to ``j``.
    y_train : list of numpy.ndarray, each (N_nodes_max, 1)
        Labels from ``data/training_labels.json`` followed by zeros. Padded rows
        are therefore indistinguishable from rows labelled 0.

    Raises
    ------
    ValueError
        If a transcription has more nodes than ``N_nodes_max``.

    Notes
    -----
    Every edge tensor is dense: ``N_vocab_links * N_nodes_max**2`` bytes per
    transcription. Choose ``N_nodes_max`` as small as the data allows.
    """
    bert_dir = Path("feature-extraction/bert/training")
    graph_dir = Path("data/training")

    # The label file does not depend on the transcription: read it once
    # instead of once per transcription.
    with open("data/training_labels.json", "r") as file:
        training_labels = json.load(file)

    X_train_nodes = []
    X_train_edges = []
    y_train = []

    for transcription_id in training_set:
        # node features
        bert_array = np.load(bert_dir / f"{transcription_id}.npy")
        n_nodes = bert_array.shape[0]
        if n_nodes > N_nodes_max:
            raise ValueError(
                f"{transcription_id} has {n_nodes} nodes, "
                f"which exceeds N_nodes_max={N_nodes_max}"
            )
        # Embedding width is taken from the data rather than hard-coded.
        nodes = np.zeros(shape=(N_nodes_max, bert_array.shape[1]))
        nodes[:n_nodes, :] = bert_array
        X_train_nodes.append(nodes)

        # edge features: one adjacency matrix per relation type
        adjacency = np.zeros(shape=(N_vocab_links, N_nodes_max, N_nodes_max), dtype=np.int8)
        with open(graph_dir / f"{transcription_id}.txt", "r") as graphe:
            for line in graphe:
                fields = line.split()
                if not fields:
                    # Skip blank lines instead of failing on fields[1].
                    continue
                source = int(fields[0])
                target = int(fields[2])
                # label2int is 0-based: use it directly as the channel index.
                # Subtracting 1 sent the first label to index -1, which only
                # worked through negative-index wrap-around.
                adjacency[label2int[fields[1]], source, target] = 1
        X_train_edges.append(adjacency)

        # labels
        labels = np.array(training_labels[transcription_id]).reshape(-1, 1)
        padded_labels = np.zeros((N_nodes_max, 1))
        padded_labels[:labels.shape[0], :] = labels
        y_train.append(padded_labels)

    return X_train_nodes, X_train_edges, y_train

In [None]:
# Pad every training transcription to 3000 node slots.
X_train_nodes, X_train_edges, y_train = extract_training(N_nodes_max=3000)

MemoryError: Unable to allocate 208. GiB for an array with shape (97, 32, 3000, 3000) and data type float64

In [None]:
import tensorflow as tf
from spektral.layers import GCNConv
from sklearn.metrics import f1_score
from keras import backend as K


def create_model(nb_canaux, dim_embedding, dim_post_conv, nb_nodes=None):
    """Build and compile a per-node classifier with one GCN branch per edge channel.

    Parameters
    ----------
    nb_canaux : int
        Number of edge channels, i.e. of adjacency matrices per graph.
    dim_embedding : int
        Size of each node feature vector.
    dim_post_conv : int
        Number of output units of every GCNConv branch.
    nb_nodes : int or None, optional
        Number of nodes per graph, used in both input shapes.

    Returns
    -------
    tf.keras.Model
        Model taking ``[node_features, edge_features]`` with shapes
        ``(batch, nb_nodes, dim_embedding)`` and
        ``(batch, nb_canaux, nb_nodes, nb_nodes)`` and returning, for every
        node, a softmax over 2 classes. Compiled with Adam, binary
        cross-entropy and the accuracy metric.
    """
    node_features = tf.keras.Input(shape=(nb_nodes, dim_embedding), name="node_features")
    edge_features = tf.keras.Input(shape=(nb_canaux, nb_nodes, nb_nodes), name="edge_features")

    # One independent GCN layer per channel: each sees the same node features
    # but only the adjacency matrix of its own channel.
    # NOTE(review): the adjacency slices are passed to GCNConv as-is; check
    # against the Spektral documentation whether they should be pre-normalised.
    graph_conv_outputs = []
    for channel in range(nb_canaux):
        adjacency_matrix = edge_features[:, channel, :, :]
        gcn_layer = GCNConv(dim_post_conv, activation='relu', name="GCN_" + str(channel))
        graph_conv_outputs.append(gcn_layer([node_features, adjacency_matrix]))

    # Concatenate the branch outputs along the feature axis, then classify each node.
    concatenated = tf.keras.layers.Concatenate(axis=2, name='concat')(graph_conv_outputs)
    end = tf.keras.layers.Dense(200)(concatenated)
    output = tf.keras.layers.Dense(2, activation='softmax')(end)

    model = tf.keras.Model(inputs=[node_features, edge_features], outputs=output)

    # The nested f1_metric helper that used to live here was never passed to
    # compile(), so it has been removed as dead code.
    # NOTE(review): run_eagerly=True is kept so training behaves as before;
    # confirm whether it is still required.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)

    return model



In [None]:
from keras.utils import to_categorical

# Stack the per-transcription arrays along a new leading (sample) axis.
X_train_edges_np = np.stack(X_train_edges)
X_train_nodes_np = np.stack(X_train_nodes)

# One-hot encode the stacked labels over two classes.
y_train_np = to_categorical(np.stack(y_train), num_classes=2)

print(X_train_nodes_np.shape, X_train_edges_np.shape, y_train_np.shape, sep="\n")

# Small synthetic batch (100 graphs, 5 nodes, 32 edge channels) used only to
# check that the model builds and trains. A seeded generator makes the smoke
# test reproducible across kernel restarts.
smoke_rng = np.random.default_rng(0)
X_train_nodes_rd = smoke_rng.random((100, 5, 384))
X_train_edges_rd = smoke_rng.random((100, 32, 5, 5))
# num_classes is fixed so the one-hot width never depends on the sampled values.
y_train_rd = to_categorical(smoke_rng.integers(2, size=(100, 5, 1)), num_classes=2)

print('random')
print(X_train_nodes_rd.shape)
print(X_train_edges_rd.shape)
print(y_train_rd.shape)

2160

In [None]:
# Smoke test on the synthetic batch: 32 edge channels, 384-dimensional node
# features, 10 units per GCN branch, 5 nodes per graph.
model = create_model(nb_canaux=32, dim_embedding=384, dim_post_conv=10, nb_nodes=5)

model.fit(x=[X_train_nodes_rd, X_train_edges_rd], y=y_train_rd, epochs=2)

(396, 384)
(32, 396, 396)
(396, 1)
random
(5, 384)
(32, 5, 5)
(5, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x18938e7c8e0>

In [None]:
# Model for the real data: 16 edge channels, 384-dimensional node features,
# 200 units per GCN branch, graphs padded to 3000 nodes.
model = create_model(nb_canaux=16, dim_embedding=384, dim_post_conv=200, nb_nodes=3000)

# Train on the first ten transcriptions only.
hist = model.fit(
    x=[X_train_nodes_np[:10], X_train_edges_np[:10]],
    y=y_train_np[:10],
    epochs=10,
)

In [None]:
# Predict on a batch of one: the training transcription at index 3.
p = model.predict([X_train_nodes_np[3:4], X_train_edges_np[3:4]])
# Highest predicted value per class, taken over the node axis.
np.max(p, axis=1)

In [None]:
# Index of the highest-scoring class for every node (last axis of the prediction).
r = p.argmax(axis=2)
r

In [None]:
# Largest predicted class index; 0 means no node was assigned to class 1.
np.max(r)