In [2]:
import json
from pathlib import Path
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm 

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("data/training")
path_to_test = Path("data/test")

In [5]:
#####
# training and test sets of transcription ids
#####
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

In [6]:
graph_links_labels= set()
for id in training_set:
    with open(path_to_training / f"{id}.txt", "r") as graphe:
        for line in graphe:
            l = line.split()
            graph_links_labels.add(l[1])
L = list(graph_links_labels)
int2label = {indice: valeur for indice, valeur in enumerate(L)}
label2int = {valeur: indice for indice, valeur in enumerate(L)}
label2int
N_vocab_links = len(L)
print(N_vocab_links)

16


In [7]:
def feature_extract_train(transcription_id):
    path_to_training = Path("data/training")
    bert_array = np.load('feature-extraction/bert/training/' + transcription_id +'.npy')
    N_nodes = bert_array.shape[0]
    edges = np.zeros(shape=(2*N_vocab_links,N_nodes,N_nodes),dtype=bool)
    with open(path_to_training / f"{transcription_id}.txt", "r") as graphe:
        for line in graphe:
            l = line.split()
            i = int(l[0])
            j =  int(l[2])
            edge_type = label2int[l[1]] - 1
            #print('from', i, 'to', j, 'type', edge_type)
            edges[edge_type,i,j] = True
            edges[N_vocab_links + edge_type, j, i] = True

    return bert_array, edges

def get_labels(transcription_id):
    with open("data/training_labels.json", "r") as file:
        training_labels = json.load(file)
    return np.array(training_labels[transcription_id]).reshape(-1,1)

In [8]:
X_train_nodes = []
X_train_edges = []
y_train = []
for id in training_set:
    nodes, edges = feature_extract_train(id)
    y = get_labels(id)
    X_train_nodes.append(nodes)
    X_train_edges.append(edges)
    y_train.append(y)


In [9]:
import tensorflow as tf
from spektral.layers import GCNConv

def create_model(nb_canaux, dim_embedding, dim_post_conv, nb_nodes=None):

    node_features = tf.keras.Input(shape=(nb_nodes, dim_embedding), name="node_features")
    edge_features = tf.keras.Input(shape=(nb_canaux, nb_nodes, nb_nodes), name="edge_features")

    graph_conv_outputs = []
    for i in range(nb_canaux):
        adjacency_matrix = edge_features[:,i,:,:]
        GCN = GCNConv(dim_post_conv, activation='relu', name="GCN_"+str(i))
        graph_conv_output = GCN([node_features, adjacency_matrix])
        graph_conv_outputs.append(graph_conv_output)

    concatenated = tf.keras.layers.Concatenate(axis=2, name='concat')(graph_conv_outputs)
    end = tf.keras.layers.Dense(200)(concatenated)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(end)

    model = tf.keras.Model(inputs=[node_features, edge_features], outputs=output)
    return model


model = create_model(2*N_vocab_links, 384, 10, None)

model.compile(optimizer='adam',  
              loss='binary_crossentropy',  
              metrics=['accuracy'])


In [11]:
print(X_train_nodes[0].shape)
print(X_train_edges[0].shape)
print(y_train[0].shape)

X_train_nodes = np.random.random((100, 5, 384))
X_train_edges = np.random.random((100, 5, 384))
y_train = np.random.randint(2, size=(100, 5, 1))

model.fit([X_train_nodes, X_train_edges], y_train, epochs=2,batch_size=50)

(396, 384)
(32, 396, 396)
(396, 1)


ValueError: Data cardinality is ambiguous:
  x sizes: 396, 897, 924, 1207, 126, 815, 1057, 669, 403, 806, 869, 1047, 415, 547, 765, 518, 339, 856, 860, 1364, 791, 593, 785, 1082, 229, 648, 722, 563, 345, 738, 915, 450, 248, 672, 718, 755, 357, 973, 960, 1143, 481, 713, 557, 647, 585, 838, 722, 1377, 254, 772, 515, 379, 782, 754, 497, 358, 641, 869, 1095, 196, 672, 839, 886, 249, 614, 572, 357, 838, 870, 1148, 336, 422, 667, 717, 542, 1318, 1156, 2160, 471, 901, 934, 1328, 714, 1207, 1275, 1215, 212, 524, 689, 649, 486, 985, 1017, 911, 373, 1487, 1283, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
  y sizes: 100
Make sure all arrays contain the same number of samples.

In [None]:

def create_model(nb_canaux, dim_embedding, dim_post_conv, nb_nodes=None):

    imput = tf.keras.Input(shape=(nb_nodes, nb_nodes, dim_embedding,nb_canaux), name="imput")
    nodes = imput[:,:,0,:,0]

    graph_conv_outputs = []
    for i in range(nb_canaux):
        adjacency_matrix = imput[:,:,:,0,i]
        GCN = GCNConv(dim_post_conv, activation='relu', name="GCN_"+str(i))
        graph_conv_output = GCN([nodes, adjacency_matrix])
        graph_conv_outputs.append(graph_conv_output)

    concatenated = tf.keras.layers.Concatenate(axis=2, name='concat')(graph_conv_outputs)
    end = tf.keras.layers.Dense(200)(concatenated)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(end)

    model = tf.keras.Model(inputs=imput, outputs=output)
    return model