In [None]:
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras import Model
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
import keras.metrics
import tensorflow as tf
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from torchtext import data
import pandas as pd
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
def read_data(filename):
  """Load and return the object stored in the pickle file `filename`.

  The file is opened in binary mode inside a `with` block so the handle is
  closed even when unpickling raises.

  NOTE(security): `pickle.load` can execute arbitrary code; only use this on
  files produced by this project's own preprocessing.
  """
  with open(filename, 'rb') as dfile:
    return pickle.load(dfile)

In [None]:
# Each pickle is unpacked as a 3-tuple: model inputs, targets and label values.
# The inputs are indexed in later cells as [0] -> POS view, [1] -> dependency view, [2] -> word view.
# presumably Y_* are one-hot matrices (they are later used with categorical cross-entropy and argmax) -- TODO confirm
X_train, Y_train, labels_train = read_data('TCR/data_train_tcr')
X_test, Y_test, labels_test = read_data('TCR/data_test_tcr')

In [None]:
# Vocabulary container for the causal data; indexed later as [0] POS tags, [1] dependency labels, [2] words.
unique_tokens = read_data('TCR/unique_tokens_tcr')

In [None]:
# Vocabulary cap handed to every Tokenizer as `num_words`.
MAX_NB_WORDS = 5000
# Length every index sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 175
# Width of the word-embedding matrix (the GloVe file loaded below is the 300-d one).
EMBEDDING_DIM = 300

# Fraction of the training set used as the validation slice of each fold.
VAL_SIZE = 0.15

In [None]:
# Split the vocabulary container into its three views: POS tags, dependency labels, words.
unique_pos, unique_deps, unique_words = unique_tokens[0], unique_tokens[1], unique_tokens[2]

In [None]:
# One tokenizer per input view; each `word_index*` maps a token to its integer id.
# NOTE(review): Keras' Tokenizer lower-cases and strips punctuation by default -- confirm that the
# keys of pos_vec / dep_vec / word_vec used in the embedding cells match the resulting tokens.

# tokenizer1 / word_index1: POS tags
tokenizer1 = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer1.fit_on_texts(unique_pos)
word_index1 = tokenizer1.word_index

# tokenizer2 / word_index2: words
tokenizer2 = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer2.fit_on_texts(unique_words)
word_index2 = tokenizer2.word_index

# tokenizer3 / word_index3: dependency labels
tokenizer3 = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer3.fit_on_texts(unique_deps)
word_index3 = tokenizer3.word_index

In [None]:
# train: turn each view of the causal training set into fixed-length id sequences
# seq11 = POS view (X_train[0] through the POS tokenizer)
seq1 = tokenizer1.texts_to_sequences(X_train[0])
seq11 = pad_sequences(seq1, maxlen=MAX_SEQUENCE_LENGTH)

# seq12 = word view (X_train[2] through the word tokenizer)
seq2 = tokenizer2.texts_to_sequences(X_train[2])
seq12 = pad_sequences(seq2, maxlen=MAX_SEQUENCE_LENGTH)

# seq13 = dependency view (X_train[1] through the dependency tokenizer)
seq3 = tokenizer3.texts_to_sequences(X_train[1])
seq13 = pad_sequences(seq3, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# test: same encoding as the training cell, applied to the held-out test set.
# seq1/seq2/seq3 are scratch names reused from the training cell.

# seq11_test = POS view
seq1 = tokenizer1.texts_to_sequences(X_test[0])
seq11_test = pad_sequences(seq1, maxlen=MAX_SEQUENCE_LENGTH)

# seq12_test = word view
seq2 = tokenizer2.texts_to_sequences(X_test[2])
seq12_test = pad_sequences(seq2, maxlen=MAX_SEQUENCE_LENGTH)

# seq13_test = dependency view
seq3 = tokenizer3.texts_to_sequences(X_test[1])
seq13_test = pad_sequences(seq3, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# FOLDS 5-fold cross validation
# NOTE(review): each validation slice is VAL_SIZE (15%) of the data, so the five slices cover
# only 75% of the training set -- this is five hold-out splits rather than a full 5-fold partition.
# fold 1: validation = last slice, training = everything before it

# Number of rows in one validation slice (shared by all causal fold cells).
nb_validation_samples = int(VAL_SIZE*seq11.shape[0])

fold1_x_train1 = seq11[:-nb_validation_samples]
fold1_x_train2 = seq12[:-nb_validation_samples]
fold1_x_train3 = seq13[:-nb_validation_samples]
fold1_y_train = Y_train[:-nb_validation_samples]
fold1_lab_train = labels_train[:-nb_validation_samples]

fold1_x_val1 = seq11[-nb_validation_samples:]
fold1_x_val2 = seq12[-nb_validation_samples:]
fold1_x_val3 = seq13[-nb_validation_samples:]
fold1_y_val = Y_train[-nb_validation_samples:]
fold1_lab_val = labels_train[-nb_validation_samples:]

In [None]:
# FOLD 2: validation = second slice counted from the end; training = everything else
# print(nb_validation_samples, seq11.shape, seq12.shape, seq13.shape)
fold2_x_train1 = np.concatenate((seq11[:-2*nb_validation_samples],seq11[-nb_validation_samples:]))
fold2_x_train2 = np.concatenate((seq12[:-2*nb_validation_samples],seq12[-nb_validation_samples:]))
fold2_x_train3 = np.concatenate((seq13[:-2*nb_validation_samples],seq13[-nb_validation_samples:]))
fold2_y_train = np.concatenate((Y_train[:-2*nb_validation_samples], Y_train[-nb_validation_samples:]))
fold2_lab_train = np.concatenate((labels_train[:-2*nb_validation_samples],labels_train[-nb_validation_samples:]))


fold2_x_val1 = seq11[-2*nb_validation_samples:-nb_validation_samples]
fold2_x_val2 = seq12[-2*nb_validation_samples:-nb_validation_samples]
fold2_x_val3 = seq13[-2*nb_validation_samples:-nb_validation_samples]
fold2_y_val = Y_train[-2*nb_validation_samples:-nb_validation_samples]
fold2_lab_val = labels_train[-2*nb_validation_samples:-nb_validation_samples]

In [None]:
# fold 3: validation = third slice counted from the end; training = everything else
fold3_x_train1 = np.concatenate((seq11[:-3*nb_validation_samples],seq11[-2*nb_validation_samples:]))
fold3_x_train2 = np.concatenate((seq12[:-3*nb_validation_samples],seq12[-2*nb_validation_samples:]))
fold3_x_train3 = np.concatenate((seq13[:-3*nb_validation_samples],seq13[-2*nb_validation_samples:]))
fold3_y_train = np.concatenate((Y_train[:-3*nb_validation_samples], Y_train[-2*nb_validation_samples:]))
fold3_lab_train = np.concatenate((labels_train[:-3*nb_validation_samples],labels_train[-2*nb_validation_samples:]))


fold3_x_val1 = seq11[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_x_val2 = seq12[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_x_val3 = seq13[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_y_val = Y_train[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_lab_val = labels_train[-3*nb_validation_samples:-2*nb_validation_samples]

In [None]:
# fold 4: validation = fourth slice counted from the end; training = everything else
fold4_x_train1 = np.concatenate((seq11[:-4*nb_validation_samples],seq11[-3*nb_validation_samples:]))
fold4_x_train2 = np.concatenate((seq12[:-4*nb_validation_samples],seq12[-3*nb_validation_samples:]))
fold4_x_train3 = np.concatenate((seq13[:-4*nb_validation_samples],seq13[-3*nb_validation_samples:]))
fold4_y_train = np.concatenate((Y_train[:-4*nb_validation_samples], Y_train[-3*nb_validation_samples:]))
fold4_lab_train = np.concatenate((labels_train[:-4*nb_validation_samples],labels_train[-3*nb_validation_samples:]))


fold4_x_val1 = seq11[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_x_val2 = seq12[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_x_val3 = seq13[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_y_val = Y_train[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_lab_val = labels_train[-4*nb_validation_samples:-3*nb_validation_samples]

In [None]:
# fold 5: unlike folds 1-4 (which step back from the end), validation here is the FIRST slice
# of the data and training is everything after it.
fold5_x_train1 = seq11[nb_validation_samples:]
fold5_x_train2 = seq12[nb_validation_samples:]
fold5_x_train3 = seq13[nb_validation_samples:]
fold5_y_train = Y_train[nb_validation_samples:]
fold5_lab_train = labels_train[nb_validation_samples:]

fold5_x_val1 = seq11[0:nb_validation_samples]
fold5_x_val2 = seq12[0:nb_validation_samples]
fold5_x_val3 = seq13[0:nb_validation_samples]
fold5_y_val = Y_train[0:nb_validation_samples]
fold5_lab_val = labels_train[0:nb_validation_samples]

In [None]:
# Pickled lookup tables for POS-tag and dependency-label vectors.
# Both are queried with `.get(token)` below; pos_vec values are whitespace-separated strings,
# dep_vec values are used directly as numeric vectors (dep_vec['PADDING'] gives the vector width).
pos_vec = read_data('pos.vector')
dep_vec = read_data('deps.vector')

In [None]:
# Map each (lower-cased) GloVe token to its RAW text line; the word-embedding cell
# parses the floats lazily with `.split()[1:]`, so only vocabulary words are ever converted.
word_vec = {}
word_vec['PADDING'] = 300
# Explicit UTF-8 so the load does not depend on the platform's default encoding,
# and `with` so the handle is released even if reading fails part-way.
with open('glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Only the first field (the token) is needed here; skip blank lines
        # instead of failing on an empty split.
        values = line.split(maxsplit=1)
        if not values:
            continue
        word_vec[values[0].lower()] = line

In [None]:
# pos tags
# One row per POS-tokenizer id (plus one extra row), 28 columns wide.
# Rows for tags missing from pos_vec stay all-zero.
# presumably row 0 is the padding id and 28 is the POS vector width -- TODO confirm against pos.vector

embedding_matrix1 = np.zeros((len(word_index1) + 1, 28))
for word, i in word_index1.items():
    embedding_vector = pos_vec.get(word)
    if embedding_vector is not None:
        # pos_vec values are text lines; the first field is dropped and the rest parsed as floats.
        embedding_matrix1[i] = np.asarray(embedding_vector.split()[1:], dtype='float32')

In [None]:
# word vec
# One row per word-tokenizer id (plus one extra row), EMBEDDING_DIM columns wide.
# Words without a GloVe entry keep an all-zero row.

embedding_matrix2 = np.zeros((len(word_index2) + 1, EMBEDDING_DIM))
for word, i in word_index2.items():
    embedding_vector = word_vec.get(word)
    if embedding_vector is not None:
        # word_vec stores raw GloVe lines: drop the leading token, parse the remaining fields.
        embedding_matrix2[i] = np.asarray(embedding_vector.split()[1:], dtype='float32')

In [None]:
# deps vec
# One row per dependency-tokenizer id (plus one extra row); the width is taken from dep_vec['PADDING'].
# Labels missing from dep_vec keep an all-zero row.

embedding_matrix3 = np.zeros((len(word_index3) + 1, len(dep_vec['PADDING'])))
for word, i in word_index3.items():
    embedding_vector = dep_vec.get(word)
    if embedding_vector is not None:
        # Unlike pos_vec / word_vec, dep_vec values are converted directly (no text splitting).
        embedding_matrix3[i] = np.asarray(embedding_vector, dtype='float32')

In [None]:
def get_class_weights(training_labels):
    """Return 'balanced' class weights ordered as ['CLINK', 'CLINK-R', 'O'].

    Args:
        training_labels: 1-D collection of label values for the training rows.

    Returns:
        A list with one weight per entry of the fixed label set, in that order.
        A label that does not occur in `training_labels` gets weight 0.
    """
    labelset = ['CLINK', 'CLINK-R', 'O']

    present = np.unique(training_labels)
    # Keyword arguments: scikit-learn >= 1.0 rejects `classes` / `y` passed positionally.
    balanced = class_weight.compute_class_weight(class_weight='balanced', classes=present, y=training_labels)
    weight_by_label = dict(zip(present.tolist(), balanced))

    # An explicit lookup replaces the former bare `except`, which also hid unrelated errors.
    return [weight_by_label.get(label, 0) for label in labelset]



# To Extract Causal Features

In [None]:
def defineModel(l1,l2,l3,l4,d1,out,d):
    """Build the (uncompiled) three-input BiLSTM classifier used for causal-feature pre-training.

    Args:
        l1: units of the POS BiLSTM.
        l2: units of the dependency BiLSTM.
        l3: accepted but not used in this version of the function.
        l4: units of the word BiLSTM and of the BiLSTM that merges the three branches.
        d1: width of the hidden dense layer.
        out: number of softmax outputs.
        d: dropout rate of the LSTMs (the word branch uses d+0.1).

    Returns:
        A Model whose inputs are ordered [POS ids, dependency ids, word ids].

    Reads the module-level word_index1/2/3 and embedding_matrix1/2/3. The layer names
    ('bid1causal_sen', ...) are what the joint model later uses to load these weights by name.
    """

    # Frozen embeddings initialised from the precomputed matrices.
    # NOTE(review): the output widths 28 and 77 are hard-coded, while embedding_matrix3's width
    # comes from len(dep_vec['PADDING']) -- confirm these agree.
    embedding_layer1 = Embedding(len(word_index2) + 1,EMBEDDING_DIM,weights=[embedding_matrix2],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer2 = Embedding(len(word_index1) + 1,28,weights=[embedding_matrix1],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer3 = Embedding(len(word_index3) + 1,77,weights=[embedding_matrix3],input_length=MAX_SEQUENCE_LENGTH,trainable=False)

    # word ids -> word embeddings
    wi = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    wi2 = embedding_layer1(wi)

    # POS ids -> POS embeddings
    pi_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    pi2_sen = embedding_layer2(pi_sen)

    # dependency ids -> dependency embeddings
    di_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    di2_sen = embedding_layer3(di_sen)


    lstm1_sen = Bidirectional(LSTM(l1, activation='tanh', dropout=d, return_sequences=True), name = 'bid1causal_sen')(pi2_sen)  #  pos encoded features
    lstm2_sen = Bidirectional(LSTM(l2, activation='tanh', dropout=d, return_sequences=True), name= 'bid2causal_sen')(di2_sen)   #  dep features
    lstm3 = Bidirectional(LSTM(l4, activation='tanh', dropout=d+0.1, return_sequences=True), name = 'bid3causal')(wi2)  #  word features

    # Concatenate the three per-timestep encodings along the feature axis.
    hid_sen = concatenate([lstm1_sen, lstm2_sen, lstm3])    
    
    # Second-level BiLSTM collapses the sequence (no return_sequences) into one vector.
    lstm5 = Bidirectional(LSTM(l4, activation='tanh', dropout=d), name = 'bid3causallstm2_sen')(hid_sen)

    yii = Dense(d1, activation='relu', name='dense1')(lstm5)
    yi = Dense(out, activation="softmax", name='dense2')(yii)
    model = Model(inputs=[pi_sen,di_sen,wi],outputs=yi)
    return model


In [None]:
def getfolddata(num):
  """Return the causal train/validation split for fold `num` (1-5).

  The result is a 6-tuple:
  ([pos, dep, word] training inputs, training targets, training labels,
   [pos, dep, word] validation inputs, validation targets, validation labels).
  Any other `num` yields None.
  """
  # Each entry builds its tuple lazily, so only the requested fold's arrays are touched.
  fold_builders = (
    (1, lambda: ([fold1_x_train1,fold1_x_train3,fold1_x_train2], fold1_y_train, fold1_lab_train, [fold1_x_val1,fold1_x_val3,fold1_x_val2], fold1_y_val, fold1_lab_val)),
    (2, lambda: ([fold2_x_train1,fold2_x_train3,fold2_x_train2], fold2_y_train, fold2_lab_train, [fold2_x_val1,fold2_x_val3,fold2_x_val2], fold2_y_val, fold2_lab_val)),
    (3, lambda: ([fold3_x_train1,fold3_x_train3,fold3_x_train2], fold3_y_train, fold3_lab_train, [fold3_x_val1,fold3_x_val3,fold3_x_val2], fold3_y_val, fold3_lab_val)),
    (4, lambda: ([fold4_x_train1,fold4_x_train3,fold4_x_train2], fold4_y_train, fold4_lab_train, [fold4_x_val1,fold4_x_val3,fold4_x_val2], fold4_y_val, fold4_lab_val)),
    (5, lambda: ([fold5_x_train1,fold5_x_train3,fold5_x_train2], fold5_y_train, fold5_lab_train, [fold5_x_val1,fold5_x_val3,fold5_x_val2], fold5_y_val, fold5_lab_val)),
  )
  for fold_id, build in fold_builders:
    if num == fold_id:
      return build()
  return None

In [None]:
def trainModel():
    """Pre-train one causal-feature model per fold and save each fold's best weights.

    For every fold the model is built with `defineModel`, trained with RMSprop and
    early stopping, restored to the checkpoint with the best validation accuracy and
    written to 'TCR/tcr_causal_fold{fold}.h5' -- the path the joint trainer loads from.

    Returns:
        None. The results are the files written to disk.
    """
    num_classes = 3

    epochs = 50
    batchsize = 64
    # Per-fold hyper-parameters, indexed by fold - 1.
    lrs = [0.001,0.1,0.1,0.001,0.01]
    drop = [0.1,0.2,0.3,0.3,0.2]

    file1 = 'TCR/chkpt/'

    out = num_classes

    for fold in [1,2,3,4,5]:
        checkpoint_filepath = file1 + f'model_causal_tcr_fold{fold}'
        training_data, y_train, training_labels, val_data, y_val, val_labels = getfolddata(fold)
        weights = get_class_weights(training_labels)

        set_nodes = [32, 32, 64, 64, 32]
        l1 = set_nodes[0]
        l2 = set_nodes[1]
        l3 = set_nodes[2]
        l4 = set_nodes[3]
        d1 = set_nodes[4]
        d = drop[fold-1]
        lr = lrs[fold-1]
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)

        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,save_weights_only=True,monitor='val_accuracy',mode='max',save_best_only=True)
        callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)

        model = defineModel(l1,l2,l3,l4,d1,out,d)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Per-class weights go to fit(class_weight=...). compile(loss_weights=...) holds
        # per-OUTPUT coefficients and this model has a single output.
        # assumes target column k corresponds to the k-th label of get_class_weights' label set -- TODO confirm
        model.fit(x = training_data, y = y_train, epochs = epochs, batch_size = batchsize,validation_data=(val_data,y_val), callbacks=[callback, model_checkpoint_callback], class_weight=dict(enumerate(weights)), verbose=0)
        # Restore the best-validation checkpoint before exporting.
        model.load_weights(checkpoint_filepath)
        # Saved under TCR/ so that the joint trainer's load_weights('TCR/tcr_causal_fold...') finds it.
        model.save(f"TCR/tcr_causal_fold{fold}.h5")
        del model
    

In [None]:
# Run the causal pre-training for all five folds (writes one .h5 file per fold).
trainModel()

# To Extract Temporal Features


In [None]:
# Temporal-relation training set, unpacked the same way as the causal one (inputs, targets, label values).
X_train_temp, Y_train_temp, labels_train_temp = read_data('TCR/data_train_temporal_tcr')

In [None]:
# Vocabulary container for the temporal data.
# NOTE(review): this variable is not referenced by any later cell visible in this notebook.
unique_tokens_temp = read_data('TCR/unique_tokens_temporal_tcr')

In [None]:
# NOTE(review): this unpacks `unique_tokens` (the CAUSAL vocabulary), not `unique_tokens_temp`
# loaded in the previous cell, so the *_temp tokenizers are fit on the causal vocabulary.
# The temporal embedding matrices and defineModel below are built from word_index1/2/3 and
# therefore rely on both index spaces being identical -- change this line only together with them.
unique_pos_temp, unique_deps_temp, unique_words_temp = unique_tokens[0], unique_tokens[1], unique_tokens[2]

In [None]:
# Temporal counterparts of tokenizer1/2/3: one tokenizer per input view.

# POS tags
tokenizer1_temp = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer1_temp.fit_on_texts(unique_pos_temp)
word_index1_temp = tokenizer1_temp.word_index

# words
tokenizer2_temp = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer2_temp.fit_on_texts(unique_words_temp)
word_index2_temp = tokenizer2_temp.word_index

# dependency labels
tokenizer3_temp = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer3_temp.fit_on_texts(unique_deps_temp)
word_index3_temp = tokenizer3_temp.word_index

In [None]:
# train: turn each view of the temporal training set into fixed-length id sequences
# seq11_temp = POS view
seq1_temp = tokenizer1_temp.texts_to_sequences(X_train_temp[0])
seq11_temp = pad_sequences(seq1_temp, maxlen=MAX_SEQUENCE_LENGTH)

# seq12_temp = word view
seq2_temp = tokenizer2_temp.texts_to_sequences(X_train_temp[2])
seq12_temp = pad_sequences(seq2_temp, maxlen=MAX_SEQUENCE_LENGTH)

# seq13_temp = dependency view
seq3_temp = tokenizer3_temp.texts_to_sequences(X_train_temp[1])
seq13_temp = pad_sequences(seq3_temp, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# FOLDS 5-fold cross validation (temporal data)
# fold 1: validation = last slice, training = everything before it

# Size the validation slice from the TEMPORAL training set itself. The previous version used
# seq11 (the causal set), which gives wrong slice boundaries whenever the two sets differ in length.
# The remaining temporal fold cells reuse this value.
nb_validation_samples = int(VAL_SIZE*seq11_temp.shape[0])

fold1_x_train1_temp = seq11_temp[:-nb_validation_samples]
fold1_x_train2_temp = seq12_temp[:-nb_validation_samples]
fold1_x_train3_temp = seq13_temp[:-nb_validation_samples]
fold1_y_train_temp = Y_train_temp[:-nb_validation_samples]
fold1_lab_train_temp = labels_train_temp[:-nb_validation_samples]

fold1_x_val1_temp = seq11_temp[-nb_validation_samples:]
fold1_x_val2_temp = seq12_temp[-nb_validation_samples:]
fold1_x_val3_temp = seq13_temp[-nb_validation_samples:]
fold1_y_val_temp = Y_train_temp[-nb_validation_samples:]
fold1_lab_val_temp = labels_train_temp[-nb_validation_samples:]

In [None]:
# FOLD 2 (temporal): validation = second slice counted from the end; training = everything else
# print(nb_validation_samples, seq11.shape, seq12.shape, seq13.shape)
fold2_x_train1_temp = np.concatenate((seq11_temp[:-2*nb_validation_samples],seq11_temp[-nb_validation_samples:]))
fold2_x_train2_temp = np.concatenate((seq12_temp[:-2*nb_validation_samples],seq12_temp[-nb_validation_samples:]))
fold2_x_train3_temp = np.concatenate((seq13_temp[:-2*nb_validation_samples],seq13_temp[-nb_validation_samples:]))
fold2_y_train_temp = np.concatenate((Y_train_temp[:-2*nb_validation_samples], Y_train_temp[-nb_validation_samples:]))
fold2_lab_train_temp = np.concatenate((labels_train_temp[:-2*nb_validation_samples],labels_train_temp[-nb_validation_samples:]))


fold2_x_val1_temp = seq11_temp[-2*nb_validation_samples:-nb_validation_samples]
fold2_x_val2_temp = seq12_temp[-2*nb_validation_samples:-nb_validation_samples]
fold2_x_val3_temp = seq13_temp[-2*nb_validation_samples:-nb_validation_samples]
fold2_y_val_temp = Y_train_temp[-2*nb_validation_samples:-nb_validation_samples]
fold2_lab_val_temp = labels_train_temp[-2*nb_validation_samples:-nb_validation_samples]

In [None]:
# fold 3 (temporal): validation = third slice counted from the end; training = everything else
fold3_x_train1_temp = np.concatenate((seq11_temp[:-3*nb_validation_samples],seq11_temp[-2*nb_validation_samples:]))
fold3_x_train2_temp = np.concatenate((seq12_temp[:-3*nb_validation_samples],seq12_temp[-2*nb_validation_samples:]))
fold3_x_train3_temp = np.concatenate((seq13_temp[:-3*nb_validation_samples],seq13_temp[-2*nb_validation_samples:]))
fold3_y_train_temp = np.concatenate((Y_train_temp[:-3*nb_validation_samples], Y_train_temp[-2*nb_validation_samples:]))
fold3_lab_train_temp = np.concatenate((labels_train_temp[:-3*nb_validation_samples],labels_train_temp[-2*nb_validation_samples:]))


fold3_x_val1_temp = seq11_temp[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_x_val2_temp = seq12_temp[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_x_val3_temp = seq13_temp[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_y_val_temp = Y_train_temp[-3*nb_validation_samples:-2*nb_validation_samples]
fold3_lab_val_temp = labels_train_temp[-3*nb_validation_samples:-2*nb_validation_samples]

In [None]:
# fold 4 (temporal): validation = fourth slice counted from the end; training = everything else
fold4_x_train1_temp = np.concatenate((seq11_temp[:-4*nb_validation_samples],seq11_temp[-3*nb_validation_samples:]))
fold4_x_train2_temp = np.concatenate((seq12_temp[:-4*nb_validation_samples],seq12_temp[-3*nb_validation_samples:]))
fold4_x_train3_temp = np.concatenate((seq13_temp[:-4*nb_validation_samples],seq13_temp[-3*nb_validation_samples:]))
fold4_y_train_temp = np.concatenate((Y_train_temp[:-4*nb_validation_samples], Y_train_temp[-3*nb_validation_samples:]))
fold4_lab_train_temp = np.concatenate((labels_train_temp[:-4*nb_validation_samples],labels_train_temp[-3*nb_validation_samples:]))


fold4_x_val1_temp = seq11_temp[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_x_val2_temp = seq12_temp[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_x_val3_temp = seq13_temp[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_y_val_temp = Y_train_temp[-4*nb_validation_samples:-3*nb_validation_samples]
fold4_lab_val_temp = labels_train_temp[-4*nb_validation_samples:-3*nb_validation_samples]

In [None]:
# fold 5 (temporal): validation is the FIRST slice of the data; training is everything after it.
fold5_x_train1_temp = seq11_temp[nb_validation_samples:]
fold5_x_train2_temp = seq12_temp[nb_validation_samples:]
fold5_x_train3_temp = seq13_temp[nb_validation_samples:]
fold5_y_train_temp = Y_train_temp[nb_validation_samples:]
fold5_lab_train_temp = labels_train_temp[nb_validation_samples:]

fold5_x_val1_temp = seq11_temp[0:nb_validation_samples]
fold5_x_val2_temp = seq12_temp[0:nb_validation_samples]
fold5_x_val3_temp = seq13_temp[0:nb_validation_samples]
fold5_y_val_temp = Y_train_temp[0:nb_validation_samples]
fold5_lab_val_temp = labels_train_temp[0:nb_validation_samples]

In [None]:
def getfolddata_temp(num):
  """Return the temporal train/validation split for fold `num` (1-5).

  The result is a 6-tuple:
  ([pos, dep, word] training inputs, training targets, training labels,
   [pos, dep, word] validation inputs, validation targets, validation labels).
  Any other `num` yields None.
  """
  # Each entry builds its tuple lazily, so only the requested fold's arrays are touched.
  fold_builders = (
    (1, lambda: ([fold1_x_train1_temp,fold1_x_train3_temp,fold1_x_train2_temp], fold1_y_train_temp, fold1_lab_train_temp, [fold1_x_val1_temp,fold1_x_val3_temp,fold1_x_val2_temp], fold1_y_val_temp, fold1_lab_val_temp)),
    (2, lambda: ([fold2_x_train1_temp,fold2_x_train3_temp,fold2_x_train2_temp], fold2_y_train_temp, fold2_lab_train_temp, [fold2_x_val1_temp,fold2_x_val3_temp,fold2_x_val2_temp], fold2_y_val_temp, fold2_lab_val_temp)),
    (3, lambda: ([fold3_x_train1_temp,fold3_x_train3_temp,fold3_x_train2_temp], fold3_y_train_temp, fold3_lab_train_temp, [fold3_x_val1_temp,fold3_x_val3_temp,fold3_x_val2_temp], fold3_y_val_temp, fold3_lab_val_temp)),
    (4, lambda: ([fold4_x_train1_temp,fold4_x_train3_temp,fold4_x_train2_temp], fold4_y_train_temp, fold4_lab_train_temp, [fold4_x_val1_temp,fold4_x_val3_temp,fold4_x_val2_temp], fold4_y_val_temp, fold4_lab_val_temp)),
    (5, lambda: ([fold5_x_train1_temp,fold5_x_train3_temp,fold5_x_train2_temp], fold5_y_train_temp, fold5_lab_train_temp, [fold5_x_val1_temp,fold5_x_val3_temp,fold5_x_val2_temp], fold5_y_val_temp, fold5_lab_val_temp)),
  )
  for fold_id, build in fold_builders:
    if num == fold_id:
      return build()
  return None

In [None]:
# pos tags
# NOTE(review): this rebuilds embedding_matrix1 from word_index1 (the causal POS index), exactly as
# the earlier POS-embedding cell does; it does not use word_index1_temp.

embedding_matrix1 = np.zeros((len(word_index1) + 1, 28))
for word, i in word_index1.items():
    embedding_vector = pos_vec.get(word)
    if embedding_vector is not None:
        # pos_vec values are text lines; the first field is dropped and the rest parsed as floats.
        embedding_matrix1[i] = np.asarray(embedding_vector.split()[1:], dtype='float32')

In [None]:
# word vec
# NOTE(review): this rebuilds embedding_matrix2 from word_index2 (the causal word index), exactly as
# the earlier word-embedding cell does; it does not use word_index2_temp.

embedding_matrix2 = np.zeros((len(word_index2) + 1, EMBEDDING_DIM))
for word, i in word_index2.items():
    embedding_vector = word_vec.get(word)
    if embedding_vector is not None:
        # word_vec stores raw GloVe lines: drop the leading token, parse the remaining fields.
        embedding_matrix2[i] = np.asarray(embedding_vector.split()[1:], dtype='float32')

In [None]:
# deps vec
# NOTE(review): this rebuilds embedding_matrix3 from word_index3 (the causal dependency index),
# exactly as the earlier dependency-embedding cell does; it does not use word_index3_temp.

embedding_matrix3 = np.zeros((len(word_index3) + 1, len(dep_vec['PADDING'])))
for word, i in word_index3.items():
    embedding_vector = dep_vec.get(word)
    if embedding_vector is not None:
        # dep_vec values are converted directly (no text splitting).
        embedding_matrix3[i] = np.asarray(embedding_vector, dtype='float32')

In [None]:
def get_class_weights_temp(training_labels):
    """Return doubled 'balanced' class weights ordered as ['AFTER', 'BEFORE', 'SIMULTANEOUS'].

    Args:
        training_labels: 1-D collection of label values for the training rows.

    Returns:
        A list with one weight per entry of the fixed label set, in that order.
        Each weight is twice the scikit-learn 'balanced' weight; a label that does
        not occur in `training_labels` gets weight 0.
    """
    labelset = ['AFTER', 'BEFORE', 'SIMULTANEOUS']

    present = np.unique(training_labels)
    # Keyword arguments: scikit-learn >= 1.0 rejects `classes` / `y` passed positionally.
    balanced = class_weight.compute_class_weight(class_weight='balanced', classes=present, y=training_labels)
    weight_by_label = dict(zip(present.tolist(), balanced))

    # An explicit membership test replaces the former bare `except`, which also hid unrelated errors.
    return [weight_by_label[label]*2 if label in weight_by_label else 0 for label in labelset]

In [None]:
def defineModel(l1,l2,l3,l4,d1,out,d):
    """Build the (uncompiled) three-input BiLSTM classifier used for temporal-feature pre-training.

    NOTE: this re-defines `defineModel`; from this cell on the name refers to this version.
    The architecture mirrors the causal one, but every layer carries a '...temp...' name so the
    joint model can load these weights by name.

    Args:
        l1: units of the POS BiLSTM.
        l2: units of the dependency BiLSTM.
        l3: accepted but not used in this version of the function.
        l4: units of the word BiLSTM and of the BiLSTM that merges the three branches.
        d1: width of the hidden dense layer.
        out: number of softmax outputs.
        d: dropout rate of the LSTMs (the word branch uses d+0.1).

    Returns:
        A Model whose inputs are ordered [POS ids, dependency ids, word ids].
    """

    # Frozen embeddings initialised from the module-level matrices (built from word_index1/2/3).
    # NOTE(review): the output widths 28 and 77 are hard-coded -- confirm they match the matrices.
    embedding_layer1 = Embedding(len(word_index2) + 1,EMBEDDING_DIM,weights=[embedding_matrix2],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer2 = Embedding(len(word_index1) + 1,28,weights=[embedding_matrix1],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer3 = Embedding(len(word_index3) + 1,77,weights=[embedding_matrix3],input_length=MAX_SEQUENCE_LENGTH,trainable=False)

    # word ids -> word embeddings
    wi = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    wi2 = embedding_layer1(wi)

    # POS ids -> POS embeddings
    pi_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    pi2_sen = embedding_layer2(pi_sen)

    # dependency ids -> dependency embeddings
    di_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    di2_sen = embedding_layer3(di_sen)

    lstm1_sen = Bidirectional(LSTM(l1, activation='tanh', dropout=d, return_sequences=True), name = 'bid1temp_sen')(pi2_sen)  #  pos features
    lstm2_sen = Bidirectional(LSTM(l2, activation='tanh', dropout=d, return_sequences=True), name= 'bid2temp_sen')(di2_sen)   #  dep features
    lstm3 = Bidirectional(LSTM(l4, activation='tanh', dropout=d+0.1, return_sequences=True), name = 'bid3temp')(wi2)  #  word features

    # Concatenate the three per-timestep encodings along the feature axis.
    hid_sen = concatenate([lstm1_sen, lstm2_sen, lstm3])    
    
    # Second-level BiLSTM collapses the sequence (no return_sequences) into one vector.
    lstm5 = Bidirectional(LSTM(l4, activation='tanh', dropout=d), name = 'bid3templstm2_sen')(hid_sen)

    yii = Dense(d1, activation='relu', name='dense1temp')(lstm5)
    yi = Dense(out, activation="softmax", name='dense2temp')(yii)
    model = Model(inputs=[pi_sen,di_sen,wi],outputs=yi)
    return model


In [None]:
def trainModel_temporal():
    """Pre-train one temporal-feature model per fold and save each fold's best weights.

    For every fold the model is built with `defineModel`, trained with Adam and early
    stopping, restored to the checkpoint with the best validation accuracy and written to
    'TCR/tcr_temp_fold{fold}.h5' -- the path the joint trainer loads from.

    Returns:
        None. The results are the files written to disk.
    """
    num_classes = 3

    epochs = 50
    batchsize = 64
    lr = 0.005
    d = 0.3

    file1 = 'TCR/chkpt/'

    out = num_classes

    for fold in [1,2,3,4,5]:
        checkpoint_filepath = file1 + f'model_temp_tcr_fold{fold}'
        training_data, y_train, training_labels, val_data, y_val, val_labels = getfolddata_temp(fold)
        weights = get_class_weights_temp(training_labels)

        set_nodes = [32, 32, 64, 64, 32]
        l1 = set_nodes[0]
        l2 = set_nodes[1]
        l3 = set_nodes[2]
        l4 = set_nodes[3]
        d1 = set_nodes[4]
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,save_weights_only=True,monitor='val_accuracy',mode='max',save_best_only=True)
        callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)

        model = defineModel(l1,l2,l3,l4,d1,out,d)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Per-class weights go to fit(class_weight=...). compile(loss_weights=...) holds
        # per-OUTPUT coefficients and this model has a single output.
        # assumes target column k corresponds to the k-th label of get_class_weights_temp's label set -- TODO confirm
        model.fit(x = training_data, y = y_train, epochs = epochs, batch_size = batchsize,validation_data=(val_data,y_val), callbacks=[callback, model_checkpoint_callback], class_weight=dict(enumerate(weights)))
        # Restore the best-validation checkpoint before exporting.
        model.load_weights(checkpoint_filepath)
        # Saved under TCR/ so that the joint trainer's load_weights('TCR/tcr_temp_fold...') finds it.
        model.save(f"TCR/tcr_temp_fold{fold}.h5")
        del model
    

In [None]:
# Run the temporal pre-training for all five folds (writes one .h5 file per fold).
trainModel_temporal()

# Joint Model for Causal Relation Classification

In [None]:
def get_class_weights(training_labels):
    """Return 'balanced' class weights ordered as ['CLINK', 'CLINK-R', 'O'].

    Args:
        training_labels: 1-D collection of label values for the training rows.

    Returns:
        A list with one weight per entry of the fixed label set, in that order.
        A label that does not occur in `training_labels` gets weight 0.
    """
    labelset = ['CLINK', 'CLINK-R', 'O']

    present = np.unique(training_labels)
    # Keyword arguments: scikit-learn >= 1.0 rejects `classes` / `y` passed positionally.
    balanced = class_weight.compute_class_weight(class_weight='balanced', classes=present, y=training_labels)
    weight_by_label = dict(zip(present.tolist(), balanced))

    # An explicit lookup replaces the former bare `except`, which also hid unrelated errors.
    return [weight_by_label.get(label, 0) for label in labelset]

In [None]:
def defineModel(l1,l2,l3,l4,d1,out,d):
    """Build the (uncompiled) joint model: frozen temporal + causal encoders, trainable dense head.

    NOTE: this re-defines `defineModel`; from this cell on the name refers to the joint version.
    The encoder layer names match those of the pre-trained temporal and causal models so that
    `load_weights(..., by_name=True)` can fill them in.

    Args:
        l1: units of each POS BiLSTM.
        l2: units of each dependency BiLSTM.
        l3: kept for interface compatibility; not used (see the note on 'bid3causal' below).
        l4: units of each word BiLSTM and of each merging BiLSTM.
        d1: width of the hidden dense layer.
        out: number of softmax outputs.
        d: dropout rate of the LSTMs (temporal word branch uses d+0.1, causal word branch 0.45).

    Returns:
        A Model whose inputs are ordered [POS ids, dependency ids, word ids].
    """

    def frozen_bilstm(units, rate, name, return_sequences=True):
        # `trainable` has to be set on the LAYER. The previous version assigned it to the
        # layer's output tensor, which has no effect, so the encoders were never frozen.
        layer = Bidirectional(LSTM(units, activation='tanh', dropout=rate, return_sequences=return_sequences), name=name)
        layer.trainable = False
        return layer

    # Frozen embeddings initialised from the module-level matrices.
    embedding_layer1 = Embedding(len(word_index2) + 1,EMBEDDING_DIM,weights=[embedding_matrix2],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer2 = Embedding(len(word_index1) + 1,28,weights=[embedding_matrix1],input_length=MAX_SEQUENCE_LENGTH,trainable=False)
    embedding_layer3 = Embedding(len(word_index3) + 1,77,weights=[embedding_matrix3],input_length=MAX_SEQUENCE_LENGTH,trainable=False)

    # word ids -> word embeddings
    wi = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    wi2 = embedding_layer1(wi)

    # POS ids -> POS embeddings
    pi_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    pi2_sen = embedding_layer2(pi_sen)

    # dependency ids -> dependency embeddings
    di_sen = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    di2_sen = embedding_layer3(di_sen)

    # --- temporal encoder (weights come from the temporal pre-training) ---
    lstm1temp = frozen_bilstm(l1, d, 'bid1temp_sen')(pi2_sen)
    lstm2temp = frozen_bilstm(l2, d, 'bid2temp_sen')(di2_sen)
    lstm3temp = frozen_bilstm(l4, d+0.1, 'bid3temp')(wi2)

    hid_temp = concatenate([lstm1temp, lstm2temp, lstm3temp])

    lstm4temp = frozen_bilstm(l4, d, 'bid3templstm2_sen', return_sequences=False)(hid_temp)

    # --- causal encoder (weights come from the causal pre-training) ---
    lstm1causal = frozen_bilstm(l1, d, 'bid1causal_sen')(pi2_sen)
    lstm2causal = frozen_bilstm(l2, d, 'bid2causal_sen')(di2_sen)
    # The pre-trained 'bid3causal' layer is built with l4 units, so l4 is used here too.
    # (This was l3; every call site passes l3 == l4 == 64, and by-name loading needs equal shapes.)
    lstm3causal = frozen_bilstm(l4, 0.45, 'bid3causal')(wi2)

    hid_causal = concatenate([lstm1causal, lstm2causal, lstm3causal])

    lstm4causal = frozen_bilstm(l4, d, 'bid3causallstm2_sen', return_sequences=False)(hid_causal)

    # --- trainable classification head on the concatenated sentence vectors ---
    merged_features = concatenate([lstm4temp, lstm4causal])

    yii = Dense(d1, activation='relu', name='denselayer1')(merged_features)
    yi = Dense(out, activation="softmax", name='denselayer2')(yii)
    model = Model(inputs=[pi_sen,di_sen,wi],outputs=yi)
    return model


# Train

In [None]:
def trainModelJoint():
    """Train the joint causal classifier for each fold on top of the pre-trained encoders.

    For every fold the joint model is built, its encoder layers are filled by name from
    'TCR/tcr_causal_fold{fold}.h5' and 'TCR/tcr_temp_fold{fold}.h5', the model is trained on
    the causal fold data, restored to its best-validation checkpoint and written to
    'TCR/tcr_joint_fold{fold}.h5'.

    Returns:
        None. The results are the files written to disk.
    """
    num_classes = 3

    epochs = 50
    # Per-fold hyper-parameters, indexed by fold - 1.
    batchsizes = [ 64, 64, 128, 128, 128]
    lrs = [0.001, 0.001, 0.001, 0.005, 0.01]
    drop = [0.1, 0.2, 0.1, 0.1, 0.1 ]

    file1 = 'TCR/chkpt/'

    out = num_classes

    for fold in [1,2,3,4,5]:
        checkpoint_filepath = file1 + f'model_joint_tcr_fold{fold}'
        training_data, y_train, training_labels, val_data, y_val, val_labels = getfolddata(fold)
        # The data comes from getfolddata (causal folds), so the causal label set applies.
        # The previous call to get_class_weights_temp looked up AFTER/BEFORE/SIMULTANEOUS,
        # which yields weight 0 for every label that is not one of those three.
        weights = get_class_weights(training_labels)

        set_nodes = [32, 32, 64, 64, 32]
        l1 = set_nodes[0]
        l2 = set_nodes[1]
        l3 = set_nodes[2]
        l4 = set_nodes[3]
        d1 = set_nodes[4]
        lr = lrs[fold-1]
        d = drop[fold-1]
        batchsize = batchsizes[fold-1]

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,save_weights_only=True,monitor='val_accuracy',mode='max',save_best_only=True)
        callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)

        model = defineModel(l1,l2,l3,l4,d1,out,d)

        model.load_weights(f'TCR/tcr_causal_fold{fold}.h5', by_name =True) # to extract causal features
        model.load_weights(f'TCR/tcr_temp_fold{fold}.h5', by_name =True) # to extract temporal features

        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Per-class weights go to fit(class_weight=...). compile(loss_weights=...) holds
        # per-OUTPUT coefficients and this model has a single output.
        # assumes target column k corresponds to the k-th label of get_class_weights' label set -- TODO confirm
        model.fit(x = training_data, y = y_train, epochs = epochs, batch_size = batchsize,validation_data=(val_data,y_val), callbacks=[callback, model_checkpoint_callback], class_weight=dict(enumerate(weights)))
        # Restore the best-validation checkpoint before exporting.
        model.load_weights(checkpoint_filepath)
        model.save(f"TCR/tcr_joint_fold{fold}.h5")
        del model
    

In [None]:
# Train the joint model for all five folds.
# NOTE: trainModelJoint has no return statement, so `model` is None after this line;
# the evaluation cell below rebuilds the model and loads the saved weights instead.
model = trainModelJoint()

In [None]:
#save all best models

# Best Model Results

In [None]:
def format_report(report, scores, accuracy, fold):
  """Print the test-set summary table for one fold.

  Args:
    report: mapping with entries '0' and '1', each holding 'precision', 'recall' and
      'f1-score' as fractions (class order: [ 'causes' ,'caused by', 'OTHER' ]).
    scores: sequence whose first three items are micro precision, recall and f1 (fractions).
    accuracy: overall accuracy as a fraction.
    fold: fold number shown in the heading.

  All figures are printed as percentages rounded to one decimal, right-aligned in 10 columns.
  """
  def cell(value):
    return '{0:>10}'.format(value)

  def pct(fraction):
    return round(fraction*100.0, 1)

  def emit(prefix, first, second, third):
    print(f"{prefix} {cell(first)} {cell(second)} {cell(third)}")

  print("")
  print(f"Test set result for fold {fold}")
  emit("             ", 'precision', 'recall', 'f1-score')
  for prefix, key in (("       causes", '0'), ("    caused by", '1')):
    emit(prefix, pct(report[key]['precision']), pct(report[key]['recall']), pct(report[key]['f1-score']))
  print("")
  emit("     accuracy", '', '', round(accuracy*100, 1))
  emit("    micro avg", pct(scores[0]), pct(scores[1]), pct(scores[2]))

In [None]:
# Evaluate each fold's saved joint model on the test set.
# Dropout per fold, matching the values used in trainModelJoint.
drop = [0.1, 0.2, 0.1, 0.1, 0.1 ]

for fold in [1,2,3,4,5]:
  
  # Rebuild the joint architecture and load this fold's saved weights by layer name.
  model = defineModel(32,32,64,64,32,3, drop[fold-1])
  model.load_weights(f'TCR/tcr_joint_fold{fold}.h5', by_name=True)
  
  # Input order must match the model's inputs: [POS ids, dependency ids, word ids].
  data_test = [seq11_test,seq13_test,seq12_test]
  classes = np.argmax(model.predict(x = data_test), axis=-1)
  y_test_classes = Y_test.argmax(1)
  y_pred_classes = classes

  accuracy = accuracy_score(y_test_classes, y_pred_classes)
  # NOTE(review): `labels` lists 0..13 although the model has 3 outputs -- looks like a leftover
  # from another task; confirm whether the micro average is meant to include every class.
  report = classification_report(y_true=y_test_classes, y_pred=y_pred_classes, zero_division=0, output_dict=True, digits= 3, labels=[0,1,2,3,4,5,6,7,8,9,10,11,12,13])
  scores = precision_recall_fscore_support(y_true=y_test_classes, y_pred=y_pred_classes, average='micro', labels=[0,1,2,3,4,5,6,7,8,9,10,11,12,13])
  format_report(report, scores, accuracy, fold)


Test set result for fold 1
               precision     recall   f1-score
       causes       76.2      100.0       86.5
    caused by      100.0       16.7       28.6

     accuracy                             77.3
    micro avg       77.3       77.3       77.3

Test set result for fold 2
               precision     recall   f1-score
       causes       83.3       93.8       88.2
    caused by       75.0       50.0       60.0

     accuracy                             81.8
    micro avg       81.8       81.8       81.8

Test set result for fold 3
               precision     recall   f1-score
       causes       85.7       93.8       89.6
    caused by       77.8       58.3       66.7

     accuracy                             84.1
    micro avg       84.1       84.1       84.1

Test set result for fold 4
               precision     recall   f1-score
       causes       76.2      100.0       86.5
    caused by      100.0       16.7       28.6

     accuracy                         