In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, concatenate, Dense, Flatten, Embedding, Bidirectional, Dropout, GlobalMaxPooling1D, BatchNormalization, SpatialDropout1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

2024-04-04 15:22:03.896703: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Import dataset
import pandas as pd
from IPython.display import display
from sklearn.utils import shuffle

df_1 = pd.read_csv(r"../model_training/training_data/IBM30K_shuffled_n.csv")
df_2 = pd.read_csv(r"../model_training/training_data/AIFdb_Toni.csv")

df = pd.concat([df_1, df_2], axis=0)

df = shuffle(df).reset_index(drop=True)


# Split features and labels
X = df[["argument1", "argument2"]].T.reset_index(drop=True).T # Assume output is given
y = df[["relationship"]].T.reset_index(drop=True).T
display(X)
display(y)

Unnamed: 0,0,1
0,"""he is no king""","""The president is not allowed by the law to co..."
1,"""That's an interesting subject, I had not hear...","""That happens, too. For example, our gene for ..."
2,"""К5 Стало быть, вы идете против своего народа?""","""Б5 И все-таки это ничего не доказывает."""
3,judicial activism subverts the law and obstruc...,We should limit judicial activism
4,zero tolerance deters children from misbehavin...,We should adopt a zero-tolerance policy in sch...
...,...,...
81445,"""Are you?""","""Are you an `ergonomics' expert? No, you are n..."
81446,"""В этом есть доля правды, так же как и в том, ...","""Суд внимательно изучил представленные материа..."
81447,"""So, had I used the lock, that crime would not...","""In reality, those who are 'retarded' can do n..."
81448,entrapment allows law enforcement to pick up c...,Entrapment should be legalized


Unnamed: 0,0
0,s
1,s
2,a
3,s
4,s
...,...
81445,a
81446,s
81447,n
81448,s


In [3]:
# Split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split pairs of sentences
sentences_train_1 = X_train[0].to_numpy()
sentences_train_2 = X_train[1].to_numpy()

sentences_test_1 = X_test[0].to_numpy()
sentences_test_2 = X_test[1].to_numpy()

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# prepare target
le = LabelEncoder()
le.fit(np.ravel(y))
y_train_enc = le.transform(np.ravel(y_train))
y_test_enc = le.transform(np.ravel(y_test))
print(y_test_enc)

# one hot encoded
y_train_enc = to_categorical(y_train_enc)
y_test_enc = to_categorical(y_test_enc)

print(y_train_enc, y_test_enc)

[1 0 0 ... 1 2 1]
[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]] [[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Flatten features for Glove fitting
texts = np.concatenate([X[0], X[1]])

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# Encode training data
sequences_train_1 = tokenizer.texts_to_sequences(sentences_train_1)
sequences_train_2 = tokenizer.texts_to_sequences(sentences_train_2)

# Encode testing data
sequences_test_1 = tokenizer.texts_to_sequences(sentences_test_1)
sequences_test_2 = tokenizer.texts_to_sequences(sentences_test_2)

# Padding sequences to have the same length
max_len = 50
print(max_len)

padded_sequences_train_1 = pad_sequences(sequences_train_1, maxlen=max_len, padding='post')
padded_sequences_train_2 = pad_sequences(sequences_train_2, maxlen=max_len, padding='post')

padded_sequences_test_1 = pad_sequences(sequences_test_1, maxlen=max_len, padding='post')
padded_sequences_test_2 = pad_sequences(sequences_test_2, maxlen=max_len, padding='post')

#print(padded_sequences_test_2)

50


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Assuming NLTK has been set up with the necessary data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Function to calculate syntactic features for a given text
def calculate_syntactic_features(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens, tagset='universal')  # Using universal tagset for simplicity

    # Count occurrences of POS tags
    pos_counts = Counter(tag for word, tag in tagged)

    # Calculate features
    features = {
        "number_of_words": len(tokens),
        "nouns": pos_counts['NOUN'],
        "verbs": pos_counts['VERB'],
        "first_person_singular": sum(1 for word, tag in tagged if word.lower() in ['i']),
        "second_person_singular": sum(1 for word, tag in tagged if word.lower() in ['you']),
        "second_person_plural": sum(1 for word, tag in tagged if word.lower() in ['you']),
        "third_person_singular": sum(1 for word, tag in tagged if word.lower() in ['he', 'she', 'it']),
        "third_person_plural": sum(1 for word, tag in tagged if word.lower() in ['they']),
        "first_person_plural": sum(1 for word, tag in tagged if word.lower() in ['we']),
        "modals": pos_counts['VERB'],  # Simplification, real modal counting might differ
        "modifiers": pos_counts['ADV'] + pos_counts['ADJ'],
        "lexical_diversity": len(set(tokens)) / len(tokens) if tokens else 0
    }

    return features

# Apply function to each text in dataset
features_matrix = [calculate_syntactic_features(text) for text in texts]

# Print features for demonstration
features_matrix


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/henrichevreux/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/henrichevreux/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/henrichevreux/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[{'number_of_words': 6,
  'nouns': 1,
  'verbs': 1,
  'first_person_singular': 0,
  'second_person_singular': 0,
  'second_person_plural': 0,
  'third_person_singular': 1,
  'third_person_plural': 0,
  'first_person_plural': 0,
  'modals': 1,
  'modifiers': 0,
  'lexical_diversity': 1.0},
 {'number_of_words': 25,
  'nouns': 6,
  'verbs': 4,
  'first_person_singular': 1,
  'second_person_singular': 0,
  'second_person_plural': 0,
  'third_person_singular': 0,
  'third_person_plural': 0,
  'first_person_plural': 0,
  'modals': 4,
  'modifiers': 3,
  'lexical_diversity': 0.96},
 {'number_of_words': 12,
  'nouns': 7,
  'verbs': 0,
  'first_person_singular': 0,
  'second_person_singular': 0,
  'second_person_plural': 0,
  'third_person_singular': 0,
  'third_person_plural': 0,
  'first_person_plural': 0,
  'modals': 0,
  'modifiers': 1,
  'lexical_diversity': 1.0},
 {'number_of_words': 8,
  'nouns': 4,
  'verbs': 1,
  'first_person_singular': 0,
  'second_person_singular': 0,
  'second_pers

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Convert list of dictionaries to DataFrame
features_df = pd.DataFrame(features_matrix)

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the feature data to scale it
scaled_features = scaler.fit_transform(features_df)

# `scaled_features` is now a NumPy array. If you need it back in DataFrame format:
scaled_features_df = pd.DataFrame(scaled_features, columns=features_df.columns)

# Now `scaled_features_df` can be used in your machine learning models


In [8]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
import numpy as np
# Load pre-trained GloVe embeddings (you need to download the GloVe file)
glove_embeddings_index = {}
with open('glove/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs

# Create an embedding matrix using GloVe for words in our tokenizer
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [19]:
# Create input layers
#input_shape = (50, 100)
input_shape = (max_len,)
input1 = Input(shape=input_shape)
input2 = Input(shape=input_shape)

syntactic_input = Input(shape=(scaled_features_df.shape[1],))

# Create Embedding layer
emb1 = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input1)
emb2 = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input2)

# Create LSTM layers
lstm1 = LSTM(units=128, return_sequences=True)(emb1)
lstm2 = LSTM(units=128, return_sequences=True)(emb2)

dropout1 = Dropout(0.2)(lstm1)
dropout2 = Dropout(0.2)(lstm2)

# Concatenate the outputs of both LSTM layers
concatenated = concatenate([dropout1, dropout2], axis=-1)

# Reduce dimensionality while preserving important features
pooled = GlobalMaxPooling1D()(concatenated)

# Concatenate pooled features with syntactic features
combined_features = concatenate([pooled, syntactic_input], axis=-1)

# Add a Dense layer for further processing
dense_layer = Dense(32, activation='relu')(combined_features)

# Add an output layer
output = Dense(3, activation='softmax')(dense_layer)

# Create the model
model = Model(inputs=[input1, input2, syntactic_input], outputs=output)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_22 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 input_23 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 embedding_12 (Embedding)    (None, 50, 300)              4146300   ['input_22[0][0]']            
                                                                                                  
 embedding_13 (Embedding)    (None, 50, 300)              4146300   ['input_23[0][0]']            
                                                                                            

In [20]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(
    monitor='val_accuracy', 
    mode='max', 
    min_delta=0.0001,
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    [padded_sequences_train_1, padded_sequences_train_2, scaled_features_df], 
    y=y_train_enc, 
    validation_split=0.1,
    batch_size=128,
    epochs=50,
    verbose=1,
    callbacks=[es]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


In [21]:
model.save('../saved_models/IBM_model_test_features')

INFO:tensorflow:Assets written to: ../saved_models/IBM_model_test_features/assets


INFO:tensorflow:Assets written to: ../saved_models/IBM_model_test_features/assets


In [21]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense, concatenate, Permute, Reshape, Dot, Activation, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Define the P to C attention mechanism
def p_to_c_attention(child, parent):
    # Step 1: Dot product between parent and child sequences to get the attention scores
    attention_scores = Dot(axes=[2, 2])([parent, child])
    
    # Step 2: Apply softmax to get attention distribution
    attention_distribution = Activation('softmax')(attention_scores)
    
    # Step 3: Use the attention distribution to compute weighted sum of parent sequences
    weighted_sum = Dot(axes=[1, 1])([attention_distribution, parent])
    
    return weighted_sum

input_shape = (max_len,)
input1 = Input(shape=input_shape)
input2 = Input(shape=input_shape)

syntactic_input = Input(shape=(scaled_features_df.shape[1],))

# Create Embedding layer
emb1 = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False, mask_zero=True)(input1)
emb2 = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False, mask_zero=True)(input2)

# Create LSTM layers
lstm1 = LSTM(units=128, return_sequences=True)(emb1)
lstm2 = LSTM(units=128, return_sequences=True)(emb2)

# Batch Normalization layers
bn1 = BatchNormalization()(lstm1)
bn2 = BatchNormalization()(lstm2)

dropout1 = Dropout(0.5)(bn1)
dropout2 = Dropout(0.5)(bn2)

# Apply P to C attention
p_to_c_attended = p_to_c_attention(dropout1, dropout2)
c_to_p_attended = p_to_c_attention(dropout2, dropout1)

# Create LSTM layers
lstm3 = LSTM(units=128, return_sequences=True)(p_to_c_attended)
lstm4 = LSTM(units=128, return_sequences=True)(c_to_p_attended)

# Batch Normalization layers
bn3 = BatchNormalization()(lstm3)
bn4 = BatchNormalization()(lstm4)

dropout3 = Dropout(0.5)(bn3)
dropout4 = Dropout(0.5)(bn4)

# Optionally combine attended output with the original child sequence
combined_sequence = concatenate([dropout3, dropout4])

# Continue with model construction
pooled = GlobalMaxPooling1D()(combined_sequence)

# Concatenate pooled features with syntactic features
combined_features = concatenate([pooled, syntactic_input], axis=-1)

# Add a Dense layer for further processing
dense_layer = Dense(32, activation='relu')(combined_features)

# Add an output layer
output = Dense(3, activation='softmax')(dense_layer)

# Create the model
model = Model(inputs=[input1, input2, syntactic_input], outputs=output)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_29 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 input_28 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 embedding_19 (Embedding)    (None, 50, 300)              1742310   ['input_29[0][0]']            
                                                          0                                       
                                                                                                  
 embedding_18 (Embedding)    (None, 50, 300)              1742310   ['input_28[0][0]']      

In [22]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(
    monitor='val_accuracy', 
    mode='max', 
    min_delta=0.01,
    patience=2,
    restore_best_weights=True,
)

def step_decay(epoch):
    initial_lr = 0.001  # Starting learning rate
    drop = 0.9  # Learning rate will be reduced
    epochs_drop = 3.0  # Every 10 epochs
    lr = initial_lr * np.power(drop, np.floor((1+epoch)/epochs_drop))
    return lr

from tensorflow.keras.callbacks import LearningRateScheduler

lr_scheduler = LearningRateScheduler(step_decay)


history = model.fit(
    [padded_sequences_train_1, padded_sequences_train_2, scaled_features_df], 
    y=y_train_enc,
    validation_split=0.1,
    batch_size=128,
    epochs=20,
    verbose=1,
    callbacks=[es]
)

Epoch 1/20

W0000 00:00:1712245779.761999       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2600 num_cores: 12 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 12582912 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
W0000 00:00:1712245779.762089       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2600 num_cores: 12 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } enviro

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


INFO:tensorflow:Assets written to: ../saved_models/IBM_model_reg/assets


INFO:tensorflow:Assets written to: ../saved_models/IBM_model_reg/assets


In [None]:
model.save('../saved_models/IBM_model_reg')