In [1]:
import numpy as np
import tensorflow as tf

2024-02-08 15:40:14.409495: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Restore the previously trained relationship-builder network from disk
model_dir = 'saved_models/relationshipBuilder_model'
new_model = tf.keras.models.load_model(model_dir)

# Print a layer-by-layer overview of the restored model
new_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_6 (Embedding)     (None, 50, 100)              488500    ['input_7[0][0]']             
                                                                                                  
 embedding_7 (Embedding)     (None, 50, 100)              488500    ['input_8[0][0]']             
                                                                                            

In [2]:
# Import UKIP dataset
import pandas as pd
from IPython.display import display
from sklearn.utils import shuffle

# Read the sheet without a header row, so columns are addressed by integer position
df = pd.read_excel(r"../../datasets/ACMToIT2017_dataset.xlsx", sheet_name="Sheet1", header=None)

# Shuffle the rows with a fixed seed: an unseeded shuffle gives a different row
# order on every run, which makes any seeded split of the data non-reproducible
df = shuffle(df, random_state=42).reset_index(drop=True)

# Split features and labels: columns 3 and 4 are used as the pair of texts and
# column 5 as the label. The columns are relabelled to start from 0 so that
# the two texts are addressed as X[0] / X[1] and the label as y[0].
X = df[[3, 4]].rename(columns={3: 0, 4: 1})
y = df[[5]].rename(columns={5: 0})

display(X)
display(y)

Unnamed: 0,0,1
0,carbon is not the only method of dating used ...,to the currently voting yes that raises s...
1,we also investigate axis of evil aoe type anom...,while i am quite happy with pwning of the thr...
2,being an atheist in a catholic school i can t...,you have to consider the alternative explanat...
3,whats a darwinist can someone who says that th...,luskin quotesmay not the principle of uniformi...
4,should visited and joined be the same,no joined is the date a user registered to the...
...,...,...
4053,Only an idiot would think a UKIP win would res...,Clearly I would have preferred to get more vot...
4054,If UKIP is really the best our country can do ...,UKIP better represents the views of Conservati...
4055,"The country faces more than two big issues, wh...","UKIP MEP candidate Diane James, who came close..."
4056,I suspect when people do start to focus and th...,Labour has declined to say how it expects to d...


Unnamed: 0,0
0,s
1,s
2,s
3,s
4,a
...,...
4053,n
4054,a
4055,n
4056,n


In [3]:
# Hold out 30% of the pairs for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Pull each side of the sentence pair out of the frames as NumPy arrays
sentences_train_1, sentences_train_2 = (X_train[col].to_numpy() for col in (0, 1))
sentences_test_1, sentences_test_2 = (X_test[col].to_numpy() for col in (0, 1))

# Labels as NumPy arrays as well
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Prepare the target: learn the label -> integer mapping from every label in the
# dataset so that the train and test splits share a single encoding
le = LabelEncoder()
le.fit(np.ravel(y))
num_classes = len(le.classes_)

y_train_enc = le.transform(np.ravel(y_train))
y_test_enc = le.transform(np.ravel(y_test))

# Convert integers to dummy variables (i.e. one hot encoded).
# num_classes is passed explicitly: without it to_categorical infers the width
# from the largest label present in the array it is given, so a split that
# happened to lack the highest-numbered class would get a narrower matrix.
y_train_enc = to_categorical(y_train_enc, num_classes=num_classes)
y_test_enc = to_categorical(y_test_enc, num_classes=num_classes)

print(y_train_enc, y_test_enc)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]] [[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Gather every text from both feature columns into one flat array; the
# tokenizer vocabulary is built from all of them
texts = np.concatenate([X[0], X[1]])

# NOTE(review): the word ids assigned here must match the ids the loaded model
# was trained with -- confirm this fit reproduces the training-time vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Map each sentence to its list of integer word ids (training pairs, then test pairs)
sequences_train_1, sequences_train_2 = (
    tokenizer.texts_to_sequences(batch)
    for batch in (sentences_train_1, sentences_train_2)
)
sequences_test_1, sequences_test_2 = (
    tokenizer.texts_to_sequences(batch)
    for batch in (sentences_test_1, sentences_test_2)
)

# Bring every sequence to the same length, padding at the end of short ones
max_len = 50
print(max_len)
padded_sequences_train_1, padded_sequences_train_2 = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (sequences_train_1, sequences_train_2)
)
padded_sequences_test_1, padded_sequences_test_2 = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (sequences_test_1, sequences_test_2)
)

print(np.shape(padded_sequences_test_2))

50
(1218, 50)


In [6]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
# Read the pre-trained GloVe vectors into a {word: vector} lookup
# (the GloVe file has to be downloaded separately)
glove_embeddings_index = {}
with open('glove/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is split on whitespace: first field is the word,
        # the remaining fields are its coefficients
        values = line.split()
        glove_embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Zero-initialised matrix with one row per tokenizer id plus one extra row
# (presumably for id 0, which the tokenizer's word_index does not assign -- TODO confirm)
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the GloVe vector of every word that has one; rows for words missing
# from GloVe are left as zeros
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [14]:
# Score the restored model on the held-out pairs: both padded inputs go in
# together, compared against the one-hot encoded test labels
test_inputs = [padded_sequences_test_1, padded_sequences_test_2]
results = new_model.evaluate(test_inputs, y_test_enc, batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.12855786085128784, 0.751098096370697]
