In [1]:
import numpy as np
import tensorflow as tf

2024-02-06 14:27:56.566736: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Restore the previously trained relationship model from its saved-model directory.
MODEL_DIR = 'saved_models/relationshipBuilder_model'
new_model = tf.keras.models.load_model(MODEL_DIR)

# Print the layer-by-layer summary to confirm what was loaded
# (input lengths and embedding sizes must match the preprocessing below).
new_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_6 (Embedding)     (None, 50, 100)              488500    ['input_7[0][0]']             
                                                                                                  
 embedding_7 (Embedding)     (None, 50, 100)              488500    ['input_8[0][0]']             
                                                                                            

In [4]:
# Import UKIP dataset
import pandas as pd
from IPython.display import display
from sklearn.utils import shuffle

# Read the raw UKIP sheet; it has no header row, so columns are numbered 0..N.
df = pd.read_excel(r"../../datasets/UKIP.xlsx", sheet_name="Feuil1", header=None)

# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same row order (and therefore the same train/test split and metrics).
# The seed matches the one passed to train_test_split further down.
df = shuffle(df, random_state=42).reset_index(drop=True)

# Split features and labels.
# Columns 3 and 4 hold the two sentences of each pair, column 5 the relation
# label; they are renumbered from 0 so later cells can use X[0], X[1] and y[0].
X = df[[3, 4]].rename(columns={3: 0, 4: 1})
y = df[[5]].rename(columns={5: 0})

display(X)
display(y)

Unnamed: 0,0,1
0,If anyone seriously thinks that UKIP will offe...,He claimed the document was nothing more a col...
1,The reason why UKIP has gained so many votes i...,If I was confident that Labour were going to d...
2,UKIP are not rabid racist and not another inca...,He also suggested that his party might get its...
3,"The problem with popularism but no substance, ...","It is fielding 1,217 candidates this time - a ..."
4,Beyond leaving the EU and migration what ukip ...,Mr Hamilton will sit on UKIP's national execut...
...,...,...
2269,Despite their best efforts at pretending other...,He made the comment during a debate on EU prop...
2270,"The problem with popularism but no substance, ...",Conservative party vice chairman Michael Fabri...
2271,If the other parties do tackle the issues of E...,"Timo Soini, leader of the Eurosceptic True Fin..."
2272,"Even a UKIP European candidate, Janice Atkinso...","Speaking on The World At One Nigel Farage, lea..."


Unnamed: 0,0
0,n
1,n
2,s
3,n
4,n
...,...
2269,n
2270,n
2271,n
2272,a


In [5]:
# Split train and test data
from sklearn.model_selection import train_test_split
# Hold out 30% of the pairs for testing; the seed keeps the split stable
# for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Pull each side of the sentence pairs out as a plain NumPy array
# (column 0 = first sentence, column 1 = second sentence).
sentences_train_1, sentences_train_2 = (X_train[col].to_numpy() for col in (0, 1))
sentences_test_1, sentences_test_2 = (X_test[col].to_numpy() for col in (0, 1))

# Labels as NumPy arrays as well.
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

(1591, 2)
["In contrast, the Liberal Democrat's Bill Newton Dunn quit the Tories in protest at its increasing hostility to Europe. And now?"
 'The group suggest that within five years UKIP will have more members than the Tories at the current rate of attrition.'
 'Afterwards, Mr Nattrass began discussions with the English Democrats, who campaign for the establishment of an English parliament and immediate withdrawal from the EU, about a possible move.'
 ...
 'I think of the volunteers in my own constituency, they are not just my friends and my supporters, I feel I am one of them.'
 'In fact, Downing Street have already briefed that David Cameron was actually too busy running the country he said.'
 'Farage told the BBC the Conservatives were virtually indistinguishable from Labour and the Lib Dems on many issues and that UKIP was offering a real alternative.'] (1591,)


In [6]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Prepare the target: fit the encoder on ALL labels so that the train and
# test splits share one label -> integer mapping.
le = LabelEncoder()
le.fit(np.ravel(y))
y_train_idx = le.transform(np.ravel(y_train))
y_test_idx = le.transform(np.ravel(y_test))

# Convert the integer class ids to one-hot rows.
# num_classes is passed explicitly: without it to_categorical sizes the output
# from the largest id present in the array, so a split that happens to lack the
# highest class would yield a narrower matrix than the model's output layer.
num_classes = len(le.classes_)
y_train_enc = to_categorical(y_train_idx, num_classes=num_classes)
y_test_enc = to_categorical(y_test_idx, num_classes=num_classes)

print(y_train_enc, y_test_enc)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] [[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build one flat corpus out of both sentence columns so the vocabulary
# covers every sentence in the dataset.
texts = np.concatenate((X[0].to_numpy(), X[1].to_numpy()))

# Fit the word -> integer index on the full corpus.
# NOTE(review): the tokenizer is re-fit here instead of being loaded from the
# training run; the indices must match the ones the saved model was trained
# with for its embeddings to line up -- TODO confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode every sentence array (train first, then test) as integer sequences.
sequences_train_1, sequences_train_2, sequences_test_1, sequences_test_2 = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (sentences_train_1, sentences_train_2, sentences_test_1, sentences_test_2)
)

# Pad/truncate at the end of each sequence so all rows share one length.
# 50 matches the (None, 50) input shape reported by the model summary.
max_len = 50
print(max_len)
(
    padded_sequences_train_1,
    padded_sequences_train_2,
    padded_sequences_test_1,
    padded_sequences_test_2,
) = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (sequences_train_1, sequences_train_2, sequences_test_1, sequences_test_2)
)

print(np.shape(padded_sequences_test_2))

['If anyone seriously thinks that UKIP will offer some kind of answer to our current problems then I think that they are going to be sadly disappointed.'
 'The reason why UKIP has gained so many votes is because the main parties refuse to deal with immigration and the EU.'
 'UKIP are not rabid racist and not another incarnation of the BNP.' ...
 "Timo Soini, leader of the Eurosceptic True Finns Party, which gained 19% of the vote at Finland's general election in April, will also address delegates."
 'Speaking on The World At One Nigel Farage, leader of the UKIP party, told presenter Shaun Ley that he believes that UKIP are offering a real alternative, and leaving the EU is at the heart of it.'
 'The penny is beginning to drop with the British people and British businesses that we are no longer a self-governing nation, and UKIP are here to remedy this.']
50
(683, 50)


In [13]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
# Parse the pre-trained GloVe vectors (the GloVe file must be downloaded first).
# Each line is split on whitespace: the first token is the word, the remaining
# tokens are its coefficients.
glove_embeddings_index = {}
with open('glove/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        glove_embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Build the embedding matrix for the words known to our tokenizer.
# One row per tokenizer index, plus one extra row so index 0 stays all-zero.
# NOTE(review): embedding_matrix is not used by any later cell visible here,
# since the model is loaded pre-trained -- confirm whether this cell is still needed.
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in tokenizer.word_index.items():
    vector = glove_embeddings_index.get(token)
    if vector is None:
        # Word has no GloVe entry: leave its row as zeros.
        continue
    embedding_matrix[row] = vector

In [14]:
# Score the loaded model on the held-out pairs; the two padded arrays feed the
# model's two inputs, in order.
test_inputs = [padded_sequences_test_1, padded_sequences_test_2]
results = new_model.evaluate(test_inputs, y_test_enc, batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.12855786085128784, 0.751098096370697]


In [15]:
# Predict one row of class scores per test sentence pair.
test = new_model.predict([padded_sequences_test_1, padded_sequences_test_2])

# Decode the highest-scoring class of every row back to its original label.
# The fitted LabelEncoder is reused instead of a hand-written id -> label dict,
# so the decoding cannot drift from the encoding applied to y_test_enc, and
# the decoded labels are kept (the previous loop computed them one by one and
# threw each one away).
# NOTE(review): this assumes the saved model was trained with the same label
# order as `le` (the removed hard-coded map was 0 -> 'a', 1 -> 'n', 2 -> 's').
predicted_labels = le.inverse_transform(np.argmax(test, axis=1))

# Show a small sample of the decoded predictions.
predicted_labels[:10]
    



In [None]:
#export_data = df[[3, 4, 5]].T.reset_index(drop=True).T
#export_data.to_csv('UKIP_out.csv')

In [None]:
# Leftover scratch cell: prints an empty line and has no effect on the analysis.
print()