In [1]:
# Load the embeddings
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load our own Word2Vec embedding
own_w2v = KeyedVectors.load_word2vec_format("own_word2vec.model", binary=True)

In [3]:
# Load pre-trained Word2Vec embedding
google_w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [4]:
glove_file = "glove.6B.300d.txt"
glove_embeddings = dict()

with open(glove_file, "r", encoding = "utf-8") as file:
    for line in tqdm(file, desc="Loading embeddings"):
        values = line.split()
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype=np.float32)
            glove_embeddings[word] = vector
        except ValueError:
            print("Could not convert values to float: ")
        glove_embeddings[word] = vector
        
glove_w2v = Word2Vec(
    sentences=None,  # Don't train on any text data
    vector_size=300,  # Set the embedding dimension
    min_count=1,  # Include all words in the dictionary
)
glove_w2v.wv.add_vectors(list(glove_embeddings.keys()), list(glove_embeddings.values()))

Loading embeddings: 400000it [00:24, 16637.98it/s]


In [5]:
# Load the dataset (e.g., IMDB movie reviews)
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=10000)

In [6]:
# Convert the word IDs to text
word_index = tf.keras.datasets.imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
x_train_text = [' '.join([reverse_word_index.get(i - 3, '?') for i in x]) for x in x_train]
x_test_text = [' '.join([reverse_word_index.get(i - 3, '?') for i in x]) for x in x_test]

In [7]:
# Tokenize the text and pad the sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train_text)
x_train_seq = tokenizer.texts_to_sequences(x_train_text)
x_test_seq = tokenizer.texts_to_sequences(x_test_text)
x_train_pad = pad_sequences(x_train_seq, maxlen=100)
x_test_pad = pad_sequences(x_test_seq, maxlen=100)

In [8]:
# Define own word2vec embedding layers
pretrained_own_emb_layer = Embedding(input_dim = own_w2v.vectors.shape[0], output_dim = 300, input_length = 100,
                                     weights = [own_w2v.vectors], trainable = False)
# Define google word2vec embedding layers
pretrained_google_emb_layer = Embedding(input_dim = google_w2v.vectors.shape[0], output_dim = 300, input_length = 100,
                                     weights = [google_w2v.vectors], trainable = False)
# Define glove word2vec embedding layers
pretrained_glove_emb_layer = Embedding(input_dim = glove_w2v.wv.vectors.shape[0], output_dim = 300, input_length = 100,
                                       weights = [glove_w2v.wv.vectors], trainable = False)

### Own Word2vec Model

In [9]:
# Define the neural network architecture
input_layer = Input(shape=(100,))
emb_layer = pretrained_own_emb_layer(input_layer)
lstm_layer = LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)(emb_layer)
dropout_layer = Dropout(rate=0.2)(lstm_layer)
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)

In [10]:
# Define the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 300)          4369500   
                                                                 
 lstm (LSTM)                 (None, 64)                93440     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 4,463,005
Trainable params: 93,505
Non-trainable params: 4,369,500
_________________________________________________________________
None


In [11]:
# Train the model
model.fit(x_train_pad, y_train, batch_size=128, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cd147b5cd0>

In [12]:
# Evaluate the model
y_pred = model.predict(x_test_pad) > 0.5
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.79084


### Google Word2vec Model

In [13]:
# Define the neural network architecture
input_layer = Input(shape=(100,))
emb_layer = pretrained_google_emb_layer(input_layer)
lstm_layer = LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)(emb_layer)
dropout_layer = Dropout(rate=0.2)(lstm_layer)
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)

In [14]:
# Define the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 300)          900000000 
                                                                 
 lstm_1 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 900,093,505
Trainable params: 93,505
Non-trainable params: 900,000,000
_________________________________________________________________
None


In [15]:
# Train the model
model.fit(x_train_pad, y_train, batch_size=128, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cd0efc19a0>

In [16]:
# Evaluate the model
y_pred = model.predict(x_test_pad) > 0.5
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.79248


### GloVe Word2vec Model

In [17]:
# Define the neural network architecture
input_layer = Input(shape=(100,))
emb_layer = pretrained_glove_emb_layer(input_layer)
lstm_layer = LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)(emb_layer)
dropout_layer = Dropout(rate=0.2)(lstm_layer)
output_layer = Dense(units=1, activation='sigmoid')(dropout_layer)

In [18]:
# Define the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 100, 300)          120000000 
                                                                 
 lstm_2 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 120,093,505
Trainable params: 93,505
Non-trainable params: 120,000,000
_________________________________________________________________
None


In [19]:
# Train the model
model.fit(x_train_pad, y_train, batch_size=128, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cd113ccee0>

In [20]:
# Evaluate the model
y_pred = model.predict(x_test_pad) > 0.5
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.81812
