In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import gensim.downloader as api

# Load pre-trained FastText embeddings from Gensim
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

embedding_dim = 300  # FastText embeddings have 300 dimensions



In [None]:
# Load IMDb dataset
vocab_size = 5000  # Limiting the vocabulary size to 20k most frequent words
max_length = 128    # Max length of input sequences

# Load the dataset
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)

# Pad sequences to ensure uniform input length
X_train = pad_sequences(X_train, maxlen=max_length)
X_test = pad_sequences(X_test, maxlen=max_length)

# Create an embedding matrix for words in IMDb's vocabulary using FastText embeddings
embedding_matrix = np.zeros((vocab_size, embedding_dim))

word_index = tf.keras.datasets.imdb.get_word_index()

for word, index in word_index.items():
    if index < vocab_size:
        embedding_vector = None
        try:
            embedding_vector = fasttext_model.get_vector(word)
        except KeyError:
            pass  # Skip words not found in the FastText embeddings
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [37]:
# Create the embedding layer with the pre-trained FastText vectors
embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)  # Set trainable=False to freeze the embeddings

 # Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu',),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [39]:
from sklearn.metrics import f1_score

# Model training
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc}')

y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
f1 = f1_score(y_test, y_pred_classes)
print(f'F1 Score: {f1}')

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9727 - loss: 0.1012 - val_accuracy: 0.8754 - val_loss: 0.3096
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9913 - loss: 0.0497 - val_accuracy: 0.8757 - val_loss: 0.3364
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9962 - loss: 0.0269 - val_accuracy: 0.8743 - val_loss: 0.3661
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9992 - loss: 0.0125 - val_accuracy: 0.8757 - val_loss: 0.3856
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9998 - loss: 0.0061 - val_accuracy: 0.8755 - val_loss: 0.4194
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8765 - loss: 0.3106
Test Accuracy: 0.8753600120544434
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m