In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import gensim.downloader as api

# Load pre-trained FastText embeddings from Gensim
fasttext_model = api.load("word2vec-google-news-300")

embedding_dim = 300  # FastText embeddings have 300 dimensions



In [3]:
# Load IMDb dataset
vocab_size = 10000  # Limiting the vocabulary size to 20k most frequent words
max_length = 200    # Max length of input sequences

# Load the dataset
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)

# Pad sequences to ensure uniform input length
X_train = pad_sequences(X_train, maxlen=max_length)
X_test = pad_sequences(X_test, maxlen=max_length)

# Create an embedding matrix for words in IMDb's vocabulary using FastText embeddings
embedding_matrix = np.zeros((vocab_size, embedding_dim))

word_index = tf.keras.datasets.imdb.get_word_index()

for word, index in word_index.items():
    if index < vocab_size:
        embedding_vector = None
        try:
            embedding_vector = fasttext_model.get_vector(word)
        except KeyError:
            pass  # Skip words not found in the FastText embeddings
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Create the embedding layer with the pre-trained FastText vectors
embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)  # Set trainable=False to freeze the embeddings

 # Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu',),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [5]:
from sklearn.metrics import f1_score

# Model training
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc}')

y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
f1 = f1_score(y_test, y_pred_classes)
print(f'F1 Score: {f1}')

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 260ms/step - accuracy: 0.5621 - loss: 0.6771 - val_accuracy: 0.7824 - val_loss: 0.4878
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 312ms/step - accuracy: 0.8127 - loss: 0.4333 - val_accuracy: 0.8593 - val_loss: 0.3320
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 258ms/step - accuracy: 0.8872 - loss: 0.2826 - val_accuracy: 0.8812 - val_loss: 0.2827
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 260ms/step - accuracy: 0.9291 - loss: 0.1986 - val_accuracy: 0.8888 - val_loss: 0.2661
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 240ms/step - accuracy: 0.9561 - loss: 0.1372 - val_accuracy: 0.8919 - val_loss: 0.2646
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 209ms/step - accuracy: 0.9772 - loss: 0.0869 - val_accuracy: 0.8926 - val_loss: 0.2731
Epoch