# In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
import tensorflow as tf

# Quiet TensorFlow's v1 logger: only messages of ERROR severity or higher are
# emitted, which hides the deprecation warnings it would otherwise print.
_tf_logging = tf.compat.v1.logging
_tf_logging.set_verbosity(_tf_logging.ERROR)

# Function to load data
def load_data(directory):
    """Read labelled texts from the ``neg`` and ``pos`` folders of *directory*.

    Every entry of ``directory/neg`` and ``directory/pos`` is opened as a
    UTF-8 text file and read in full.

    Args:
        directory: Path of the folder containing the ``neg`` and ``pos``
            sub-directories.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the file contents
        and the matching integer labels (0 for ``neg``, 1 for ``pos``). All
        ``neg`` samples come first; within each label the files are ordered
        by filename.

    Raises:
        OSError: If a label directory cannot be listed or an entry cannot be
            opened as a file.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []

    # enumerate() yields 0 for 'neg' and 1 for 'pos', i.e. the label encoding.
    for label_value, label in enumerate(('neg', 'pos')):
        label_dir = os.path.join(directory, label)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and any seeded split done on it) reproducible.
        for filename in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label_value)

    return texts, labels

# Load data
train_texts, train_labels = load_data('aclimdb_v1/aclimdb/train')

# Tokenization and padding
max_words = 5000  # vocabulary cap handed to Tokenizer(num_words=...)
maxlen = 200      # length every sequence is padded/truncated to

# Splitting the data into training and validation sets.
# The split is done on the raw texts, before the tokenizer is fitted, so the
# validation reviews play no part in choosing the vocabulary.
fit_texts, val_texts, y_train, y_val = train_test_split(
    train_texts, np.array(train_labels), test_size=0.2, random_state=42
)

# Fit the vocabulary on the training portion only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(fit_texts)

# Convert both portions to fixed-length integer sequences with the same
# tokenizer.
x_train = pad_sequences(tokenizer.texts_to_sequences(fit_texts), maxlen=maxlen)
x_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=maxlen)

# Hyperparameters
embedding_dim = 64   # size of each token's embedding vector
hidden_units = 64    # width of the hidden Dense layer
dropout_rate = 0.4   # rate passed to the Dropout layer
l2_lambda = 0.01     # L2 regularization parameter

# Model architecture: token ids -> embeddings -> flattened vector ->
# L2-regularized ReLU hidden layer -> dropout -> single sigmoid output.
model = Sequential()
# NOTE(review): newer Keras releases appear to deprecate `input_length` --
# confirm against the installed version before changing it.
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(hidden_units, activation='relu', kernel_regularizer=regularizers.l2(l2_lambda)))
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(l2_lambda)))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary. summary() prints the table itself and returns
# None, so it must not be wrapped in print() (that emits a stray "None").
model.summary()

# Training: fit for a fixed number of epochs, tracking the validation split
# after each one.
epochs = 6
batch_size = 32
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
)

# Evaluating: validation data (loss first, then the compiled accuracy metric)
val_loss, val_accuracy = model.evaluate(
    x=x_val,
    y=y_val,
    batch_size=batch_size,
)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

# Evaluating: test data
# The test texts go through the tokenizer fitted earlier in the script and
# are padded to the same length as the training sequences.
test_texts, test_labels = load_data('aclimdb_v1/aclimdb/test')
y_test = np.array(test_labels)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

test_loss, test_accuracy = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# Plotting accuracy, then loss: one figure per metric, each showing the
# per-epoch training curve next to the validation curve.
for metric_key, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label=f'Training {axis_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {axis_label}')
    plt.xlabel('Epochs')
    plt.ylabel(axis_label)
    plt.title(f'Training and Validation {axis_label}')
    plt.legend()
    plt.show()