# Part A

In [2]:
#1. Import and analyse the data set.

# Import required libraries and load the dataset
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb

# Load the dataset and keep the 10,000 most frequent words
vocabulary_size = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocabulary_size)

# Check the shape of the data
print(f"Training data shape: {train_data.shape}")
print(f"Training labels shape: {train_labels.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Test labels shape: {test_labels.shape}")

# Print a sample from the dataset
print(f"Sample review (encoded): {train_data[0]}")
print(f"Sample label: {train_labels[0]}")

Training data shape: (25000,)
Training labels shape: (25000,)
Test data shape: (25000,)
Test labels shape: (25000,)
Sample review (encoded): [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 

In [3]:
# 2. Perform relevant sequence adding on the data.
# Perform sequence padding to make all reviews the same length

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to ensure each review is exactly 20 words long
max_length = 20
train_data_padded = pad_sequences(train_data, maxlen=max_length)
test_data_padded = pad_sequences(test_data, maxlen=max_length)

# Check the shape after padding
print(f"Padded training data shape: {train_data_padded.shape}")
print(f"Padded test data shape: {test_data_padded.shape}")

# Print a sample after padding
print(f"Sample review (padded): {train_data_padded[0]}")

Padded training data shape: (25000, 20)
Padded test data shape: (25000, 20)
Sample review (padded): [  65   16   38 1334   88   12   16  283    5   16 4472  113  103   32
   15   16 5345   19  178   32]


In [4]:
#3. Perform following data analysis.
#3A. Print shape of features and labels
#3B. Print value of any one feature and it's label

# Print the shape of the padded features (reviews) and labels (sentiments)
print(f"Padded training data shape: {train_data_padded.shape}")
print(f"Padded training labels shape: {train_labels.shape}")

# Print one sample feature (padded review) and its corresponding label
sample_index = 0  # You can change this index to see different samples
print(f"Sample review (padded): {train_data_padded[sample_index]}")
print(f"Corresponding label: {train_labels[sample_index]}")  # 0 for negative, 1 for positive

Padded training data shape: (25000, 20)
Padded training labels shape: (25000,)
Sample review (padded): [  65   16   38 1334   88   12   16  283    5   16 4472  113  103   32
   15   16 5345   19  178   32]
Corresponding label: 1


In [5]:
#4. Decode the feature value to get original sentence

# Get the word index from the IMDB dataset
word_index = imdb.get_word_index()

# The word index is 1-based, but we need it to be 0-based for decoding
reverse_word_index = {value: key for (key, value) in word_index.items()}

# The IMDB dataset adds a few reserved indices
reverse_word_index = {k+3: v for k, v in reverse_word_index.items()}
reverse_word_index[0] = "<PAD>"  # Padding
reverse_word_index[1] = "<START>"  # Start of a review
reverse_word_index[2] = "<UNK>"  # Unknown word
reverse_word_index[3] = "<UNUSED>"  # Unused word

# Decode a sample review from padded data back to words
def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i, "?") for i in encoded_review])

# Choose any sample review to decode (for example, the first one)
sample_review = train_data_padded[0]
print(f"Decoded review: {decode_review(sample_review)}")

Decoded review: story was so lovely because it was true and was someone's life after all that was shared with us all


In [6]:
#5. Design, train, tune and test a sequential model.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM

# Define the model
model = Sequential()

# Add an embedding layer (vocabulary size is 10,000 and output dimension is 32)
model.add(Embedding(input_dim=vocabulary_size, output_dim=32, input_length=max_length))

# Add an LSTM layer
model.add(LSTM(32))

# Add a dense layer with 1 output (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_data_padded, train_labels, epochs=10, batch_size=512, validation_split=0.2)

# Evaluate the model on test data
test_loss, test_acc = model.evaluate(test_data_padded, test_labels)
print(f"Test Accuracy: {test_acc}")

Epoch 1/10




[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.5840 - loss: 0.6860 - val_accuracy: 0.6766 - val_loss: 0.5953
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7225 - loss: 0.5473 - val_accuracy: 0.7376 - val_loss: 0.5191
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.8038 - loss: 0.4275 - val_accuracy: 0.7502 - val_loss: 0.4974
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.8402 - loss: 0.3670 - val_accuracy: 0.7472 - val_loss: 0.5193
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.8631 - loss: 0.3264 - val_accuracy: 0.7454 - val_loss: 0.5387
Epoch 6/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8834 - loss: 0.2950 - val_accuracy: 0.7394 - val_loss: 0.5825
Epoch 7/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━

In [7]:
#6. Use the designed model to print the prediction on any one sample.

# Select a sample review from the test data (for example, the first test review)
sample_review = test_data_padded[5]

# Predict the sentiment
prediction = model.predict(np.array([sample_review]))

# Print the result
print(f"Predicted sentiment: {'positive' if prediction[0] > 0.5 else 'negative'}")
print(f"Actual sentiment: {'positive' if test_labels[0] == 1 else 'negative'}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494ms/step
Predicted sentiment: negative
Actual sentiment: negative


### Further experimentations to improve the accuracy

In [9]:
# Increase the LSTM units from 32 to 64 for better learning capacity.
# Add a Dropout layer to prevent overfitting with a 50% dropout rate.
# Increase the batch size to 256 and train for 15 epochs for better generalization.
                                                        
from tensorflow.keras.layers import Dropout

# Step: Experimenting with larger LSTM units and adding Dropout

# Define the model with modifications
model = Sequential()

# Add an embedding layer
model.add(Embedding(input_dim=vocabulary_size, output_dim=64, input_length=max_length))

# Add an LSTM layer with more units and dropout
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))

# Add a dense layer with sigmoid for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with the same optimizer and loss function
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model again with new parameters
history = model.fit(train_data_padded, train_labels, epochs=15, batch_size=256, validation_split=0.2)

# Evaluate the model again
test_loss, test_acc = model.evaluate(test_data_padded, test_labels)
print(f"Test Accuracy: {test_acc}")

Epoch 1/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 55ms/step - accuracy: 0.6004 - loss: 0.6579 - val_accuracy: 0.7362 - val_loss: 0.5206
Epoch 2/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - accuracy: 0.8040 - loss: 0.4292 - val_accuracy: 0.7486 - val_loss: 0.4940
Epoch 3/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.8541 - loss: 0.3511 - val_accuracy: 0.7460 - val_loss: 0.5336
Epoch 4/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.8830 - loss: 0.2990 - val_accuracy: 0.7376 - val_loss: 0.5606
Epoch 5/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.9033 - loss: 0.2434 - val_accuracy: 0.7332 - val_loss: 0.6169
Epoch 6/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.9294 - loss: 0.1968 - val_accuracy: 0.7316 - val_loss: 0.6691
Epoch 7/15
[1m79/79[0m [32m━━━━