In [1]:
import numpy as np
import tensorflow as tf
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split


In [4]:

# ------------------------------------------------------
# 1) Sample Toy Dataset (20 sentences, 10 pos, 10 neg)
# ------------------------------------------------------
# Hand-written positive reviews (label 1).
positive_sentences = [
    "i love this movie it is wonderful",
    "this film is amazing and i enjoyed it",
    "great plot and superb acting loved it",
    "highly recommended so good watch it",
    "excellent storyline i really liked it",
    "fantastic film i will watch again",
    "outstanding movie had a nice time",
    "lovely direction and wonderful script",
    "truly awesome film i recommend it",
    "incredible experience definitely positive",
]

# Hand-written negative reviews (label 0).
negative_sentences = [
    "this movie is terrible i hate it",
    "boring film do not watch it again",
    "i dislike the story it was dull",
    "awful direction poor script overall",
    "this was a waste of time and money",
    "very bad film no redeeming features",
    "not recommended the worst movie ever",
    "horrible experience i left midway",
    "completely disappointing never again",
    "i really hated every part of this",
]

# Positives first, then negatives; labels are built in the same order so
# that all_sentences[i] is described by labels[i] (1=pos, 0=neg).
all_sentences = [*positive_sentences, *negative_sentences]
labels = np.array(
    [1 for _ in positive_sentences] + [0 for _ in negative_sentences]
)

# ------------------------------------------------------
# 2) Shuffle & Split the Data
# ------------------------------------------------------
# Seed every RNG in play (Python, NumPy, TensorFlow) so that a fresh
# "Restart & Run All" reproduces the same shuffle, the same splits and the
# same weight initialisation further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Combine text + labels so they stay aligned, shuffle, then separate again
combined = list(zip(all_sentences, labels))
random.shuffle(combined)
shuffled_sentences, shuffled_labels = zip(*combined)

shuffled_sentences = np.array(shuffled_sentences)
shuffled_labels = np.array(shuffled_labels)

# Hold out 20% for testing: 20 sentences -> 16 train+val, 4 test.
# stratify keeps the 50/50 class balance in both parts; with a dataset this
# small an unstratified split can easily produce a badly skewed test set.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    shuffled_sentences, shuffled_labels,
    test_size=0.2,
    random_state=SEED,
    stratify=shuffled_labels,
)

# Carve a validation set out of the remaining 16: 25% of 16 -> 4 val, 12 train.
# Final sizes: 12 train / 4 val / 4 test.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels,
    test_size=0.25,
    random_state=SEED,
    stratify=train_labels,
)

# ------------------------------------------------------
# 3) Tokenize & Pad Sequences
# ------------------------------------------------------
# Tokenizer hyperparameters
vocab_size = 500  # passed to the Tokenizer as num_words
max_length = 10   # maximum words in a sequence

# Build the vocabulary from the training sentences only; words that were not
# seen while fitting are mapped to the "<UNK>" token.
tokenizer = Tokenizer(oov_token="<UNK>", num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

def tokenize_and_pad(texts, tokenizer, max_len):
    """Turn raw texts into a fixed-width array of token ids.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``texts_to_sequences`` (here, the fitted
            Keras ``Tokenizer``).
        max_len: Target length of every row.

    Returns:
        The result of ``pad_sequences``: each sequence is padded at the end
        and, when longer than ``max_len``, truncated at the end.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

# Encode the three splits with the same tokenizer and the same width.
train_padded, val_padded, test_padded = (
    tokenize_and_pad(split_sentences, tokenizer, max_length)
    for split_sentences in (train_sentences, val_sentences, test_sentences)
)

# ------------------------------------------------------
# 4) Build the LSTM Model
# ------------------------------------------------------
embedding_dim = 16
model = Sequential([
    # Declare the input shape up front so the model is built immediately and
    # model.summary() can report output shapes and parameter counts. This
    # replaces Embedding's `input_length` argument, which Keras 3 deprecates.
    tf.keras.Input(shape=(max_length,)),
    # mask_zero=True: id 0 is the padding value, so masking it lets the LSTM
    # skip the trailing padding instead of treating it as real tokens.
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              mask_zero=True),
    LSTM(32),  # You can change the LSTM units to experiment
    # Single sigmoid unit: the output is the predicted probability of class 1.
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ------------------------------------------------------
# 5) Train the Model
# ------------------------------------------------------
epochs = 10

# Fit on the training split and score the validation split after each epoch.
# `history` keeps the returned History object for later inspection.
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=2,  # small batch_size for demonstration
    epochs=epochs,
    validation_data=(val_padded, val_labels),
    verbose=2,  # one summary line per epoch
)

# ------------------------------------------------------
# 6) Evaluate on Test Data
# ------------------------------------------------------
# evaluate() returns the loss followed by the compiled metrics (accuracy).
loss, accuracy = model.evaluate(x=test_padded, y=test_labels, verbose=0)
print(
    f"\nTest Loss: {loss:.4f}",
    f"Test Accuracy: {accuracy:.4f}",
    sep="\n",
)

# ------------------------------------------------------
# 7) Make Predictions
# ------------------------------------------------------
# Unseen sentences, encoded with the same tokenizer and width as the
# training data.
new_texts = [
    "i absolutely loved the film so amazing",
    "it was a terrible movie i regret watching",
    "wonderful storytelling but the acting was dull",
]
new_padded = tokenize_and_pad(new_texts, tokenizer, max_length)

predictions = model.predict(new_padded)
print("\nPredictions on new sentences:")
for text, pred in zip(new_texts, predictions):
    # Each prediction row holds one sigmoid score; above 0.5 means positive.
    if pred[0] > 0.5:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"Text: {text}")
    print(f"Predicted sentiment = {sentiment} (score: {pred[0]:.4f})\n")


Epoch 1/10
6/6 - 4s - 659ms/step - accuracy: 0.5000 - loss: 0.6977 - val_accuracy: 0.7500 - val_loss: 0.6799
Epoch 2/10
6/6 - 1s - 114ms/step - accuracy: 0.5000 - loss: 0.6924 - val_accuracy: 0.7500 - val_loss: 0.6824
Epoch 3/10
6/6 - 0s - 25ms/step - accuracy: 0.5000 - loss: 0.6918 - val_accuracy: 0.7500 - val_loss: 0.6848
Epoch 4/10
6/6 - 0s - 33ms/step - accuracy: 0.5000 - loss: 0.6901 - val_accuracy: 0.7500 - val_loss: 0.6869
Epoch 5/10
6/6 - 0s - 25ms/step - accuracy: 0.5833 - loss: 0.6874 - val_accuracy: 0.7500 - val_loss: 0.6849
Epoch 6/10
6/6 - 0s - 33ms/step - accuracy: 0.6667 - loss: 0.6841 - val_accuracy: 0.7500 - val_loss: 0.6870
Epoch 7/10
6/6 - 0s - 30ms/step - accuracy: 0.9167 - loss: 0.6764 - val_accuracy: 0.7500 - val_loss: 0.6843
Epoch 8/10
6/6 - 0s - 25ms/step - accuracy: 0.8333 - loss: 0.6655 - val_accuracy: 0.7500 - val_loss: 0.6878
Epoch 9/10
6/6 - 0s - 28ms/step - accuracy: 0.9167 - loss: 0.6369 - val_accuracy: 0.7500 - val_loss: 0.6900
Epoch 10/10
6/6 - 0s - 30m