In [14]:
import numpy as np
import tensorflow as tf
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split


In [37]:

# ------------------------------------------------------
# 1) Sample Toy Dataset (20 sentences, 10 pos, 10 neg)
# ------------------------------------------------------
positive_sentences = [
    "i love this movie it is wonderful",
    "this film is amazing and i enjoyed it",
    "great plot and superb acting loved it",
    "highly recommended so good watch it",
    "excellent storyline i really liked it",
    "fantastic film i will watch again",
    "outstanding movie had a nice time",
    "lovely direction and wonderful script",
    "truly awesome film i recommend it",
    "incredible experience definitely positive"
]

negative_sentences = [
    "this movie is terrible i hate it",
    "boring film do not watch it again",
    "i dislike the story it was dull",
    "awful direction poor script overall",
    "this was a waste of time and money",
    "very bad film no redeeming features",
    "not recommended the worst movie ever",
    "horrible experience i left midway",
    "completely disappointing never again",
    "i really hated every part of this"
]

all_sentences = positive_sentences + negative_sentences
labels = np.array([1]*len(positive_sentences) + [0]*len(negative_sentences))  # 1=pos, 0=neg


In [38]:

# ------------------------------------------------------
# 2) Shuffle & Split the Data
# ------------------------------------------------------
# Combine text + labels, shuffle, then split
combined = list(zip(all_sentences, labels))
random.shuffle(combined)
shuffled_sentences, shuffled_labels = zip(*combined)

shuffled_sentences = np.array(shuffled_sentences)
shuffled_labels = np.array(shuffled_labels)


In [39]:

# Let's do an 80/20 split for train vs test
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    shuffled_sentences, shuffled_labels, 
    test_size=0.2, 
    random_state=42
)

# We'll further split out a validation set from the training data, 75/25
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, 
    test_size=0.25,  # 0.25 of the training => 0.20 * 0.25 = total 5 test, 3 val, 12 train
    random_state=42
)

In [31]:
for sent in train_sentences:
    if "terrible" in sent:
        print("Found 'terrible' in training:", sent)


Found 'terrible' in training: this movie is terrible i hate it


In [32]:


# ------------------------------------------------------
# 3) Tokenize & Pad Sequences
# ------------------------------------------------------
# Hyperparameters for tokenizing
vocab_size = 2000
max_length = 10  # maximum words in a sequence

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
tokenizer.fit_on_texts(train_sentences)

def tokenize_and_pad(texts, tokenizer, max_len):
    seqs = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    return padded

train_padded = tokenize_and_pad(train_sentences, tokenizer, max_length)
val_padded   = tokenize_and_pad(val_sentences,   tokenizer, max_length)
test_padded  = tokenize_and_pad(test_sentences,  tokenizer, max_length)


In [33]:

# ------------------------------------------------------
# 4) Build the LSTM Model
# ------------------------------------------------------
embedding_dim = 16
model = Sequential([
    Embedding(input_dim=vocab_size, 
              output_dim=embedding_dim, 
              input_length=max_length),
    LSTM(64),  # You can change the LSTM units to experiment
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [34]:

# ------------------------------------------------------
# 5) Train the Model
# ------------------------------------------------------
epochs = 30
history = model.fit(
    train_padded, 
    train_labels,
    validation_data=(val_padded, val_labels),
    epochs=epochs,
    batch_size=2,  # small batch_size for demonstration
    verbose=2
)


Epoch 1/30
6/6 - 4s - 605ms/step - accuracy: 0.6667 - loss: 0.6886 - val_accuracy: 0.2500 - val_loss: 0.7119
Epoch 2/30
6/6 - 0s - 25ms/step - accuracy: 0.6667 - loss: 0.6807 - val_accuracy: 0.2500 - val_loss: 0.7257
Epoch 3/30
6/6 - 0s - 26ms/step - accuracy: 0.6667 - loss: 0.6686 - val_accuracy: 0.2500 - val_loss: 0.7370
Epoch 4/30
6/6 - 0s - 26ms/step - accuracy: 0.6667 - loss: 0.6627 - val_accuracy: 0.2500 - val_loss: 0.7586
Epoch 5/30
6/6 - 0s - 26ms/step - accuracy: 0.6667 - loss: 0.6430 - val_accuracy: 0.2500 - val_loss: 0.7744
Epoch 6/30
6/6 - 0s - 26ms/step - accuracy: 0.6667 - loss: 0.6399 - val_accuracy: 0.2500 - val_loss: 0.8217
Epoch 7/30
6/6 - 0s - 32ms/step - accuracy: 0.6667 - loss: 0.6064 - val_accuracy: 0.2500 - val_loss: 0.8692
Epoch 8/30
6/6 - 0s - 30ms/step - accuracy: 0.6667 - loss: 0.5706 - val_accuracy: 0.2500 - val_loss: 0.9110
Epoch 9/30
6/6 - 0s - 25ms/step - accuracy: 0.6667 - loss: 0.5493 - val_accuracy: 0.2500 - val_loss: 1.1066
Epoch 10/30
6/6 - 0s - 31ms

In [35]:

# ------------------------------------------------------
# 6) Evaluate on Test Data
# ------------------------------------------------------
loss, accuracy = model.evaluate(test_padded, test_labels, verbose=0)
print(f"\nTest Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")



Test Loss: 2.7199
Test Accuracy: 0.5000


In [36]:

# ------------------------------------------------------
# 7) Make Predictions
# ------------------------------------------------------
new_texts = [
    "i absolutely loved the film so amazing",
    "it was a terrible movie i regret watching",
    "wonderful storytelling but the acting was dull"
]
new_padded = tokenize_and_pad(new_texts, tokenizer, max_length)

predictions = model.predict(new_padded)
print("\nPredictions on new sentences:")
for text, pred in zip(new_texts, predictions):
    sentiment = "Positive" if pred[0] > 0.5 else "Negative"
    print(f"Text: {text}")
    print(f"Predicted sentiment = {sentiment} (score: {pred[0]:.4f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step

Predictions on new sentences:
Text: i absolutely loved the film so amazing
Predicted sentiment = Positive (score: 0.9874)

Text: it was a terrible movie i regret watching
Predicted sentiment = Negative (score: 0.0121)

Text: wonderful storytelling but the acting was dull
Predicted sentiment = Positive (score: 0.9974)



In [25]:
print(tokenizer.word_index)


{'<UNK>': 1, 'i': 2, 'it': 3, 'this': 4, 'film': 5, 'and': 6, 'recommended': 7, 'the': 8, 'movie': 9, 'watch': 10, 'wonderful': 11, 'really': 12, 'is': 13, 'was': 14, 'of': 15, 'not': 16, 'worst': 17, 'ever': 18, 'truly': 19, 'awesome': 20, 'recommend': 21, 'highly': 22, 'so': 23, 'good': 24, 'fantastic': 25, 'will': 26, 'again': 27, 'lovely': 28, 'direction': 29, 'script': 30, 'excellent': 31, 'storyline': 32, 'liked': 33, 'love': 34, 'a': 35, 'waste': 36, 'time': 37, 'money': 38, 'dislike': 39, 'story': 40, 'dull': 41, 'horrible': 42, 'experience': 43, 'left': 44, 'midway': 45, 'hated': 46, 'every': 47, 'part': 48, 'amazing': 49, 'enjoyed': 50}
