In [2]:
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import tensorflow as tf
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split


In [5]:

# ------------------------------------------------------
# 1) Sample Toy Dataset (20 sentences, 10 pos, 10 neg)
# ------------------------------------------------------
positive_sentences = [
    "i love this movie it is wonderful",
    "this film is amazing and i enjoyed it",
    "great plot and superb acting loved it",
    "highly recommended so good watch it",
    "excellent storyline i really liked it",
    "fantastic film i will watch again",
    "outstanding movie had a nice time",
    "lovely direction and wonderful script",
    "truly awesome film i recommend it",
    "incredible experience definitely positive",
    
]

negative_sentences = [
    "this movie is terrible i hate it",
    "boring film do not watch it again",
    "i dislike the story it was dull",
    "awful direction poor script overall",
    "this was a waste of time and money",
    "very bad film no redeeming features",
    "not recommended the worst movie ever",
    "horrible experience i left midway",
    "completely disappointing never again",
    "i really hated every part of this"
]

all_sentences = positive_sentences + negative_sentences
print(all_sentences)
labels = np.array([1]*len(positive_sentences) + [0]*len(negative_sentences))  # 1=pos, 0=neg
print(labels)


['i love this movie it is wonderful', 'this film is amazing and i enjoyed it', 'great plot and superb acting loved it', 'highly recommended so good watch it', 'excellent storyline i really liked it', 'fantastic film i will watch again', 'outstanding movie had a nice time', 'lovely direction and wonderful script', 'truly awesome film i recommend it', 'incredible experience definitely positive', 'this movie is terrible i hate it', 'boring film do not watch it again', 'i dislike the story it was dull', 'awful direction poor script overall', 'this was a waste of time and money', 'very bad film no redeeming features', 'not recommended the worst movie ever', 'horrible experience i left midway', 'completely disappointing never again', 'i really hated every part of this']
[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]


In [10]:

# ------------------------------------------------------
# 2) Shuffle & Split the Data
# ------------------------------------------------------
# Combine text + labels, shuffle, then split
combined = list(zip(all_sentences, labels))

print(combined)
random.shuffle(combined)
print(combined)
shuffled_sentences, shuffled_labels = zip(*combined)
print(shuffled_sentences)
print(shuffled_labels)

shuffled_sentences = np.array(shuffled_sentences)
shuffled_labels = np.array(shuffled_labels)
print(shuffled_sentences)
print(shuffled_labels)


[('i love this movie it is wonderful', np.int64(1)), ('this film is amazing and i enjoyed it', np.int64(1)), ('great plot and superb acting loved it', np.int64(1)), ('highly recommended so good watch it', np.int64(1)), ('excellent storyline i really liked it', np.int64(1)), ('fantastic film i will watch again', np.int64(1)), ('outstanding movie had a nice time', np.int64(1)), ('lovely direction and wonderful script', np.int64(1)), ('truly awesome film i recommend it', np.int64(1)), ('incredible experience definitely positive', np.int64(1)), ('this movie is terrible i hate it', np.int64(0)), ('boring film do not watch it again', np.int64(0)), ('i dislike the story it was dull', np.int64(0)), ('awful direction poor script overall', np.int64(0)), ('this was a waste of time and money', np.int64(0)), ('very bad film no redeeming features', np.int64(0)), ('not recommended the worst movie ever', np.int64(0)), ('horrible experience i left midway', np.int64(0)), ('completely disappointing never

In [17]:

# Let's do an 80/20 split for train vs test
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    shuffled_sentences, shuffled_labels, 
    test_size=0.2, 
    random_state=42
)



In [18]:
train_sentences

array(['excellent storyline i really liked it',
       'boring film do not watch it again',
       'this was a waste of time and money',
       'completely disappointing never again',
       'highly recommended so good watch it',
       'very bad film no redeeming features',
       'outstanding movie had a nice time',
       'great plot and superb acting loved it',
       'horrible experience i left midway',
       'incredible experience definitely positive',
       'truly awesome film i recommend it',
       'lovely direction and wonderful script',
       'i dislike the story it was dull',
       'awful direction poor script overall',
       'this film is amazing and i enjoyed it',
       'fantastic film i will watch again'], dtype='<U41')

In [19]:
test_sentences

array(['i love this movie it is wonderful',
       'this movie is terrible i hate it',
       'i really hated every part of this',
       'not recommended the worst movie ever'], dtype='<U41')

In [20]:
# We'll further split out a validation set from the training data, 75/25
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, 
    test_size=0.25,  # 0.25 of the training => 0.20 * 0.25 = total 5 test, 3 val, 12 train
    random_state=42
)

In [21]:
train_sentences

array(['awful direction poor script overall',
       'lovely direction and wonderful script',
       'horrible experience i left midway',
       'incredible experience definitely positive',
       'this was a waste of time and money',
       'fantastic film i will watch again',
       'highly recommended so good watch it',
       'great plot and superb acting loved it',
       'truly awesome film i recommend it',
       'i dislike the story it was dull',
       'completely disappointing never again',
       'outstanding movie had a nice time'], dtype='<U41')

In [22]:
val_sentences

array(['excellent storyline i really liked it',
       'boring film do not watch it again',
       'very bad film no redeeming features',
       'this film is amazing and i enjoyed it'], dtype='<U41')

In [25]:
for sent in train_sentences:
    if "fantastic" in sent:
        print("Found 'fantastic ' in training:", sent)


Found 'fantastic ' in training: fantastic film i will watch again


In [34]:


# ------------------------------------------------------
# 3) Tokenize & Pad Sequences
# ------------------------------------------------------
# Hyperparameters for tokenizing
vocab_size = 5000
max_length = 10  # maximum words in a sequence

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<UNK>")
tokenizer.fit_on_texts(train_sentences)

print(tokenizer.word_index)
print(tokenizer.texts_to_sequences(["this is a test sentence"]))
# ------------------------------------------------------
# 4) Pad Sequences
# ------------------------------------------------------

def tokenize_and_pad(texts, tokenizer, max_len):
    seqs = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    return padded

train_padded = tokenize_and_pad(train_sentences, tokenizer, max_length)
val_padded   = tokenize_and_pad(val_sentences,   tokenizer, max_length)
test_padded  = tokenize_and_pad(test_sentences,  tokenizer, max_length)


{'<UNK>': 1, 'i': 2, 'it': 3, 'and': 4, 'direction': 5, 'script': 6, 'experience': 7, 'was': 8, 'a': 9, 'time': 10, 'film': 11, 'watch': 12, 'again': 13, 'awful': 14, 'poor': 15, 'overall': 16, 'lovely': 17, 'wonderful': 18, 'horrible': 19, 'left': 20, 'midway': 21, 'incredible': 22, 'definitely': 23, 'positive': 24, 'this': 25, 'waste': 26, 'of': 27, 'money': 28, 'fantastic': 29, 'will': 30, 'highly': 31, 'recommended': 32, 'so': 33, 'good': 34, 'great': 35, 'plot': 36, 'superb': 37, 'acting': 38, 'loved': 39, 'truly': 40, 'awesome': 41, 'recommend': 42, 'dislike': 43, 'the': 44, 'story': 45, 'dull': 46, 'completely': 47, 'disappointing': 48, 'never': 49, 'outstanding': 50, 'movie': 51, 'had': 52, 'nice': 53}
[[25, 1, 9, 1, 1]]


In [35]:

# ------------------------------------------------------
# 4) Build the LSTM Model
# ------------------------------------------------------
embedding_dim = 16
model = Sequential([
    Embedding(input_dim=vocab_size, 
              output_dim=embedding_dim, 
              input_length=max_length),
    LSTM(64),  # You can change the LSTM units to experiment
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [36]:

# ------------------------------------------------------
# 5) Train the Model
# ------------------------------------------------------
epochs = 30
history = model.fit(
    train_padded, 
    train_labels,
    validation_data=(val_padded, val_labels),
    epochs=epochs,
    batch_size=2,  # small batch_size for demonstration
    verbose=2
)


Epoch 1/30
6/6 - 8s - 1s/step - accuracy: 0.4167 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6937
Epoch 2/30
6/6 - 0s - 36ms/step - accuracy: 0.5833 - loss: 0.6913 - val_accuracy: 0.5000 - val_loss: 0.6945
Epoch 3/30
6/6 - 0s - 56ms/step - accuracy: 0.5833 - loss: 0.6866 - val_accuracy: 0.5000 - val_loss: 0.6951
Epoch 4/30
6/6 - 0s - 30ms/step - accuracy: 0.5833 - loss: 0.6849 - val_accuracy: 0.5000 - val_loss: 0.6968
Epoch 5/30
6/6 - 0s - 34ms/step - accuracy: 0.5833 - loss: 0.6778 - val_accuracy: 0.5000 - val_loss: 0.6995
Epoch 6/30
6/6 - 0s - 40ms/step - accuracy: 0.5833 - loss: 0.6687 - val_accuracy: 0.5000 - val_loss: 0.7031
Epoch 7/30
6/6 - 0s - 42ms/step - accuracy: 0.5833 - loss: 0.6566 - val_accuracy: 0.5000 - val_loss: 0.7160
Epoch 8/30
6/6 - 0s - 38ms/step - accuracy: 0.5833 - loss: 0.6279 - val_accuracy: 0.5000 - val_loss: 0.7520
Epoch 9/30
6/6 - 0s - 38ms/step - accuracy: 0.5833 - loss: 0.5477 - val_accuracy: 0.5000 - val_loss: 0.8638
Epoch 10/30
6/6 - 0s - 35ms/st

In [37]:

# ------------------------------------------------------
# 6) Evaluate on Test Data
# ------------------------------------------------------
loss, accuracy = model.evaluate(test_padded, test_labels, verbose=0)
print(f"\nTest Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")



Test Loss: 1.5233
Test Accuracy: 0.5000


In [38]:

# ------------------------------------------------------
# 7) Make Predictions
# ------------------------------------------------------
new_texts = [
    "i absolutely loved the film so amazing",
    "it was a terrible movie i regret watching",
    "wonderful storytelling but the acting was dull"
]
new_padded = tokenize_and_pad(new_texts, tokenizer, max_length)

predictions = model.predict(new_padded)
print("\nPredictions on new sentences:")
for text, pred in zip(new_texts, predictions):
    sentiment = "Positive" if pred[0] > 0.5 else "Negative"
    print(f"Text: {text}")
    print(f"Predicted sentiment = {sentiment} (score: {pred[0]:.4f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step

Predictions on new sentences:
Text: i absolutely loved the film so amazing
Predicted sentiment = Negative (score: 0.0961)

Text: it was a terrible movie i regret watching
Predicted sentiment = Negative (score: 0.0042)

Text: wonderful storytelling but the acting was dull
Predicted sentiment = Positive (score: 0.9984)



In [25]:
print(tokenizer.word_index)


{'<UNK>': 1, 'i': 2, 'it': 3, 'this': 4, 'film': 5, 'and': 6, 'recommended': 7, 'the': 8, 'movie': 9, 'watch': 10, 'wonderful': 11, 'really': 12, 'is': 13, 'was': 14, 'of': 15, 'not': 16, 'worst': 17, 'ever': 18, 'truly': 19, 'awesome': 20, 'recommend': 21, 'highly': 22, 'so': 23, 'good': 24, 'fantastic': 25, 'will': 26, 'again': 27, 'lovely': 28, 'direction': 29, 'script': 30, 'excellent': 31, 'storyline': 32, 'liked': 33, 'love': 34, 'a': 35, 'waste': 36, 'time': 37, 'money': 38, 'dislike': 39, 'story': 40, 'dull': 41, 'horrible': 42, 'experience': 43, 'left': 44, 'midway': 45, 'hated': 46, 'every': 47, 'part': 48, 'amazing': 49, 'enjoyed': 50}
