# steps

- Preprocess the text data by tokenizing and padding the sequences.
- Use a pre-trained model for sentiment analysis, or train a model from scratch using TensorFlow.
- Train the model and monitor its performance using accuracy, precision, recall, and F1-score.
- Evaluate and visualize the model's performance using a confusion matrix.

In [None]:
# Install the TFDS loader that provides the IMDB reviews dataset used below.
# NOTE(review): `transformers` and `tensorflow` are imported in later cells but
# are not installed here — confirm they are already present in the environment.
%pip install tensorflow-datasets

# Using a pre-trained model

In [None]:
from transformers import pipeline

# Pin the checkpoint explicitly. When no `model` is given, `pipeline` picks a
# library-chosen default for the task and warns about it, so the model behind
# these results could change between transformers releases.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analysis = pipeline("sentiment-analysis", model=SENTIMENT_MODEL)

texts = ["i loved this movie", "this movie was terrible", "this movie was the worst movie i have ever seen"]

# One result dict (with 'label' and 'score' keys) is read per input text below.
results = sentiment_analysis(texts)

for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}")
    print(f"Confidence: {result['score']}")
    print("-------------------------")

# Using a custom model

In [26]:
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping



In [5]:
# Fetch the IMDB reviews train/test splits from TensorFlow Datasets as
# (text, label) pairs, together with the dataset's metadata object.
imdb_splits, info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    with_info=True,
    as_supervised=True,
)
train_data, test_data = imdb_splits

2025-01-09 13:14:37.418887: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/hi/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


  from .autonotebook import tqdm as notebook_tqdm
Dl Size...: 100%|██████████| 80/80 [01:32<00:00,  1.15s/ MiB]rl]
Dl Completed...: 100%|██████████| 1/1 [01:32<00:00, 92.28s/ url]
                                                                        

[1mDataset imdb_reviews downloaded and prepared to /Users/hi/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [12]:
# Function to preprocess the text
def preprocess_text(text, label):
    """Lower-case a review and pass its label through unchanged.

    Takes one ``(text, label)`` pair and returns the pair with
    ``tf.strings.lower`` applied to the text; the label is not modified.
    """
    return tf.strings.lower(text), label

# Lower-case every review in both splits.
train_data, test_data = train_data.map(preprocess_text), test_data.map(preprocess_text)

# Materialise the training split once, then separate reviews from labels.
# Reviews arrive as bytes and are decoded to Python strings.
train_examples = list(tfds.as_numpy(train_data))
train_texts = [review.decode('utf-8') for review, _ in train_examples]
train_labels = [sentiment for _, sentiment in train_examples]

2025-01-09 13:24:50.206555: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [28]:
# Tokenize the texts

# Cap the vocabulary at num_words=1000 and route every other word to the
# out-of-vocabulary token. The token is spelled with the letter O ("<OOV>"),
# not zeros.
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)

# Pad the sequences so every review becomes a fixed-length row of 100 token ids

padded_sequences = pad_sequences(sequences, maxlen=100)

# Guard against features and labels drifting out of step before training.
assert len(padded_sequences) == len(train_labels), "Mismatch in number of samples between padded_sequences and train_labels"
print("len of padded_sequences: ", len(padded_sequences))
print("len of train_labels: ", len(train_labels))

# Convert train_labels to a numpy array
train_labels = np.array(train_labels)



len of padded_sequences:  25000
len of train_labels:  25000


In [29]:
# Define the model

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable when the notebook is re-run from a fresh kernel.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # input_dim matches the tokenizer's num_words=1000. `input_length` is
    # deprecated in Keras 3 (it only produces a warning), so it is omitted.
    Embedding(input_dim=1000, output_dim=16),
    LSTM(32),
    # Single sigmoid unit: output is a score in (0, 1) for the binary label.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping callback: stop once val_loss has not improved for two
# consecutive epochs and roll back to the best weights seen.

early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model with early stopping; validation_split=0.2 holds out part of
# the training data to provide the val_loss that the callback monitors.

history = model.fit(padded_sequences, train_labels, epochs=10, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/10




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.6896 - loss: 0.5637
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8297 - loss: 0.3834
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8370 - loss: 0.3626
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8419 - loss: 0.3556
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8487 - loss: 0.3418
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8540 - loss: 0.3314
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8597 - loss: 0.3211
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8657 - loss: 0.3070
Epoch 9/10
[1m782/782[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x16a030fd0>

In [30]:
# Evaluate the model

# Materialise the test split once, then separate reviews from labels.
test_examples = list(tfds.as_numpy(test_data))
test_texts = [review.decode('utf-8') for review, _ in test_examples]
test_labels = np.array([sentiment for _, sentiment in test_examples])

# Encode with the already-fitted tokenizer and pad to the same length (100)
# used for the training sequences.
test_sequences = tokenizer.texts_to_sequences(test_texts)
padded_test_sequences = pad_sequences(test_sequences, maxlen=100)

# The model is compiled with a loss and one metric, so two values come back.
test_loss, test_accuracy = model.evaluate(padded_test_sequences, test_labels)
print(f'Test loss: {test_loss}, test accuracy: {test_accuracy}')

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8331 - loss: 0.4052
Test loss: 0.400058776140213, test accuracy: 0.8332399725914001


In [36]:
# Function to preprocess and predict custom query

def predict_custom_query(query):
    """Score a single piece of text with the trained sentiment model.

    The query is lower-cased, converted to token ids with the notebook-level
    ``tokenizer``, padded to length 100, and passed to the notebook-level
    ``model``.

    Args:
        query: The text to classify, as a Python string.

    Returns:
        The first element of the first row of ``model.predict``'s output:
        the model's sigmoid score for this query.
    """
    encoded_query = tokenizer.texts_to_sequences([query.lower()])
    model_input = pad_sequences(encoded_query, maxlen=100)

    scores = model.predict(model_input)
    return scores[0][0]


custom_query = "This movie was fantastic! I really enjoyed it."
prediction = predict_custom_query(custom_query)

# Scores above 0.5 are reported as positive, everything else as negative.
if prediction > 0.5:
    sentiment_label = 'POSITIVE'
else:
    sentiment_label = 'NEGATIVE'

print(f'Custom query: "{custom_query}"')
print(f"sentiment: {sentiment_label}")
print(f'Prediction (0 = negative, 1 = positive): {prediction:.3f}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Custom query: "This movie was fantastic! I really enjoyed it."
sentiment: POSITIVE
Prediction (0 = negative, 1 = positive): 0.933
