<a href="https://colab.research.google.com/github/harssh15/Generative-ai-projects/blob/main/Project_on_Sentiment_analysis_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split


In [6]:
df = pd.read_csv("/content/drive/MyDrive/data/1111.csv")

df.head()



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.shape

(50000, 2)

In [None]:
print(df['sentiment'].value_counts())

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:
df['sentiment'] = df['sentiment'].map({
    'positive': 1,
    'negative': 0
})


In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
X = df['review'].values
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
X_train

array(['That\'s what I kept asking myself during the many fights, screaming matches, swearing and general mayhem that permeate the 84 minutes. The comparisons also stand up when you think of the one-dimensional characters, who have so little depth that it is virtually impossible to care what happens to them. They are just badly written cyphers for the director to hang his multicultural beliefs on, a topic that has been done much better in other dramas both on TV and the cinema.<br /><br />I must confess, I\'m not really one for spotting bad performances during a film, but it must be said that Nichola Burley (as the heroine\'s slutty best friend) and Wasim Zakir (as the nasty, bullying brother) were absolutely terrible. I don\'t know what acting school they graduated from, but if I was them I\'d apply for a full refund post haste. Only Samina Awan in the lead role manages to impress in a cast of so-called British talent that we\'ll probably never hear from again. At least, that\'s the h

In [None]:
y_train, y_test

(array([0, 0, 1, ..., 0, 1, 1]), array([1, 1, 0, ..., 1, 0, 1]))

In [None]:
VOCAB_SIZE = 10000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)




In [None]:
tokenizer.word_index

{'<OOV>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'br': 8,
 'in': 9,
 'it': 10,
 'i': 11,
 'this': 12,
 'that': 13,
 'was': 14,
 'as': 15,
 'for': 16,
 'with': 17,
 'movie': 18,
 'but': 19,
 'film': 20,
 'on': 21,
 'not': 22,
 'you': 23,
 'are': 24,
 'his': 25,
 'have': 26,
 'be': 27,
 'one': 28,
 'he': 29,
 'all': 30,
 'at': 31,
 'by': 32,
 'an': 33,
 'they': 34,
 'so': 35,
 'from': 36,
 'who': 37,
 'like': 38,
 'or': 39,
 'just': 40,
 'her': 41,
 'out': 42,
 'about': 43,
 'if': 44,
 "it's": 45,
 'has': 46,
 'there': 47,
 'some': 48,
 'what': 49,
 'good': 50,
 'more': 51,
 'very': 52,
 'when': 53,
 'up': 54,
 'no': 55,
 'time': 56,
 'my': 57,
 'even': 58,
 'she': 59,
 'would': 60,
 'which': 61,
 'only': 62,
 'story': 63,
 'really': 64,
 'see': 65,
 'their': 66,
 'had': 67,
 'can': 68,
 'me': 69,
 'well': 70,
 'were': 71,
 'we': 72,
 'than': 73,
 'much': 74,
 'bad': 75,
 'been': 76,
 'do': 77,
 'get': 78,
 'great': 79,
 'also': 80,
 'will': 81,
 'other': 82,
 '

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train[0]

'That\'s what I kept asking myself during the many fights, screaming matches, swearing and general mayhem that permeate the 84 minutes. The comparisons also stand up when you think of the one-dimensional characters, who have so little depth that it is virtually impossible to care what happens to them. They are just badly written cyphers for the director to hang his multicultural beliefs on, a topic that has been done much better in other dramas both on TV and the cinema.<br /><br />I must confess, I\'m not really one for spotting bad performances during a film, but it must be said that Nichola Burley (as the heroine\'s slutty best friend) and Wasim Zakir (as the nasty, bullying brother) were absolutely terrible. I don\'t know what acting school they graduated from, but if I was them I\'d apply for a full refund post haste. Only Samina Awan in the lead role manages to impress in a cast of so-called British talent that we\'ll probably never hear from again. At least, that\'s the hope. Ne

In [None]:
X_train_seq[0]

[198,
 49,
 11,
 802,
 2161,
 535,
 303,
 2,
 108,
 1908,
 2039,
 4325,
 6551,
 3,
 816,
 4845,
 13,
 1,
 2,
 1,
 228,
 2,
 6191,
 80,
 783,
 54,
 53,
 23,
 102,
 5,
 2,
 28,
 2010,
 103,
 37,
 26,
 35,
 120,
 1131,
 13,
 10,
 7,
 2367,
 1191,
 6,
 455,
 49,
 553,
 6,
 93,
 34,
 24,
 40,
 917,
 401,
 1,
 16,
 2,
 167,
 6,
 2994,
 25,
 1,
 4360,
 21,
 4,
 2946,
 13,
 46,
 76,
 222,
 74,
 127,
 9,
 82,
 3156,
 196,
 21,
 240,
 3,
 2,
 448,
 8,
 8,
 11,
 206,
 4983,
 145,
 22,
 64,
 28,
 16,
 1,
 75,
 366,
 303,
 4,
 20,
 19,
 10,
 206,
 27,
 308,
 13,
 1,
 1,
 15,
 2,
 1,
 1,
 116,
 440,
 3,
 1,
 1,
 15,
 2,
 1705,
 1,
 616,
 71,
 425,
 382,
 11,
 90,
 119,
 49,
 113,
 371,
 34,
 1,
 36,
 19,
 44,
 11,
 14,
 93,
 486,
 6831,
 16,
 4,
 365,
 1,
 1183,
 1,
 62,
 1,
 1,
 9,
 2,
 471,
 217,
 1015,
 6,
 4161,
 9,
 4,
 175,
 5,
 35,
 441,
 698,
 624,
 13,
 3749,
 238,
 112,
 849,
 36,
 171,
 31,
 220,
 198,
 2,
 429,
 368,
 56,
 3766,
 4,
 279,
 1,
 8,
 8,
 158,
 1708,
 188,
 7,
 2,
 1,
 1,
 7

In [None]:
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)


In [None]:
X_train_pad

array([[ 145, 1084,   17, ...,  206,  352, 3857],
       [ 311,    6,  426, ...,   90,  104,   10],
       [   0,    0,    0, ...,    3,  711,   63],
       ...,
       [   0,    0,    0, ..., 1642,    3,  604],
       [   0,    0,    0, ...,  126, 7286,    1],
       [   0,    0,    0, ...,   71,   74, 2063]], dtype=int32)

In [None]:
embedding_vector_features = 64 # feature representation
model = Sequential()
model.add(Embedding(VOCAB_SIZE, embedding_vector_features, input_length =MAX_LEN ))
model.add(SimpleRNN(100))
model.add(Dense(1, activation = "sigmoid"))
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
print(model.summary())



None


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

VOCAB_SIZE = 10000
EMBEDDING_DIM = 100
MAX_LEN = 200   # sequence length

model = Sequential([
    Embedding(input_dim=VOCAB_SIZE,
              output_dim=EMBEDDING_DIM,
              input_shape=(MAX_LEN,)),
    SimpleRNN(64),
    Dense(1, activation='sigmoid')
])

model.summary()


  super().__init__(**kwargs)


In [None]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
print(model.summary())

None


In [None]:
# model training
model.fit(X_train_pad, y_train, validation_data = (X_test_pad, y_test), epochs = 10, batch_size = 64)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 95ms/step - accuracy: 0.6319 - loss: 0.6122 - val_accuracy: 0.7962 - val_loss: 0.4504
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 93ms/step - accuracy: 0.8212 - loss: 0.4017 - val_accuracy: 0.8180 - val_loss: 0.4287
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 91ms/step - accuracy: 0.9113 - loss: 0.2234 - val_accuracy: 0.8144 - val_loss: 0.4857
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 91ms/step - accuracy: 0.9591 - loss: 0.1188 - val_accuracy: 0.7993 - val_loss: 0.5603
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 90ms/step - accuracy: 0.9817 - loss: 0.0558 - val_accuracy: 0.8027 - val_loss: 0.6614
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 89ms/step - accuracy: 0.9871 - loss: 0.0404 - val_accuracy: 0.7886 - val_loss: 0.8166
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7b438b8d54c0>

In [None]:
y_pred = model.predict(X_test_pad)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step


In [None]:
y_pred = np.where(y_pred>0.4, 1,0)
y_pred

array([[1],
       [0],
       [1],
       ...,
       [1],
       [0],
       [1]])

In [None]:
y_test

array([1, 1, 0, ..., 1, 0, 1])

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,3119,1842
1,891,4148


In [None]:
accuracy_score(y_test, y_pred)

0.7267

In [None]:
!pip install streamlit pyngrok tensorflow nltk




In [None]:
import pickle

model.save("sentiment_model.h5")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)




In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# -------------------------------
# Load model and tokenizer
# -------------------------------
model = load_model("sentiment_model.h5")

with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

MAX_LEN = 200   # same as training

# -------------------------------
# Streamlit UI
# -------------------------------
st.set_page_config(page_title="Sentiment Analysis App", layout="centered")

st.title("🧠 Sentiment Analysis Web App")
st.write("Binary Classification (Positive / Negative)")
st.write("Model Accuracy: **64%**")

# Text input
user_text = st.text_area("Enter your review or sentence:", height=150)

# Prediction button
if st.button("Predict Sentiment"):

    if len(user_text.strip()) == 0:
        st.warning("Please enter some text.")
    else:
        # Tokenization
        seq = tokenizer.texts_to_sequences([user_text])
        padded = pad_sequences(seq, maxlen=MAX_LEN, padding="post")

        # Prediction
        pred = model.predict(padded)[0][0]

        # Decision threshold
        sentiment = "Positive 😊" if pred > 0.5 else "Negative 😞"

        # Display result
        st.subheader("Prediction Result")
        st.write("Sentiment:", sentiment)
        st.write("Confidence Score:", round(float(pred), 3))


Overwriting app.py


In [None]:
from pyngrok import ngrok


In [None]:
!streamlit run app.py &>/content/logs.txt &


In [None]:
# Replace 'YOUR_AUTHTOKEN_HERE' with your actual token from https://dashboard.ngrok.com/get-started/your-authtoken
import os
from pyngrok import ngrok

AUTHTOKEN = "YOUR_AUTHTOKEN_HERE"
ngrok.set_auth_token(AUTHTOKEN)

public_url = ngrok.connect(8501)
public_url

ERROR:pyngrok.process.ngrok:t=2026-02-06T06:27:28+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: YOUR_AUTHTOKEN_HERE\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: YOUR_AUTHTOKEN_HERE\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n.