List of libraries used for the project

In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras._tf_keras.keras.models import Sequential
from keras._tf_keras.keras.layers import Dense, Embedding,LSTM  
from keras._tf_keras.keras.preprocessing.text import Tokenizer 
from keras._tf_keras.keras.preprocessing.sequence import pad_sequences
import joblib
import gradio as gr


Preparing the dataset

In [8]:
# Load the IMDB reviews; the path is relative to the kernel's working directory.
DATASET_PATH = "../Datasets/IMDB_Dataset.csv"
data = pd.read_csv(DATASET_PATH)


In [9]:
# Preview the first rows to confirm the `review` / `sentiment` columns loaded as expected.
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
# Preview the last rows as well.
data.tail()


Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [11]:
# (rows, columns) of the loaded frame.
data.shape


(50000, 2)

Let us see how many of the reviews are positive and how many are negative

In [12]:
# Class balance: number of reviews per sentiment label.
data["sentiment"].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Label encoding (not one-hot encoding): each sentiment string is mapped to a single integer
positive -> 1
negative -> 0 

In [13]:
# Opt in to pandas' future behaviour for `replace`: the result is no longer
# silently downcast (the implicit downcast is currently deprecated).
pd.set_option('future.no_silent_downcasting', True)

# Label encoding used for the binary target.
SENTIMENT_TO_LABEL = {"positive": 1, "negative": 0}

# Rebind `data` rather than mutating it in place, so the frame returned by
# `read_csv` is never modified behind the reader's back and the cell stays
# safe to re-run (already-encoded values simply do not match any key).
data = data.replace({"sentiment": SENTIMENT_TO_LABEL})

# Fail fast, with a readable message, if the CSV contains anything other than
# the two expected labels (typos, different casing, missing values, ...).
unexpected_labels = set(data["sentiment"].unique()) - set(SENTIMENT_TO_LABEL.values())
assert not unexpected_labels, f"unexpected sentiment values: {unexpected_labels}"

# With silent downcasting disabled the replaced column keeps dtype `object`;
# make it a real integer column so downstream code gets numeric labels.
data["sentiment"] = data["sentiment"].astype(int)

display(data.head())
display(data.tail())
data["sentiment"].value_counts()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


sentiment
1    25000
0    25000
Name: count, dtype: int64

LSTM -> Long Short-Term Memory
RNN -> Recurrent Neural Network, a model family suited to sequential data such as text

In [14]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Report the size of each split.
for split in (train_data, test_data):
    print(split.shape)


(40000, 2)
(10000, 2)


In [38]:
VOCAB_SIZE = 5000      # vocabulary cap passed to the tokenizer (arbitrary choice)
MAX_REVIEW_LEN = 200   # every review is encoded as exactly this many token ids

# Build the word index from the training reviews only, so nothing from the
# test split leaks into the vocabulary.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_data["review"])

# Text -> lists of integer token ids.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Token-id lists -> fixed-width integer matrices (short reviews are zero-padded).
X_train = pad_sequences(train_sequences, maxlen=MAX_REVIEW_LEN)
X_test = pad_sequences(test_sequences, maxlen=MAX_REVIEW_LEN)


In [39]:
# Inspect the encoded training matrix (one row of token ids per review).
X_train


array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [43]:
# Targets for each split, cast to plain integers (1 = positive, 0 = negative).
Y_train, Y_test = (
    split["sentiment"].astype(int) for split in (train_data, test_data)
)

Y_train




39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

Model Building

In [18]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# `tf.config.list_physical_devices` is the stable replacement for the deprecated
# `tf.config.experimental.list_physical_devices` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
try:
    for device in gpu_devices:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Memory growth can only be configured before the GPUs are initialised.
    # Re-running this cell after a model has been built raises RuntimeError;
    # report it instead of aborting the notebook run.
    print(f"Could not enable GPU memory growth: {err}")


I0000 00:00:1739741387.888906    5063 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1739741388.143333    5063 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1739741388.145178    5063 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [None]:
# Embedding -> LSTM -> sigmoid: a single-output binary sentiment classifier.
model = Sequential([
    # The embedding table must cover every token id the tokenizer can emit, so
    # its size is taken from the tokenizer instead of repeating the literal
    # 5000; the two can no longer drift apart if the vocabulary cap changes.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    # NOTE(review): per the Keras LSTM docs a non-zero `recurrent_dropout`
    # rules out the fused cuDNN kernel, which may explain slow GPU steps.
    # Left unchanged so the trained model stays the same.
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    # One sigmoid unit: the output is the predicted probability of label 1.
    Dense(1, activation="sigmoid"),
])



In [46]:
# Print the layer-by-layer architecture of the model.
model.summary()


In [47]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked
# as the human-readable metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)




In [50]:
# Train for 5 epochs; 20% of the training rows are held out for validation,
# and `result` keeps the per-epoch history returned by `fit`.
result = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 226ms/step - accuracy: 0.8513 - loss: 0.3535 - val_accuracy: 0.8680 - val_loss: 0.3131
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 230ms/step - accuracy: 0.8932 - loss: 0.2643 - val_accuracy: 0.8798 - val_loss: 0.2957
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 228ms/step - accuracy: 0.9137 - loss: 0.2201 - val_accuracy: 0.8820 - val_loss: 0.3058
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 229ms/step - accuracy: 0.9288 - loss: 0.1872 - val_accuracy: 0.8745 - val_loss: 0.3264
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 227ms/step - accuracy: 0.9408 - loss: 0.1585 - val_accuracy: 0.8824 - val_loss: 0.3183


In [55]:
# Persist both artifacts needed for inference: the trained network and the
# fitted tokenizer (new reviews must be encoded with the same word index).
model.save("movies_model.keras")
# NOTE: joblib files are pickles — only load tokenizer.pkl from a trusted source.
joblib.dump(tokenizer,"tokenizer.pkl")


['tokenizer.pkl']

In [None]:


# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(X_test,Y_test)

# Report both metrics in the same "name:  value" layout.
for label, value in (("loss: ", loss), ("accuracy: ", accuracy)):
    print(label, value)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 72ms/step - accuracy: 0.8828 - loss: 0.3015
loss:  0.2967463731765747
accuracy:  0.8876000046730042


Building predictive system 

In [67]:
def pred_system(review: str, threshold: float = 0.5, maxlen: int = 200) -> tuple:
    """Classify a single movie review as positive or negative.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        review: Raw review text.
        threshold: Score above which the review is labelled "positive".
            Defaults to 0.5, the value previously hard-coded.
        maxlen: Length the token sequence is padded to. Defaults to 200,
            the length used when the training data was encoded; keep the two
            in sync.

    Returns:
        A `(sentiment, score)` tuple, where `sentiment` is "positive" or
        "negative" and `score` is the model's raw sigmoid output for the
        review (the predicted probability of the positive class).
    """
    # The tokenizer works on batches, so wrap the single review in a list.
    sequences = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequences=sequences, maxlen=maxlen)
    prediction = model.predict(padded_sequence)
    # One review in, one sigmoid unit out: the score sits at [0][0].
    score = prediction[0][0]
    sentiment = "positive" if score > threshold else "negative"
    return sentiment, score


In [68]:
# Smoke test: clearly positive wording.
pred_system("This movie was fantastic and amazing")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step


('positive', 0.9899099)

In [69]:
# Smoke test: clearly negative wording.
pred_system("This movie was awful")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step


('negative', 0.0018804373)

In [72]:
# Neutral wording: there is no "neutral" class, so the model has to pick a side.
pred_system("This movie was average")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step


('negative', 0.3241145)

gradio section 

In [None]:
title = "movie sentiment analysis app"


def _predict_for_ui(review):
    """Adapt `pred_system` for Gradio: return one plain string per output box."""
    sentiment, score = pred_system(review)
    # Format the score here so the UI shows a short decimal instead of the
    # repr of a tuple / NumPy scalar.
    return sentiment, f"{float(score):.4f}"


# `pred_system` returns two values, so declare two output components; with a
# single "textbox" output the whole tuple is rendered as one raw string.
app = gr.Interface(
    fn=_predict_for_ui,
    inputs="textbox",
    outputs=[
        gr.Textbox(label="Sentiment"),
        gr.Textbox(label="Positive-class probability"),
    ],
    title=title,
)

# share=True also publishes a temporary public URL, so anyone with the link can
# query the model. Switch to share=False to keep the app local.
app.launch(share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://d457c78c08c4cd9892.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


