In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
data=pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv',nrows=10000)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


## Text Preprocessing

In [5]:
data['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
#define a function to apply on data
def apply_on_data(function):
  data['review']=data['review'].map(function)
  return data['review'][1]

In [7]:
import re
def remove_htmltags(text):
  pattern=re.compile('<.*?>')
  return pattern.sub(r'',text)


In [8]:
apply_on_data(remove_htmltags)

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [9]:
#function to remove url
def remove_url(text):
  pattern=re.compile(r'\s*https?://\S+(\s+|$)')
  return pattern.sub(r'',text)

In [10]:
remove_url('https://www.google.com this is url')

'this is url'

In [11]:
apply_on_data(remove_url)

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [12]:
import string
def remove_punctuations(text):
  for w in string.punctuation:
    text=text.replace(w,'')
  return text



In [13]:
remove_punctuations(data['review'][0])

'One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked They are right as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to manyAryans Muslims gangstas Latinos Christians Italians Irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare Forget pretty pictur

In [14]:
apply_on_data(remove_punctuations)

'A wonderful little production The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece A masterful production about one of the great masters of comedy and his life The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets particularly of their flat with Halliwells murals decorating every surface are terribly well done'

In [15]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
def remove_stopwords(text):
  new_text=[]
  for w in text.split():
    if w in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(w)
  return " ".join(new_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
remove_stopwords(data['review'][1])

'A wonderful little production The filming technique   unassuming  oldtimeBBC fashion  gives  comforting  sometimes discomforting sense  realism   entire piece The actors  extremely well chosen Michael Sheen    got   polari      voices  pat  You  truly see  seamless editing guided   references  Williams diary entries     well worth  watching     terrificly written  performed piece A masterful production  one   great masters  comedy   life The realism really comes home   little things  fantasy   guard  rather  use  traditional dream techniques remains solid  disappears It plays   knowledge   senses particularly   scenes concerning Orton  Halliwell   sets particularly   flat  Halliwells murals decorating every surface  terribly well done'

In [17]:
apply_on_data(remove_stopwords)

'A wonderful little production The filming technique   unassuming  oldtimeBBC fashion  gives  comforting  sometimes discomforting sense  realism   entire piece The actors  extremely well chosen Michael Sheen    got   polari      voices  pat  You  truly see  seamless editing guided   references  Williams diary entries     well worth  watching     terrificly written  performed piece A masterful production  one   great masters  comedy   life The realism really comes home   little things  fantasy   guard  rather  use  traditional dream techniques remains solid  disappears It plays   knowledge   senses particularly   scenes concerning Orton  Halliwell   sets particularly   flat  Halliwells murals decorating every surface  terribly well done'

In [18]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [19]:
data['sentiment']=data['sentiment'].map({
    'positive':1,
    'negative':0
})
data['sentiment'].head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [20]:
X=data['review']
y=data['sentiment']

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
len(X_train),len(y_train),len(X_test),len(y_test)

(8000, 8000, 2000, 2000)

In [22]:
tokenizer=Tokenizer(oov_token='nothing')
tokenizer.fit_on_texts(X_train)

In [23]:
sequence=tokenizer.texts_to_sequences(X_train)
padded=pad_sequences(sequence,padding='post',maxlen=100)

In [24]:
training_padded=np.array(padded)
training_padded[1]

array([   56,   109,   477,   950,  3530, 16510,  8755,   185,  1191,
           3,  3531,  2134,  5977,    69,  1606,   399,    67,    30,
           7,  3897,     4,  7795,  5773,     4, 32014,  5773,    23,
        1234,   157,   136, 32015, 32016, 19265,    40,     4,  3133,
         298,     4,  2367, 32017,  6737,     4,   165,   298,   868,
          40,  1967,   157,   751,  5771,  5773, 32018,  5978, 23728,
        1234,     5,  1289,  2518, 19266,  1614,  2408,  5397,  4070,
        5771,    12,   194,  1506, 19267,  2703,    23,   164,   345,
        8251, 11832,   253,  1197,  3134,   129,    10, 16511, 11833,
        4303,   630,    52,   195,   345, 12963, 19268,   497,   763,
         825, 32019,  7796,  3253,   575,  1721,    66,    27,  1397,
         304], dtype=int32)

In [25]:
model=keras.Sequential([
    keras.layers.Embedding(8000,24,input_length=100),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16,activation='relu'),
    keras.layers.Dense(1,activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 24)           192000    
                                                                 
 global_average_pooling1d (  (None, 24)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                400       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 192417 (751.63 KB)
Trainable params: 192417 (751.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics='accuracy'
)

In [27]:
model.fit(training_padded,y_train,epochs=30,verbose=2)

Epoch 1/30
250/250 - 40s - loss: 0.6374 - accuracy: 0.7296 - 40s/epoch - 160ms/step
Epoch 2/30
250/250 - 15s - loss: 0.3610 - accuracy: 0.8844 - 15s/epoch - 61ms/step
Epoch 3/30
250/250 - 8s - loss: 0.2207 - accuracy: 0.9224 - 8s/epoch - 33ms/step
Epoch 4/30
250/250 - 5s - loss: 0.1564 - accuracy: 0.9513 - 5s/epoch - 19ms/step
Epoch 5/30
250/250 - 5s - loss: 0.1123 - accuracy: 0.9672 - 5s/epoch - 21ms/step
Epoch 6/30
250/250 - 5s - loss: 0.0805 - accuracy: 0.9795 - 5s/epoch - 19ms/step
Epoch 7/30
250/250 - 3s - loss: 0.0558 - accuracy: 0.9889 - 3s/epoch - 11ms/step
Epoch 8/30
250/250 - 2s - loss: 0.0381 - accuracy: 0.9945 - 2s/epoch - 10ms/step
Epoch 9/30
250/250 - 2s - loss: 0.0269 - accuracy: 0.9965 - 2s/epoch - 8ms/step
Epoch 10/30
250/250 - 3s - loss: 0.0192 - accuracy: 0.9980 - 3s/epoch - 12ms/step
Epoch 11/30
250/250 - 3s - loss: 0.0136 - accuracy: 0.9990 - 3s/epoch - 11ms/step
Epoch 12/30
250/250 - 1s - loss: 0.0095 - accuracy: 0.9995 - 1s/epoch - 5ms/step
Epoch 13/30
250/250 - 

<keras.src.callbacks.History at 0x7d624fd42c50>

In [28]:
sentence = ["best movie. watch it again"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
print(model.predict(padded))

[[0.9405478]]


In [29]:
sentence = ["very bad movie, never watch it again"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
print(model.predict(padded))

[[0.35041112]]
