In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("IMDB_Dataset.csv")
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
print(df['sentiment'].value_counts())

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [6]:
df['sentiment'] = df['sentiment'].map({
    'positive': 1,
    'negative': 0
})

In [7]:
X = df['review'].values
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
y_train, y_test

(array([0, 0, 1, ..., 0, 1, 1], dtype=int64),
 array([1, 1, 0, ..., 1, 0, 1], dtype=int64))

In [9]:
VOCAB_SIZE = 10000
MAX_LEN = 200          #length of a review is 200 max

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")    #OOV out of vocablary tokens will be replaced with <OOV>
tokenizer.fit_on_texts(X_train)     #converts vored in to tokes

In [10]:
tokenizer.word_index

{'<OOV>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'br': 8,
 'in': 9,
 'it': 10,
 'i': 11,
 'this': 12,
 'that': 13,
 'was': 14,
 'as': 15,
 'for': 16,
 'with': 17,
 'movie': 18,
 'but': 19,
 'film': 20,
 'on': 21,
 'not': 22,
 'you': 23,
 'are': 24,
 'his': 25,
 'have': 26,
 'be': 27,
 'one': 28,
 'he': 29,
 'all': 30,
 'at': 31,
 'by': 32,
 'an': 33,
 'they': 34,
 'so': 35,
 'from': 36,
 'who': 37,
 'like': 38,
 'or': 39,
 'just': 40,
 'her': 41,
 'out': 42,
 'about': 43,
 'if': 44,
 "it's": 45,
 'has': 46,
 'there': 47,
 'some': 48,
 'what': 49,
 'good': 50,
 'more': 51,
 'very': 52,
 'when': 53,
 'up': 54,
 'no': 55,
 'time': 56,
 'my': 57,
 'even': 58,
 'she': 59,
 'would': 60,
 'which': 61,
 'only': 62,
 'story': 63,
 'really': 64,
 'see': 65,
 'their': 66,
 'had': 67,
 'can': 68,
 'me': 69,
 'well': 70,
 'were': 71,
 'we': 72,
 'than': 73,
 'much': 74,
 'bad': 75,
 'been': 76,
 'do': 77,
 'get': 78,
 'great': 79,
 'also': 80,
 'will': 81,
 'other': 82,
 '

In [15]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [16]:
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [17]:
X_train_pad

array([[ 145, 1084,   17, ...,  206,  352, 3857],
       [ 311,    6,  426, ...,   90,  104,   10],
       [   0,    0,    0, ...,    3,  711,   63],
       ...,
       [   0,    0,    0, ..., 1642,    3,  604],
       [   0,    0,    0, ...,  126, 7286,    1],
       [   0,    0,    0, ...,   71,   74, 2063]])

In [11]:
embedding_vector_features = 64 # feature representation
model = Sequential()
model.add(Embedding(VOCAB_SIZE, embedding_vector_features, input_length =MAX_LEN ))
model.add(SimpleRNN(100))
model.add(Dense(1, activation = "sigmoid"))
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           640000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               16500     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 656,601
Trainable params: 656,601
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)