In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

**Load data**

---


You can download the data in [CSV format from Kaggle](https://www.kaggle.com/datasets/columbine/imdb-dataset-sentiment-analysis-in-csv-format), so you don't need to worry much about compiling the dataset yourself. Just download the data and then upload `Train.csv` to the file area. If you want to load the data from text files and learn more, try this [official tutorial](https://www.tensorflow.org/tutorials/keras/text_classification).

In [None]:
# Read the Kaggle training split from the working directory into a DataFrame
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Train.csv')
df.head(n=5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [None]:
# Summarize the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [None]:
# Separate the review text (model input) from the sentiment label (target).
x, y = df['text'], df['label']

**Split the data into training and testing set**

---



In [None]:
# Hold out 20% of the reviews for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=50,
)

In [None]:

# Build the vocabulary from the training reviews only, so no information from
# the held-out reviews leaks into preprocessing.
#
# num_words caps how many of the most frequent words are kept when texts are
# converted to sequences; everything else is mapped to the OOV token. It is
# set to 5000 to match the vocabulary size the model's Embedding layer is
# built with -- a cap of 100 would turn almost every word into <OOV>.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# word_index holds every word seen in the corpus (it is not limited by
# num_words), so show its size and a short preview instead of dumping it all.
print(len(word_index))
print(dict(list(word_index.items())[:20]))



In [None]:

# Hyperparameters shared by the preprocessing steps and the model.
vocab_size = 5000      # input dimension of the Embedding layer
embedding_dim = 32     # size of each learned word vector
max_length = 75        # every review is padded/truncated to this many tokens
trunc_type = 'post'    # drop tokens from the end of over-long reviews
pad_type = 'post'      # append zeros to the end of short reviews
oov_tok = "<OOV>"      # same out-of-vocabulary token the tokenizer was created with

# Training reviews -> integer sequences -> fixed-length matrix.
train_seq = tokenizer.texts_to_sequences(X_train)
train_pad_seq = pad_sequences(train_seq, maxlen=max_length,
                              truncating=trunc_type, padding=pad_type)

# Validation reviews must go through exactly the same padding/truncation as the
# training reviews. pad_sequences defaults to 'pre' for both, so the settings
# are passed explicitly; otherwise validation metrics would be measured on
# differently laid-out inputs than the model was trained on.
valid_seq = tokenizer.texts_to_sequences(X_test)
valid_pad_seq = pad_sequences(valid_seq, maxlen=max_length,
                              truncating=trunc_type, padding=pad_type)

# Keras expects array-like labels rather than pandas Series.
training_labels_final = np.array(Y_train)
validation_labels_final = np.array(Y_test)

In [None]:

# Simple feed-forward sentiment classifier:
#   token ids -> learned word embeddings -> flatten -> small dense layer -> sigmoid.
#
# The input shape is declared with an explicit Input instead of the deprecated
# `input_length` argument of Embedding, so the model is built immediately and
# summary() can report output shapes and parameter counts.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of a positive review.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 75, 32)            160000    
                                                                 
 flatten_1 (Flatten)         (None, 2400)              0         
                                                                 
 dense_3 (Dense)             (None, 6)                 14406     
                                                                 
 dense_4 (Dense)             (None, 1)                 7         
                                                                 
Total params: 174,413
Trainable params: 174,413
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for a fixed number of passes over the training set, evaluating on the
# held-out reviews after every epoch. The returned History object is kept so
# the per-epoch metrics can be inspected afterwards.
num_epochs = 10
history = model.fit(
    x=train_pad_seq,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(valid_pad_seq, validation_labels_final),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Predict a review**

---

Now you can test this model. We need to tokenize the test data before predicting. Try changing the reviews in the `test_reviews` list, or adding more of your own:

In [None]:

test_reviews = ["This movie is not good at all. I did not enjoyed much",
                "One of the best movie I have ever seen. Recommend everyone to watch this movie"]


# Create the sequences.
# New reviews must be prepared exactly like the training data: same tokenizer,
# same length, and the same padding AND truncation sides (pad_sequences would
# otherwise truncate long reviews from the front by default).
test_sequences = tokenizer.texts_to_sequences(test_reviews)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            truncating=trunc_type, padding=pad_type)

classes = model.predict(test_padded)

# Closer to 1 means positive
# The loop uses its own variable names so it does not overwrite the
# notebook-level `x` (the review-text Series used by the split cell).
for review, score in zip(test_reviews, classes):
  print(review)
  print(score)
  print('\n')


This movie is not good at all. I did not enjoyed much
[0.4522577]


One of the best movie I have ever seen. Recommend everyone to watch this movie
[0.72242755]


