In [24]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [9]:
movie_reviews = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the worst movie ever', 'sentiment': 'negative'}
    ]
df = pd.DataFrame(movie_reviews)

In [10]:
df

Unnamed: 0,review,sentiment
0,this is the best movie,positive
1,i recommend you watch this movie,positive
2,it was waste of money and time,negative
3,the worst movie ever,negative


## Word Tokenize and Vocab_size

In [11]:
def get_vocab_to_int(df):
    d = {}
    vocab = set()
    df['review'].str.split(' ').apply(vocab.update)
    for idx, word in enumerate(vocab):
        d[word] = idx
    
    return d

vocab_to_int = get_vocab_to_int(df)
vocab_size = len(vocab_to_int)

In [12]:
print(vocab_size)

vocab_to_int

18


{'movie': 0,
 'was': 1,
 'worst': 2,
 'best': 3,
 'i': 4,
 'of': 5,
 'it': 6,
 'time': 7,
 'the': 8,
 'watch': 9,
 'ever': 10,
 'recommend': 11,
 'and': 12,
 'you': 13,
 'this': 14,
 'is': 15,
 'money': 16,
 'waste': 17}

## sentences change with tokenize

In [15]:
reviews = df['review'].tolist()

encoded_reviews = []
for review in reviews:
    tokens = review.split(' ')
    review_encoding = []
    for token in tokens:
        review_encoding.append(vocab_to_int[token])
    encoded_reviews.append(review_encoding)

In [16]:
print(encoded_reviews[0])
print(encoded_reviews[1])
print(encoded_reviews[2])
print(encoded_reviews[3])

[14, 15, 8, 3, 0]
[4, 11, 13, 9, 14, 0]
[6, 1, 17, 5, 16, 12, 7]
[8, 2, 0, 10]


In [17]:
def get_max_length(df):
    max_length = 0
    for row in df['review']:
        if len(row.split(' ')) > max_length:
            max_length = len(row.split(' '))
    
    return max_length

max_length = get_max_length(df)

In [18]:
max_length

7

## pad_sequence

In [20]:
padded_reviews_encoding =\
   pad_sequences(encoded_reviews, maxlen = max_length, padding = 'post')

print(padded_reviews_encoding)

[[14 15  8  3  0  0  0]
 [ 4 11 13  9 14  0  0]
 [ 6  1 17  5 16 12  7]
 [ 8  2  0 10  0  0  0]]


In [21]:
sentiments = df['sentiment'].tolist()

def sentiment_encoded(sentiment):
    if sentiment == 'positive':
        return [1, 0]
    else:
        return [0, 1]
    
encoded_sentiment = [sentiment_encoded(sentiment) for sentiment in sentiments]

print(encoded_sentiment)

[[1, 0], [1, 0], [0, 1], [0, 1]]


## Make Model

In [25]:
model = Sequential()
model.add(Embedding(vocab_size, 3, input_length = max_length))
model.add(SimpleRNN(32))
model.add(Dense(2, activation = 'softmax'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = 'acc')

X_train = np.array(padded_reviews_encoding)
Y_train = np.array(encoded_sentiment)

model.fit(X_train, Y_train, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe700a13520>