# Sentence Classification using LSTM and Pretrained Word2Vec

We will train and test sentence classification using LSTM, and Pretrained Word2Vec.
You can find visualization of our code below.

The most benefits from Pretrained Word Embedding is<br>even unseen words during traning can be predicted well, since pretrained word embedding already trained with larger data set than your train data.

for example, below example also can be predicted well, even "this", "best", "show" were not in the train data.<br> Since "this" is similar to "it", "best" is similar to "good" and "show" is similar to "movie" in pretrained word embedding vector.

We will use LSTM, so we can generate sentence vector with sequence of word embedding.<br>LSTM is advanced RNN which is powerful on long sequence input.

In [2]:
#Import Libraries
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd

import tensorflow_hub as hub
import numpy as np

In [11]:
# Load Pretrained Word2Vec
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

In [12]:
embed(["jump"])

<tf.Tensor: shape=(1, 250), dtype=float32, numpy=
array([[-0.00067931,  0.06408308,  0.0495666 ,  0.05925972, -0.01335577,
         0.04213884, -0.0608239 ,  0.04894666, -0.07230948, -0.07469559,
        -0.03064002,  0.05388073, -0.06971022,  0.00333765, -0.10572395,
         0.00675618,  0.0339472 ,  0.01811906,  0.01162543, -0.00152522,
         0.01761709,  0.05105859, -0.10164404, -0.02336321, -0.04341478,
        -0.00348344,  0.03789383,  0.07577708,  0.02933779,  0.12435406,
         0.11630959, -0.11019364,  0.01011824, -0.02797017,  0.05135059,
        -0.04368721,  0.01803273,  0.11828327,  0.0704509 , -0.02574026,
        -0.06336565, -0.12046516,  0.00759061,  0.05887634,  0.07620929,
         0.08507296, -0.00164223,  0.09397715, -0.07488727,  0.00359939,
         0.04454356, -0.03056428, -0.02355767,  0.10840571, -0.04652384,
         0.02701746, -0.03696478,  0.01072006,  0.04460838,  0.01600937,
         0.02497451, -0.03835671, -0.05878492,  0.00480731, -0.00065042,
 

In [13]:
def get_max_length(df):
    """
    get max token counts from train data,
    so we use this number as fixed length input to RNN cell
    """
    max_length = 0
    for row in df['review']:
        if len(row.split(" ")) > max_length:
            max_length = len(row.split(" "))
    return max_length

def get_word2vec_enc(reviews):
    """
    get word2vec value for each word in sentence.
    concatenate word in numpy array, so we can use it as RNN input
    """
    encoded_reviews = []
    for review in reviews:
        tokens = review.split(" ")
        word2vec_embedding = embed(tokens)
        encoded_reviews.append(word2vec_embedding)
    return encoded_reviews

def get_padded_encoded_reviews(encoded_reviews):
    """
    for short sentences, we prepend zero padding so all input to RNN has same length
    """
    padded_reviews_encoding = []
    for enc_review in encoded_reviews:
        zero_padding_cnt = maxLength - enc_review.shape[0]
        pad = np.zeros((1, 250))
        for i in range(zero_padding_cnt):
            enc_review = np.concatenate((pad, enc_review), axis=0)
        padded_reviews_encoding.append(enc_review)
    return padded_reviews_encoding

def sentiment_encode(sentiment):
    """
    return one hot encoding for Y value
    """
    if sentiment == 'positive':
        return [1,0]
    else:
        return [0,1]

def preprocess(df):
    """
    encode text value to numeric value
    """
    # encode words into word2vec
    reviews = df['review'].tolist()

    encoded_reviews = get_word2vec_enc(reviews)
    padded_encoded_reviews = get_padded_encoded_reviews(encoded_reviews)
    # encoded sentiment
    sentiments = df['sentiment'].tolist()
    encoded_sentiment = [sentiment_encode(sentiment) for sentiment in sentiments]
    X = np.array(padded_encoded_reviews)
    Y = np.array(encoded_sentiment)
    return X, Y

# Preprocess (encode text to number)

In [14]:
movieReviewsTrain = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the best acting but not movie', 'sentiment': 'negative'}
    ]
df = pd.DataFrame(movieReviewsTrain)

df

Unnamed: 0,review,sentiment
0,this is the best movie,positive
1,i recommend you watch this movie,positive
2,it was waste of money and time,negative
3,the best acting but not movie,negative


In [15]:
# max_length is used for max sequence of input
maxLength = get_max_length(df)

maxLength

7

In [16]:
trainX, trainY = preprocess(df)
print(trainX.shape)
print(trainY.shape)
trainY

(4, 7, 250)
(4, 2)


array([[1, 0],
       [1, 0],
       [0, 1],
       [0, 1]])

In [17]:
trainY[1].size

2

# Build Model

In [18]:
# LSTM model
model = Sequential()
# model.add(RNN(16))
model.add(LSTM(32))
# model.add(LSTM(8))
model.add(Dense(2, activation='softmax'))

In [19]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train

In [20]:
print('Train...')
model.fit(trainX, trainY,epochs=50)

Train...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.2500 - loss: 0.7054
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.2500 - loss: 0.6970
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.5000 - loss: 0.6888
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.7500 - loss: 0.6808
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.7500 - loss: 0.6729
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.7500 - loss: 0.6648
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.7500 - loss: 0.6566
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.7500 - loss: 0.6482
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x24538dad390>

In [21]:
model.summary()


# Test
your model can predict correctly even for unseen words from training.
This is the most benefit of using pretrained word embedding.
Why? pretrained embedding will encode [better], [nice] to similar vector of [best]
even if these words were not in train.
therefore, the input vector to RNN is similar, so correct answers for even these unseen words.

In [22]:
"""
movie_reviews_train = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the best acting but no movie', 'sentiment': 'negative'}
    ]
"""
movieReviewsTest = [
         {'review': 'it is better movie', 'sentiment': 'positive'},
         {'review': 'i suggest you see this movie', 'sentiment': 'positive'},
         {'review': 'it was just throwing 20 dollars away', 'sentiment': 'negative'},
         {'review': 'worse than any show', 'sentiment': 'negative'},
         {'review': 'nice movie, so love it', 'sentiment': 'positive'},
         {'review': 'terrible', 'sentiment': 'negative'}
    ]
testDf = pd.DataFrame(movieReviewsTest)

testX, testY = preprocess(testDf)

score, acc = model.evaluate(testX, testY, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

1/1 - 1s - 1s/step - accuracy: 0.6667 - loss: 0.5422
Test score: 0.5421934127807617
Test accuracy: 0.6666666865348816
