# Sentiment classification. IMDb dataset.

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
import gensim

In [2]:
print(tf.__version__)

2.1.0


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/capstone2/sentiment_analysis/imdb_dataset.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [0]:
from sklearn import preprocessing
# Create a label (category) encoder object
le = preprocessing.LabelEncoder()
# Fit the encoder to the pandas column
le.fit(df['sentiment'])
# Apply the fitted encoder to the pandas column
df['sentiment'] = le.transform(df['sentiment']) 

In [6]:
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1


# Preprocess dataset.

In [7]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
nltk.download('punkt')
nltk.download('stopwords')

def preprocessor(text):
    '''Preprocessor function to tokenize, 
    remove the markup and join back to a string. '''

    ## Remove HTML markup and standardize emotion characters
    #text = re.sub('<[^>]*>', '', text)
    #emotions = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
    #                       text)
    #text = (re.sub('[\W]+', ' ', text.lower()) +
    #        ' '.join(emotions).replace('-', ''))
    
    # tokenize
    tokens = word_tokenize(text)
    
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    
    # remove punctutation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    
    # remove non-alphabetic tokens
    words = [word for word in stripped if word.isalpha()]
    
    # remove stop words
    stop_words = stopwords.words('english')
    words = [w for w in words if w not in stop_words]
    
    # join back to a string
    seperator = ' '
    text = seperator.join([w for w in words])
    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
df['review'] = df['review'].apply(preprocessor)

In [9]:
df.head(10)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1
5,probably alltime favorite movie story selfless...,1
6,sure would like see resurrection dated seahunt...,1
7,show amazing fresh innovative idea first aired...,0
8,encouraged positive comments film looking forw...,0
9,like original gut wrenching laughter like movi...,1


# Word2Vec model

In [0]:
def w2v_input(column):
    reviews = list()

    for index, row in df.iterrows():
        reviews.append(gensim.utils.simple_preprocess(row[column]))
    
    return reviews

In [32]:
reviews = w2v_input('review')
# build vocabulary and train model
w2v_model = gensim.models.Word2Vec(reviews,
                                   size=150,
                                   window=10,
                                   min_count=2, 
                                   workers=-1)
    
w2v_model.train(reviews, total_examples=len(reviews), epochs=100)

(0, 0)

In [33]:
# vocabulary size
w2v_vocabulary = list(w2v_model.wv.vocab)
print('Vocabulary size is: %d' %len(w2v_vocabulary))

Vocabulary size is: 67899


In [34]:
w2v_model.wv.most_similar('sad')

  if np.issubdtype(vec.dtype, np.int):


[('homeier', 0.34255775809288025),
 ('reworking', 0.32878434658050537),
 ('goneril', 0.3280695080757141),
 ('bluto', 0.3236783444881439),
 ('erie', 0.31672319769859314),
 ('idling', 0.31376826763153076),
 ('fastidious', 0.30772221088409424),
 ('proclaim', 0.29738378524780273),
 ('martialarts', 0.2972051203250885),
 ('fudges', 0.29405030608177185)]

In [35]:
w2v_model.wv.doesnt_match('woman king queen movie'.split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'movie'

# Create a dataset.

In [0]:
target = df.pop('sentiment')

ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.values, target.values))

In [0]:
# Inspect:
for entry in ds_raw.take(3):
    tf.print(entry[0].numpy()[0][:50], entry[1])

# Train / validation / test split. 

In [0]:
tf.random.set_seed(1)

ds_raw = ds_raw.shuffle(
    50000, reshuffle_each_iteration=False)

ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

# Find unique tokens.

In [0]:
from collections import Counter
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()
token_counts = Counter()

for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    token_counts.update(tokens)
    
print('Vocab-size:', len(token_counts))

# Encoding each unique token into integers.

In [0]:
encoder = tfds.features.text.TokenTextEncoder(token_counts)

In [0]:
example_str = 'read watch although'
encoder.encode(example_str)

# Define the function for transformation.

In [0]:
def encode(text_tensor, label):
    text = text_tensor.numpy()[0]
    encoded_text = encoder.encode(text)
    return encoded_text, label

# Wrap the encode function to a TensorFlow operator
def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], 
                          Tout=(tf.int64, tf.int64))

In [0]:
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(5):
    print('Sequence length:', example[0].shape)
    
example

# Batching the dataset.

In [0]:
train_data = ds_train.padded_batch(
    32, padded_shapes=([-1],[]))

valid_data = ds_valid.padded_batch(
    32, padded_shapes=([-1],[]))

test_data = ds_test.padded_batch(
    32, padded_shapes=([-1],[]))

# RNN model.

In [0]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

embedding_dim = 20
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

# build the model
bi_lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size,
                              output_dim=embedding_dim),
    tf.keras.layers.Dense(20,
                          input_shape=(embedding_dim,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(20, 
                             return_sequences=False)),
    tf.keras.layers.Dense(2, 
                          activation='softmax')
])

bi_lstm_model.summary()

# compile and train:
bi_lstm_model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

history = bi_lstm_model.fit(train_data, 
                            validation_data=valid_data, 
                            epochs=6)

# evaluate on the test data:
test_results= bi_lstm_model.evaluate(test_data)
print('Test Acc.: {:.2f}%'.format(test_results[1]*100))

In [0]:
import matplotlib.pyplot as plt
def plot_train_valid(model_history):
    hist = model_history.history
    x_arr = np.arange(len(hist['loss'])) + 1

    fig = plt.figure(figsize=(12, 4))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(x_arr, hist['loss'], '-o', label='Train loss')
    ax.plot(x_arr, hist['val_loss'], '--<', label='Validation loss')
    ax.legend(fontsize=15)
    ax.set_xlabel('Epoch', size=15)
    ax.set_ylabel('Loss', size=15)

    ax = fig.add_subplot(1, 2, 2)
    ax.plot(x_arr, hist['accuracy'], '-o', label='Train acc.')
    ax.plot(x_arr, hist['val_accuracy'], '--<', label='Validation acc.')
    ax.legend(fontsize=15)
    ax.set_xlabel('Epoch', size=15)
    ax.set_ylabel('Accuracy', size=15)

    return plt.show()

In [0]:
plot_train_valid(history)