# Sentiment Analysis with Pretrained Word Vectors

### Loading Libraries

In [31]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warning
import warnings

# Path
from pathlib import Path

# Scikit-Learn
from sklearn.metrics import roc_auc_score

# TensorFlow
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [52]:
np.random.seed(42)

idx = pd.IndexSlice

sns.set_style('whitegrid')

In [54]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

if gpu_devices:
    print('Using GPU')
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
else:
    print('Using CPU')

Using GPU


In [77]:
results_path = Path('results', 'sentiment_imdb')

if not results_path.exists():
    results_path.mkdir(parents=True)

### Loading Reviews

In [89]:
path = Path('aclImdb')

In [115]:
files = path.rglob('*.txt')

data = []

In [117]:
for f in files:
    if f.stem.startswith(('urls_', 'imdbEr')):
        continue

    parts = f.parent.parts

    if len(parts) != 3:
        continue

    _, data_set, outcome = parts
    if outcome == 'unsup':
        continue

    review = f.read_text(encoding='latin1')
    data.append([data_set, int(outcome == 'pos'), review])

In [122]:
data = pd.DataFrame(data, columns=['dataset', 'label', 'review'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dataset  50000 non-null  object
 1   label    50000 non-null  int64 
 2   review   50000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [124]:
train_data = data.loc[data.dataset=='train', ['label', 'review']]

test_data = data.loc[data.dataset=='test', ['label', 'review']]

In [126]:
train_data.label.value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [128]:
test_data.label.value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

### Preparing Data

#### Tokenizer

In [131]:
num_words = 10000

t = Tokenizer(num_words=num_words, 
              lower=True, 
              oov_token=2)

t.fit_on_texts(train_data.review)

In [133]:
vocab_size = len(t.word_index) + 1

vocab_size

88586

In [135]:
train_data_encoded = t.texts_to_sequences(train_data.review)

test_data_encoded = t.texts_to_sequences(test_data.review)

In [137]:
max_length = 100

### Padding Sequences

In [144]:
# Trained Padded Sequences
X_train_padded = pad_sequences(train_data_encoded, 
                            maxlen=max_length, 
                            padding='post',
                           truncating='post')

y_train = train_data['label']

X_train_padded.shape

(25000, 100)

In [146]:
# Tested Padded Sequences
X_test_padded = pad_sequences(test_data_encoded, 
                            maxlen=max_length, 
                            padding='post',
                           truncating='post')

y_test = test_data['label']

X_test_padded.shape

(25000, 100)

### Loading Embeddings

In [153]:
glove_path = Path('glove', 'glove.6B.100d.txt')

embeddings_index = dict()

for line in glove_path.open(encoding='latin1'):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index[word] = coefs

In [155]:
print('Loaded {:,d} word vectors.'.format(len(embeddings_index)))

Loaded 399,883 word vectors.


In [157]:
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [159]:
embedding_matrix.shape

(88586, 100)

### Defining Model Architecture

In [171]:
embedding_size = 100

In [179]:
# Building Network
rnn = Sequential([
    Embedding(input_dim=vocab_size, 
              output_dim= embedding_size, 
              input_length=max_length,
              weights=[embedding_matrix], 
              trainable=False),
    GRU(units=32,  dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])



In [181]:
rnn.build(input_shape=(None, max_length))
rnn.summary()

In [185]:
# Compiling Network
rnn.compile(loss='binary_crossentropy',
            optimizer='RMSProp',
            metrics=['accuracy', 
                     tf.keras.metrics.AUC(name='AUC')])

In [187]:
rnn_path = (results_path / 'lstm.pretrained.h5').as_posix()

checkpointer = ModelCheckpoint(filepath=rnn_path,
                               verbose=1,
                               monitor='val_AUC',
                               mode='max',
                               save_best_only=True)

In [189]:
early_stopping = EarlyStopping(monitor='val_AUC',
                               patience=5,
                               mode='max',
                               restore_best_weights=True)

In [None]:
training = rnn.fit(X_train_padded,
                   y_train,
                   batch_size=32,
                   epochs=100,
                   validation_data=(X_test_padded,
                                    y_test),
                   callbacks=[early_stopping,
                              checkpointer],
                   verbose=1)

Epoch 1/100


2025-05-05 09:21:26.943740: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m519/782[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m31:38[0m 7s/step - AUC: 0.5508 - accuracy: 0.5383 - loss: 0.6943  

In [None]:
# Getting Prediction Score
y_score = rnn.predict(X_test_padded)

roc_auc_score(y_score=y_score.squeeze(), y_true=y_test)

In [None]:
df = pd.DataFrame(training.history)

best_auc = df.val_AUC.max()
best_acc = df.val_accuracy.max()

fig, axes = plt.subplots(ncols=2, figsize=(14,4))
df.index = df.index.to_series().add(1)
df[['AUC', 'val_AUC']].plot(ax=axes[0], 
                            title=f'AUC | Best: {best_auc:.4f}', 
                            legend=False, 
                            xlim=(1, 33),
                            ylim=(.7, .95))

axes[0].axvline(df.val_AUC.idxmax(), ls='--', lw=1, c='k')
df[['accuracy', 'val_accuracy']].plot(ax=axes[1], 
                                              title=f'Accuracy | Best: {best_acc:.2%}', 
                                              legend=False, 
                                              xlim=(1, 33),
                                      ylim=(.7, .9))
axes[1].axvline(df.val_accuracy.idxmax(), ls='--', lw=1, c='k')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('AUC')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
fig.suptitle('Sentiment Analysis - Pretrained Vectors', fontsize=14)
fig.legend(['Train', 'Validation'], loc='center right')

sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)
fig.savefig(results_path / 'imdb_pretrained', dpi=300);
plt.grid()
plt.show()