In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import re
from tqdm import tqdm
from tabulate import tabulate
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD

In [2]:
# Load pre-trained GloVe embeddings
glove_embeddings = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

In [3]:
# Load pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Define a function to generate embeddings for a sentence
def generate_embedding(sentence, embedding_type):
    if embedding_type == 'glove':
        # Generate embedding using GloVe
        words = sentence.split()
        embedding = np.zeros(300)
        for word in words:
            if word in glove_embeddings:
                embedding += glove_embeddings[word]
        return embedding / len(words)
    elif embedding_type == 'bert':
        # Generate embedding using BERT
        tokens = tokenizer.encode(sentence, add_special_tokens=True, truncation=True)
        tokens_tensor = torch.tensor([tokens])
        with torch.no_grad():
            last_hidden_states = bert_model(tokens_tensor)[0]
        embedding = last_hidden_states.mean(dim=1).squeeze().numpy()
        return embedding
    else:
        raise ValueError('Invalid embedding type')

In [5]:
# Load dataset for sentence classification
df = pd.read_csv('imdb_reviews.csv')

In [6]:
# Data preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess_text(text):
    text = re.sub('<[^>]*>', '', text)  # remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # convert to lowercase
    text = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]  # lemmatization
    return ' '.join(text)

In [8]:
df["review"] = df["review"].apply(preprocess_text)

In [9]:
le = LabelEncoder()
df["sentiment"] = le.fit_transform(df["sentiment"])

### Get Embeddings

In [10]:
# Generate GloVe embeddings for each sentence
glove_embeddings = df["review"].apply(lambda x: generate_embedding(x, 'glove'))
glove_embeddings = np.vstack(glove_embeddings)

In [11]:
# Generate BERT embeddings for each sentence
bert_embeddings = []
for sentence in tqdm(df["review"]):
    bert_embeddings.append(generate_embedding(sentence, 'bert'))
bert_embeddings = np.vstack(bert_embeddings)

100%|██████████| 50000/50000 [1:27:45<00:00,  9.50it/s]


In [12]:
# Split dataset into training and testing sets
train_size = int(0.8 * len(df))
train_data_glove, train_data_bert, train_labels = glove_embeddings[:train_size], bert_embeddings[:train_size], df["sentiment"][:train_size]
test_data_glove, test_data_bert, test_labels = glove_embeddings[train_size:], bert_embeddings[train_size:], df["sentiment"][train_size:]

### Metrics

In [13]:
def compute_eer(ytrue, ypred):
    """ Returns equal error rate (EER) and the corresponding threshold. """
    fpr, tpr, thr = roc_curve(ytrue, ypred)
    fnr = 1-tpr
    abs_diffs = np.abs(fpr - fnr)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((fpr[min_index], fnr[min_index]))
    return eer, thr[min_index]

In [14]:
metric_list = ["accuracy", "f1-score", "AUC of ROC", "EER"]
metrics = dict()
metrics["glove"] = dict()
metrics["bert"] = dict()

for metric in metric_list:
    metrics["glove"][metric] = []
    metrics["bert"][metric] = []

## GloVe Embeddings Evaluation

### Neural Network

In [15]:
# Define the model
input_layer = Input(shape = (train_data_glove.shape[1],))
hidden_layer = Dense(64, activation = 'relu')(input_layer)
dropout_layer = Dropout(0.5)(hidden_layer)
output_layer = Dense(1, activation = 'sigmoid')(dropout_layer)
model_glove = Model(inputs = input_layer, outputs = output_layer)

In [16]:
# Compile the model
adam = Adam(learning_rate = 0.001)
model_glove.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

In [17]:
# Train the model on GloVe embeddings
history_glove = model_glove.fit(train_data_glove, train_labels, batch_size=128, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
# Evaluate the model on test set
y_pred_glove = model_glove.predict(test_data_glove)
y_pred_glove = np.round(y_pred_glove).astype(int)



In [19]:
metrics["glove"]["accuracy"].append(round(accuracy_score(test_labels, y_pred_glove), 3))
metrics["glove"]["f1-score"].append(round(f1_score(test_labels, y_pred_glove), 3))
metrics["glove"]["AUC of ROC"].append(round(roc_auc_score(test_labels, y_pred_glove), 3))
metrics["glove"]["EER"].append(round(compute_eer(test_labels, y_pred_glove)[0], 3))

### Logistic Regression

In [20]:
# Train logistic regression model on GloVe features
glove_model = LogisticRegression(max_iter=1000)
glove_model.fit(train_data_glove, train_labels)
glove_preds = glove_model.predict(test_data_glove)

In [21]:
metrics["glove"]["accuracy"].append(round(accuracy_score(test_labels, glove_preds), 3))
metrics["glove"]["f1-score"].append(round(f1_score(test_labels, glove_preds), 3))
metrics["glove"]["AUC of ROC"].append(round(roc_auc_score(test_labels, glove_preds), 3))
metrics["glove"]["EER"].append(round(compute_eer(test_labels, glove_preds)[0], 3))

## BERT Embeddings Evaluation

### Neural Network

In [23]:
# Define the model
input_layer = Input(shape = (train_data_bert.shape[1],))
hidden_layer = Dense(64, activation = 'relu')(input_layer)
dropout_layer = Dropout(0.5)(hidden_layer)
output_layer = Dense(1, activation = 'sigmoid')(dropout_layer)
model_bert = Model(inputs = input_layer, outputs = output_layer)

In [24]:
# Compile the model
adam = Adam(learning_rate = 0.001)
model_bert.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

In [25]:
# Train the model on GloVe embeddings
history_glove = model_bert.fit(train_data_bert, train_labels, batch_size=128, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [26]:
# Evaluate the model on test set
y_pred_bert = model_bert.predict(test_data_bert)
y_pred_bert = np.round(y_pred_bert).astype(int)



In [27]:
metrics["bert"]["accuracy"].append(round(accuracy_score(test_labels, y_pred_bert), 3))
metrics["bert"]["f1-score"].append(round(f1_score(test_labels, y_pred_bert), 3))
metrics["bert"]["AUC of ROC"].append(round(roc_auc_score(test_labels, y_pred_bert), 3))
metrics["bert"]["EER"].append(round(compute_eer(test_labels, y_pred_bert)[0], 3))

### Logistic Regression

In [29]:
# Train logistic regression model on BERT features
bert_model = LogisticRegression(max_iter=1000)
bert_model.fit(train_data_bert, train_labels)
bert_preds = bert_model.predict(test_data_bert)

In [30]:
metrics["bert"]["accuracy"].append(round(accuracy_score(test_labels, bert_preds), 3))
metrics["bert"]["f1-score"].append(round(f1_score(test_labels, bert_preds), 3))
metrics["bert"]["AUC of ROC"].append(round(roc_auc_score(test_labels, bert_preds), 3))
metrics["bert"]["EER"].append(round(compute_eer(test_labels, bert_preds)[0], 3))

In [44]:
table = []

for embed in ["glove", "bert"]:
    for model in ["NN", "LR"]:
        item = [embed, model]
        if model == "NN":
            i = 0
        else:
            i = 1
        for metric in metric_list:
            m = metrics[embed][metric][i]
            item.extend([m])
        table.append(item)

print(tabulate(table, headers=["Embedding", "Model", "Accuracy", "F1 - Score", "AUC of ROC", "EER"], tablefmt="fancy_grid"))

╒═════════════╤═════════╤════════════╤══════════════╤══════════════╤═══════╕
│ Embedding   │ Model   │   Accuracy │   F1 - Score │   AUC of ROC │   EER │
╞═════════════╪═════════╪════════════╪══════════════╪══════════════╪═══════╡
│ glove       │ NN      │      0.837 │        0.838 │        0.837 │ 0.163 │
├─────────────┼─────────┼────────────┼──────────────┼──────────────┼───────┤
│ glove       │ LR      │      0.832 │        0.832 │        0.832 │ 0.168 │
├─────────────┼─────────┼────────────┼──────────────┼──────────────┼───────┤
│ bert        │ NN      │      0.848 │        0.848 │        0.848 │ 0.152 │
├─────────────┼─────────┼────────────┼──────────────┼──────────────┼───────┤
│ bert        │ LR      │      0.851 │        0.852 │        0.851 │ 0.149 │
╘═════════════╧═════════╧════════════╧══════════════╧══════════════╧═══════╛


In [45]:
print("Glove Embedding:")
print("Neural Network Metrics:")
print("Accuracy:", metrics["glove"]["accuracy"][0])
print("F1 - Score:", metrics["glove"]["f1-score"][0])
print("AUC of ROC:", metrics["glove"]["AUC of ROC"][0])
print("EER:", metrics["glove"]["EER"][0])
print("Logistic Regression Metrics:")
print("Accuracy:", metrics["glove"]["accuracy"][1])
print("F1 - Score:", metrics["glove"]["f1-score"][1])
print("AUC of ROC:", metrics["glove"]["AUC of ROC"][1])
print("EER:", metrics["glove"]["EER"][1])

print("BERT Embedding:")
print("Neural Network Metrics:")
print("Accuracy:", metrics["bert"]["accuracy"][0])
print("F1 - Score:", metrics["bert"]["f1-score"][0])
print("AUC of ROC:", metrics["bert"]["AUC of ROC"][0])
print("EER:", metrics["bert"]["EER"][0])
print("Logistic Regression Metrics:")
print("Accuracy:", metrics["bert"]["accuracy"][1])
print("F1 - Score:", metrics["bert"]["f1-score"][1])
print("AUC of ROC:", metrics["bert"]["AUC of ROC"][1])
print("EER:", metrics["bert"]["EER"][1])

Glove Embedding:
Neural Network Metrics:
Accuracy: 0.837
F1 - Score: 0.838
AUC of ROC: 0.837
EER: 0.163
Logistic Regression Metrics:
Accuracy: 0.832
F1 - Score: 0.832
AUC of ROC: 0.832
EER: 0.168
BERT Embedding:
Neural Network Metrics:
Accuracy: 0.848
F1 - Score: 0.848
AUC of ROC: 0.848
EER: 0.152
Logistic Regression Metrics:
Accuracy: 0.851
F1 - Score: 0.852
AUC of ROC: 0.851
EER: 0.149
