In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.model_selection import train_test_split
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.layers import GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# read preprocessed data
train = pd.read_csv("./train_sampled_reviews.csv")
val = pd.read_csv("./val_sampled_reviews.csv")
test = pd.read_csv("./test_sampled_reviews.csv")

In [6]:
train_texts = train['Review'].to_numpy()
train_labels = train['Label'].to_numpy()
val_texts = val['Review'].to_numpy()
val_labels = val['Label'].to_numpy()
test_texts = test['Review'].to_numpy()
test_labels = test['Label'].to_numpy()

In [7]:
def process_tokens(text):
    """
    function to process tokens, replace any unwanted chars
    """
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

def preprocessing(data, tokenizer):
    preprocessed_data = []
    for sentence in data:
        sentence = process_tokens(sentence)
        tokens = tokenizer(sentence)
        tlist = []
        for token in tokens:
            tlist.append(str(token))
        preprocessed_data.append(tlist)
    return preprocessed_data

nlp = English()
tokenizer = Tokenizer(nlp.vocab)
train_data = preprocessing(train_texts, tokenizer)
val_data = preprocessing(val_texts, tokenizer)
test_data = preprocessing(test_texts, tokenizer)

## Creating a vectorizer to vectorize text and create matrix of features
## Bag of words technique
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        word_dict = {}
        for sentence in dataset:
            for token in sentence:
                if token not in word_dict:
                    word_dict[token] = 1
                else:
                    word_dict[token] += 1
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
        end_to_slice = min(len(word_dict), self.max_features)
        word_dict = dict(itertools.islice(word_dict.items(), end_to_slice))
        self.vocab_list = list(word_dict.keys())
        self.token_to_index = {}
        counter = 0
        for token in self.vocab_list:
            self.token_to_index[token] = counter
            counter += 1


    def transform(self, dataset):
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i, sentence in enumerate(dataset):
            for token in sentence:
                if token in self.token_to_index:
                    data_matrix[i, self.token_to_index[token]] += 1
        return data_matrix

## max features - top k words to consider only
max_features = 2000

vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)

## Checking if the len of vocab = k
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [8]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

y_train = to_categorical(y_train, 2)
y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (24000, 1, 2000), y_train.shape: (24000, 2)


In [9]:
model = None
model = Sequential()
model.add(Bidirectional(LSTM(64, input_shape=(1, max_features), dropout=0.4, return_sequences=True)))
model.add(GRU(64))
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])

checkpoint_path = '/content/drive/MyDrive/advanced_project/models/bilstm_gru_best_model'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,  # Only save the best model
    monitor='val_accuracy',  # Monitor validation accuracy
    mode='max',  # Save the model when validation accuracy improves
    verbose=1  # Print messages about the saving process
)

model.build(input_shape=(X_train.shape))
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=20,
          callbacks=[checkpoint])

print(history.history.keys())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (24000, 1, 128)           1057280   
 al)                                                             
                                                                 
 gru (GRU)                   (24000, 64)               37248     
                                                                 
 dense (Dense)               (24000, 2)                130       
                                                                 
Total params: 1094658 (4.18 MB)
Trainable params: 1094658 (4.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.88806, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_best_model
Epoch 2/20
Epoch 2: val_accuracy improved from 0.88806 to 0.89

In [10]:
best_model = load_model(checkpoint_path)
score, acc = best_model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss for BiLSTM-GRU model: {score:.3f}')
print(f'Test accuracy for BiLSTM-GRU model: {acc:.3f}')

Test loss for BiLSTM-GRU model: 0.414
Test accuracy for BiLSTM-GRU model: 0.857


##Testing on Yelp Review Dataset

In [11]:
yelp_train = pd.read_csv('yelp_train.csv')
yelp_val = pd.read_csv('yelp_val.csv')
yelp_test = pd.read_csv('yelp_test.csv')

yelp_train = yelp_train.drop(['Unnamed: 0'], axis=1)
yelp_val = yelp_val.drop(['Unnamed: 0'], axis=1)
yelp_test = yelp_test.drop(['Unnamed: 0'], axis=1)

yelp_train.head()

Unnamed: 0,Label,Review
0,1,i was a feeling a little abandoned in creepy t...
1,1,The food & atmosphere at Sushi Rock is awesome...
2,0,2.5 stars. Did I come on the wrong day? We ca...
3,1,"Food here is great, had the chicken schwarma w..."
4,1,"I was really craving some Asian noodles, so af..."


In [12]:
train_labels_y, train_texts_y = yelp_train.values[:,0], yelp_train.values[:,1]
val_labels_y, val_texts_y = yelp_val.values[:,0], yelp_val.values[:,1]
test_labels_y, test_texts_y = yelp_test.values[:,0], yelp_test.values[:,1]

In [13]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

train_data_y = preprocessing(train_texts_y, tokenizer)
val_data_y = preprocessing(val_texts_y, tokenizer)
test_data_y = preprocessing(test_texts_y, tokenizer)

In [14]:
## max features - top k words to consider only
max_features = 2000

vectorizer_y = Vectorizer(max_features=max_features)
vectorizer_y.fit(train_data_y)

## Checking if the len of vocab = k
X_train_y = vectorizer_y.transform(train_data_y)
X_val_y = vectorizer_y.transform(val_data_y)
X_test_y = vectorizer_y.transform(test_data_y)

y_train_y = np.array(train_labels_y)
y_val_y = np.array(val_labels_y)
y_test_y = np.array(test_labels_y)

vocab_y = vectorizer_y.vocab_list

In [15]:
y_train_y = y_train_y.astype('int')
y_val_y = y_val_y.astype('int')
y_test_y = y_test_y.astype('int')

y_train_y = to_categorical(y_train_y, 2)
y_val_y = to_categorical(y_val_y, 2)
y_test_y = to_categorical(y_test_y, 2)

X_train_y = X_train_y.reshape(-1, 1, X_train_y.shape[1])
X_val_y = X_val_y.reshape(-1, 1, X_val_y.shape[1])
X_test_y = X_test_y.reshape(-1, 1, X_test_y.shape[1])

y_train_y = y_train_y.reshape(-1, 2)
y_val_y = y_val_y.reshape(-1, 2)
y_test_y = y_test_y.reshape(-1, 2)

print(f'X_train_y.shape: {X_train_y.shape}, y_train_y.shape: {y_train_y.shape}')

X_train_y.shape: (26600, 1, 2000), y_train_y.shape: (26600, 2)


In [16]:
score_yelp, acc_yelp = best_model.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for BiLSTM-GRU model on Yelp dataset: {score_yelp:.3f}')
print(f'Test accuracy for BiLSTM-GRU model on Yelp dataset: {acc_yelp:.3f}')

Test loss for BiLSTM-GRU model on Yelp dataset: 1.569
Test accuracy for BiLSTM-GRU model on Yelp dataset: 0.544


In [17]:
# finetuning model
model_ft = load_model(checkpoint_path)

optimizer = Adam(learning_rate=0.001)

model_ft.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

ft_checkpoint_path = '/content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model'
ft_checkpoint = ModelCheckpoint(
    filepath=ft_checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

history_ft = model_ft.fit(
    X_train_y, y_train_y,
    batch_size=256,
    validation_data=(X_val_y, y_val_y),
    epochs=20,
    callbacks=[ft_checkpoint]
)

# Print history keys
print(history_ft.history.keys())

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.61449, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 2/20
Epoch 2: val_accuracy improved from 0.61449 to 0.64994, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 3/20
Epoch 3: val_accuracy improved from 0.64994 to 0.69240, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 4/20
Epoch 4: val_accuracy improved from 0.69240 to 0.73522, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 5/20
Epoch 5: val_accuracy improved from 0.73522 to 0.77680, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 6/20
Epoch 6: val_accuracy improved from 0.77680 to 0.81155, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_finetuned_best_model
Epoch 7/20
Epoch 7: val_accurac

In [18]:
best_model_ft = load_model(ft_checkpoint_path)
score_ft, acc_ft = best_model_ft.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for fine-tuned BiLSTM-GRU model on Yelp dataset: {score_ft:.3f}')
print(f'Test accuracy for fine-tuned BiLSTM-GRU model on Yelp dataset: {acc_ft:.3f}')

Test loss for fine-tuned BiLSTM-GRU model on Yelp dataset: 0.244
Test accuracy for fine-tuned BiLSTM-GRU model on Yelp dataset: 0.898


In [19]:
# Feature extractor approach
model_fe = load_model(checkpoint_path)

for layer in model_fe.layers[:-1]:
  layer.trainable = False

optimizer = Adam(learning_rate=0.001)

model_fe.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

fe_checkpoint_path = '/content/drive/MyDrive/advanced_project/models/bilstm_gru_fe_best_model'
fe_checkpoint = ModelCheckpoint(
    filepath=fe_checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

print(model_fe.summary())
history_fe = model_fe.fit(
    X_train_y, y_train_y,
    batch_size=256,
    validation_data=(X_val_y, y_val_y),
    epochs=20,
    callbacks=[fe_checkpoint]
)

# Print history keys
print(history_fe.history.keys())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 1, 128)            1057280   
 al)                                                             
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 1094658 (4.18 MB)
Trainable params: 130 (520.00 Byte)
Non-trainable params: 1094528 (4.18 MB)
_________________________________________________________________
None
Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.52290, saving model to /content/drive/MyDrive/advanced_project/models/bilstm_gru_fe_best_model
Epoch 2/20
Epoch 2: val_accuracy did not improve from 0

In [20]:
best_model_fe = load_model(fe_checkpoint_path)
score_fe, acc_fe = best_model_fe.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for feature extractor BiLSTM-GRU model on Yelp dataset: {score_fe:.3f}')
print(f'Test accuracy for feature extractor BiLSTM-GRU model on Yelp dataset: {acc_fe:.3f}')

Test loss for feature extractor BiLSTM-GRU model on Yelp dataset: 0.677
Test accuracy for feature extractor BiLSTM-GRU model on Yelp dataset: 0.577
