In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.model_selection import train_test_split
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# read preprocessed data
train = pd.read_csv("train_sampled_reviews.csv")
val = pd.read_csv("val_sampled_reviews.csv")
test = pd.read_csv("test_sampled_reviews.csv")

In [None]:
train_texts = train['Review'].to_numpy()
train_labels = train['Label'].to_numpy()
val_texts = val['Review'].to_numpy()
val_labels = val['Label'].to_numpy()
test_texts = test['Review'].to_numpy()
test_labels = test['Label'].to_numpy()

In [None]:
def process_tokens(text):
    """
    function to process tokens, replace any unwanted chars
    """
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

def preprocessing(data, tokenizer):
    preprocessed_data = []
    for sentence in data:
        sentence = process_tokens(sentence)
        tokens = tokenizer(sentence)
        tlist = []
        for token in tokens:
            tlist.append(str(token))
        preprocessed_data.append(tlist)
    return preprocessed_data

nlp = English()
tokenizer = Tokenizer(nlp.vocab)
train_data = preprocessing(train_texts, tokenizer)
val_data = preprocessing(val_texts, tokenizer)
test_data = preprocessing(test_texts, tokenizer)

## Creating a vectorizer to vectorize text and create matrix of features
## Bag of words technique
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        word_dict = {}
        for sentence in dataset:
            for token in sentence:
                if token not in word_dict:
                    word_dict[token] = 1
                else:
                    word_dict[token] += 1
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
        end_to_slice = min(len(word_dict), self.max_features)
        word_dict = dict(itertools.islice(word_dict.items(), end_to_slice))
        self.vocab_list = list(word_dict.keys())
        self.token_to_index = {}
        counter = 0
        for token in self.vocab_list:
            self.token_to_index[token] = counter
            counter += 1


    def transform(self, dataset):
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i, sentence in enumerate(dataset):
            for token in sentence:
                if token in self.token_to_index:
                    data_matrix[i, self.token_to_index[token]] += 1
        return data_matrix

## max features - top k words to consider only
max_features = 2000

vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)

## Checking if the len of vocab = k
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [None]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

y_train = to_categorical(y_train, 2)
y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (24000, 1, 2000), y_train.shape: (24000, 2)


In [None]:
from keras.models import Sequential
from keras.layers import GRU, Dense, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

model_gru = Sequential()
model_gru.add(GRU(64, input_shape=(1, max_features)))
model_gru.add(BatchNormalization())
model_gru.add(Dense(2, activation='softmax'))


optimizer = Adam(learning_rate=0.01)
model_gru.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

gru_checkpoint_path = '/content/drive/MyDrive/advanced_project/models/gru_best_model'
gru_checkpoint = ModelCheckpoint(
    filepath=gru_checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)


print(model_gru.summary())


history_gru = model_gru.fit(X_train, y_train,
                            batch_size=256,
                            validation_data=(X_val, y_val),
                            epochs=10,
                            callbacks=[gru_checkpoint])

print(history_gru.history.keys())


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_3 (GRU)                 (None, 64)                396672    
                                                                 
 batch_normalization_1 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dense_3 (Dense)             (None, 2)                 130       
                                                                 
Total params: 397058 (1.51 MB)
Trainable params: 396930 (1.51 MB)
Non-trainable params: 128 (512.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.89806, saving model to /content/drive/MyDrive/advanced_project/models/gru_best_model
Epoch 2/10
Epoch 2: val_accuracy improved from 0.89806 to 0.93389

Evaluation on Test Set

In [None]:
best_gru = load_model(gru_checkpoint_path)
score_gru, acc_gru = best_gru.evaluate(X_test, y_test, verbose=0)
print(f'Test loss for GRU model: {score_gru:.3f}')
print(f'Test accuracy for GRU model: {acc_gru:.3f}')
print()


Test loss for GRU model: 0.837
Test accuracy for GRU model: 0.848



Yelp Dataset

In [None]:
yelp_train = pd.read_csv('yelp_train.csv')
yelp_train.drop(yelp_train.columns[0], axis=1, inplace=True)
yelp_train.columns = ['Label', 'Review']
yelp_train.head()

yelp_val = pd.read_csv('yelp_val.csv')
yelp_val.drop(yelp_val.columns[0], axis=1, inplace=True)
yelp_val.columns = ['Label', 'Review']
yelp_val.head()

yelp_test = pd.read_csv('yelp_test.csv')
yelp_test.drop(yelp_test.columns[0], axis=1, inplace=True)
yelp_test.columns = ['Label', 'Review']
yelp_test.head()



Unnamed: 0,Label,Review
0,1,I visited Le Taj for their lunch buffet and re...
1,0,Have to say this location totally dropped the ...
2,0,I received a recommendation to eat here from t...
3,1,Tried the Triple Bypass Burger with bacon. It ...
4,0,Although the severs and staff are extremely fr...


In [None]:
train_labels, train_texts = yelp_train.values[:,0], yelp_train.values[:,1]
val_labels, val_texts = yelp_val.values[:,0], yelp_val.values[:,1]
test_labels, test_texts = yelp_test.values[:,0], yelp_test.values[:,1]

In [None]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

train_data_y = preprocessing(train_texts, tokenizer)
val_data_y = preprocessing(val_texts, tokenizer)
test_data_y = preprocessing(test_texts, tokenizer)

In [None]:
## max features - top k words to consider only
max_features = 2000

vectorizer_y = Vectorizer(max_features=max_features)
vectorizer_y.fit(train_data_y)

## Checking if the len of vocab = k
X_train_y = vectorizer_y.transform(train_data_y)
X_val_y = vectorizer_y.transform(val_data_y)
X_test_y = vectorizer_y.transform(test_data_y)

y_train_y = np.array(train_labels)
y_val_y = np.array(val_labels)
y_test_y = np.array(test_labels)

vocab_y = vectorizer_y.vocab_list

In [None]:
y_train_y = y_train_y.astype('int')
y_val_y = y_val_y.astype('int')
y_test_y = y_test_y.astype('int')

y_train_y = to_categorical(y_train_y, 2)
y_val_y = to_categorical(y_val_y, 2)
y_test_y = to_categorical(y_test_y, 2)

X_train_y = X_train_y.reshape(-1, 1, X_train_y.shape[1])
X_val_y = X_val_y.reshape(-1, 1, X_val_y.shape[1])
X_test_y = X_test_y.reshape(-1, 1, X_test_y.shape[1])

y_train_y = y_train_y.reshape(-1, 2)
y_val_y = y_val_y.reshape(-1, 2)
y_test_y = y_test_y.reshape(-1, 2)

print(f'X_train_y.shape: {X_train_y.shape}, y_train_y.shape: {y_train_y.shape}')

X_train_y.shape: (26600, 1, 2000), y_train_y.shape: (26600, 2)


In [None]:
best_gru = load_model(gru_checkpoint_path)

score_gru_y, acc_gru_y = best_gru.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for GRU model: {score_gru_y:.3f}')
print(f'Test accuracy for GRU model: {acc_gru_y:.3f}')
print()

Test loss for GRU model: 3.559
Test accuracy for GRU model: 0.524



Fine-turning

In [None]:
model_gru_finetuned = load_model(gru_checkpoint_path)

optimizer = Adam(learning_rate=0.001)

model_gru_finetuned.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

gru_ft_checkpoint_path = '/content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model'
gru_ft_checkpoint = ModelCheckpoint(
    filepath=gru_ft_checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

history_gru_ft = model_gru_finetuned.fit(
    X_train_y, y_train_y,
    batch_size=256,
    validation_data=(X_val_y, y_val_y),
    epochs=10,
    callbacks=[gru_ft_checkpoint]
)

# Print history keys
print(history_gru_ft.history.keys())

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.72785, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 2/10
Epoch 2: val_accuracy improved from 0.72785 to 0.81856, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 3/10
Epoch 3: val_accuracy improved from 0.81856 to 0.85155, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 4/10
Epoch 4: val_accuracy improved from 0.85155 to 0.86822, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 5/10
Epoch 5: val_accuracy improved from 0.86822 to 0.87542, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 6/10
Epoch 6: val_accuracy improved from 0.87542 to 0.87998, saving model to /content/drive/MyDrive/advanced_project/models/gru_finetuned_best_model
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.87998
Epoch 8/10


In [None]:
best_gru_ft = load_model(gru_ft_checkpoint_path)

score_gru_ft, acc_gru_ft = best_gru_ft.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for finetuned LSTM model: {score_gru_ft:.3f}')
print(f'Test accuracy for finetuned LSTM model: {acc_gru_ft:.3f}')
print()

Test loss for finetuned LSTM model: 0.346
Test accuracy for finetuned LSTM model: 0.882



In [None]:
model_gru_fe = load_model(gru_checkpoint_path)

model_gru_fe.layers[-1].trainable = False
initial_layer1_weights_values = model_gru_fe.layers[-1].get_weights()

optimizer = Adam(learning_rate=0.001)

model_gru_fe.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

gru_fe_checkpoint_path = '/content/drive/MyDrive/advanced_project/models/gru_fe_best_model'
gru_fe_checkpoint = ModelCheckpoint(
    filepath=gru_fe_checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

history_gru_fe = model_gru_fe.fit(
    X_train_y, y_train_y,
    batch_size=256,
    validation_data=(X_val_y, y_val_y),
    epochs=10,
    callbacks=[gru_fe_checkpoint]
)

# Print history keys
print(history_gru_fe.history.keys())

In [None]:
best_gru_fe = load_model(gru_fe_checkpoint_path)

score_gru_fe, acc_gru_fe = best_gru_fe.evaluate(X_test_y, y_test_y, verbose=0)
print(f'Test loss for feature extractor BiLSTM model: {score_gru_fe:.3f}')
print(f'Test accuracy for feature extractor BiLSTM model: {acc_gru_fe:.3f}')
print()

Test loss for feature extractor BiLSTM model: 0.466
Test accuracy for feature extractor BiLSTM model: 0.861

