# Sentiment Analysis - NLU

## Version: **GRU - Gated Recurrent Unit**
Student: Francesco Laiti

---

This notebook contains the source code to build, train and evaluate a GRU-based sentiment analysis model using the PyTorch library.

## Prerequisites

Import the libraries needed to run the notebook correctly and download the NLTK datasets it uses.

In [None]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import nltk
from sklearn.metrics import f1_score, accuracy_score
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

import numpy 
from tqdm import tqdm
import yaml
import os
from collections import Counter
import random

import wandb
# Authenticate with Weights & Biases before any run is initialised.
wandb.login()

# Fetch the NLTK resources used below: the two corpora and the stop-word list.
nltk.download('movie_reviews')
nltk.download('subjectivity')
nltk.download('stopwords')

Declare the global constants used in this notebook.

In [3]:
# dataset
BATCH_SIZE_SUBJ = 4096 # DataLoader batch size for the subjectivity corpus
BATCH_SIZE_POL = 512 # DataLoader batch size for the movie-review (polarity) corpus
PAD_TOKEN = 0 # id used both for the 'pad' vocabulary entry and as padding value in collate_fn
REMOVE_STOPWORDS = False # when True, English stop words are dropped before tokenisation
N_SPLIT = 5 # number of folds of the Stratified K-Fold
K_FOLD = 0 # index of the fold to use; random.randint(0,4) picks a random fold

# pre-trained word embedding
URL_GLOVE_TOKENS = '6B' # GloVe variant passed as `name`; alternatives: 42B, 840B, twitter.27B

# GRU configuration
OUT_SIZE = 2 # number of output classes
HID_SIZE = 128 # hidden size per direction (the model is bidirectional)
EMB_SIZE = 300 # embedding dimension, also the GloVe dimension requested
N_LAYER = 2 # number of stacked GRU layers
DROPOUT = 0.5 # dropout probability

# training
EPOCHS = 30
OPTIMIZER = 'Adam' # only logged in the run config; get_optimizer always builds Adam
LR = 0.001 # learning rate of every parameter except the embedding group
LR_EMBEDDING = 0.01 # learning rate of the embedding layer; 0. freezes pre-trained embeddings
PATIENCE = 7 # budget of non-improving epochs before early stopping
N_AFTER_COMMA = 3 # NOTE(review): not referenced in the visible code

# env
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# wandb settings
PROJECT_NAME = "sentiment-analysis-nlu"
ENTITY = "laitifrancesco"
MODE_WANDB = "disabled" # "online"

# paths
WEIGHTS_PATH = 'weights/rnn/' # must end with '/': it is concatenated with the dataset name
HEATMAP_PATH = 'heatmap/'

# for reproducibility
RANDOM_STATE = 42 # seed of the Stratified K-Fold shuffling
g = torch.Generator().manual_seed(0) # generator shared by all DataLoaders

### Utility functions

In [4]:
def get_complete_saved_path(weights_path, config):
    """Return the checkpoint path of a run, creating its directory if needed.

    The directory is ``weights_path`` directly followed by ``config.dataset``
    (plain string concatenation, so ``weights_path`` should end with a path
    separator). The file name is the lower-cased ``config.model`` plus ``.pt``.
    """
    target_dir = weights_path + config.dataset
    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
    file_name = config.model.lower() + '.pt'
    return target_dir + '/' + file_name

## Dataset & Dataloader

The datasets used in this notebook are ``subjectivity`` and ``movie_reviews``, both imported from the NLTK library.

We create a stratified k-fold validator. To use a different fold (there are 5 in this case), simply change the ``K_FOLD`` parameter.

In [5]:
from sklearn.model_selection import StratifiedKFold

def make_stratified_k_fold(corpus, labels):
    """Return every (train_indices, test_indices) pair of a stratified split.

    The split uses ``N_SPLIT`` folds, shuffled with the fixed ``RANDOM_STATE``
    seed, and is materialised as a list so that a fold can be picked by index.
    """
    splitter = StratifiedKFold(n_splits=N_SPLIT, shuffle=True, random_state=RANDOM_STATE)
    return [indices for indices in splitter.split(corpus, labels)]

def pick_k_fold(data, labels, fold, k_fold = 0):
    """Split ``data`` and ``labels`` according to one fold.

    Args:
        data: sequence of samples (sentences or documents).
        labels: sequence of labels aligned with ``data``.
        fold: list of (train_indices, test_indices) pairs, as returned by
            ``make_stratified_k_fold``.
        k_fold: index of the pair to use.

    Returns:
        ``X_train, X_test, y_train, y_test`` as numpy arrays.
    """
    train_idx, test_idx = fold[k_fold][0], fold[k_fold][1]

    # `numpy.object0` was a deprecated alias that NumPy 2.0 removed; the
    # builtin `object` is the portable spelling of the same dtype.
    dataset = numpy.array(data, dtype=object)
    labels = numpy.array(labels)

    X_train = dataset[train_idx]
    y_train = labels[train_idx]

    X_test = dataset[test_idx]
    y_test = labels[test_idx]

    return X_train, X_test, y_train, y_test

In [6]:
from torchtext.vocab import GloVe

class Tokenizer():
    """Map a tokenised corpus to integer ids and optionally build GloVe weights.

    Attributes set by the constructor:
        name_dataset: 'subjectivity' (corpus is a list of sentences) or
            'polarity' (corpus is a list of documents, each a list of sentences).
        corpus2id: one flat list of ids per sample.
        word2id: the word -> id mapping used (built here when none is given).
        pretrained_word2id: float tensor with one GloVe row per vocabulary id,
            or None when ``build_pretrained`` is False.

    The special entries are stored under the plain strings 'pad' and 'unk', so
    a corpus token spelled 'pad' or 'unk' shares the id of the special entry.
    """
    def __init__(self, corpus, word2id = None, name_dataset = '', build_pretrained = False):
        self.name_dataset = name_dataset
        self.corpus2id, self.word2id = self._tokenizer(corpus, word2id)
        self.pretrained_word2id = None
        if build_pretrained:
            self.pretrained_word2id = self._build_vocab()

    # Adapted code from the lab
    def _mapping_list_of_word(self, data, mapper):
        """Map each sentence (list of words) to its list of ids; unknown words use 'unk'."""
        return [
            [mapper[word] if word in mapper else mapper['unk'] for word in seq]
            for seq in data
        ]

    def _mapping_list_of_list(self, data, mapper):
        """Map each document (list of sentences) to ONE flat list of ids."""
        return [
            [mapper[word] if word in mapper else mapper['unk'] for sent in doc for word in sent]
            for doc in data
        ]

    # Adapted code from the lab
    def _w2id(self, elements, unk=True):
        """Build the vocabulary: 'pad', optionally 'unk', then words by first occurrence."""
        vocab = {'pad': PAD_TOKEN}
        if unk:
            vocab['unk'] = len(vocab)
        for word in elements:
            # setdefault never overwrites an existing entry. Plain assignment
            # re-numbered 'pad'/'unk' when they occurred as corpus tokens,
            # which made the next new word receive the very same id.
            vocab.setdefault(word, len(vocab))
        return vocab

    def _tokenizer(self, data, word2id):
        """Return ``(corpus2id, word2id)`` for the configured dataset layout.

        Raises:
            NameError: if ``name_dataset`` is neither 'subjectivity' nor 'polarity'.
        """
        if self.name_dataset == 'subjectivity':
            if word2id is None: word2id = self._w2id([w for sent in data for w in sent])
            corpus2id = self._mapping_list_of_word(data, word2id)
        elif self.name_dataset == 'polarity':
            if word2id is None: word2id = self._w2id([w for doc in data for sent in doc for w in sent])
            corpus2id = self._mapping_list_of_list(data, word2id)
        else:
            raise NameError(f'Name of the dataset {self.name_dataset} not valid. Please, choose between subjectivity or polarity.')

        return corpus2id, word2id

    # Build matrix for pre-trained word embedding
    # Reference: https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
    def _build_vocab(self, emb_dim=EMB_SIZE):
        """Return a ``(len(word2id), emb_dim)`` float tensor of GloVe vectors.

        Each row is stored at the word's id, because the embedding layer is
        indexed by id. This assumes the ids cover ``0 .. len(word2id) - 1``,
        which holds for vocabularies built by ``_w2id`` with ``PAD_TOKEN == 0``.
        """
        global_vectors = GloVe(name=URL_GLOVE_TOKENS, dim=emb_dim)
        weights_matrix = numpy.zeros((len(self.word2id), emb_dim))

        for word, idx in self.word2id.items():
            weights_matrix[idx] = global_vectors.get_vecs_by_tokens(word)

        return torch.from_numpy(weights_matrix).float()

In [7]:
class SA_Dataset(torch.utils.data.Dataset):
    """Pair token-id sequences with their labels for use in a DataLoader."""

    def __init__(self, tokens, labels):
        self.tokens, self.labels = tokens, labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Both fields are converted to tensors on access.
        return {
            'input_ids': torch.tensor(self.tokens[idx]),
            'labels': torch.tensor(self.labels[idx]),
        }

In [8]:
# Reference: https://www.codefull.org/2018/11/use-pytorchs-dataloader-with-variable-length-sequences-for-lstm-gru/
def collate_fn(batch):
    """Collate dataset items into ``(padded_sequences, lengths, labels)``.

    Items are dicts with the keys 'input_ids' and 'labels'. They are re-ordered
    by decreasing sequence length, so all three outputs follow the SORTED order
    rather than the order in which the items were received.
    """
    by_length = sorted(batch, key=lambda item: item['input_ids'].shape[0], reverse=True)

    sequences = [item['input_ids'] for item in by_length]
    # True lengths are kept so that the padding can be undone later.
    lengths = torch.LongTensor([len(seq) for seq in sequences])
    sequences_padded = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PAD_TOKEN
    )

    # Labels are read from the sorted items so that they stay aligned.
    labels = torch.LongTensor([item['labels'] for item in by_length])

    return sequences_padded, lengths, labels

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def prepare_polarity_data(k_fold = 0):
    """Build the train/test DataLoaders for the unfiltered movie reviews.

    Args:
        k_fold: index of the stratified fold used as test split.

    Returns:
        ``(train_dataloader, test_dataloader, X_train)`` where ``X_train`` is
        the training ``Tokenizer`` (vocabulary and pre-trained weight matrix).
    """
    neg = movie_reviews.paras(categories='neg')
    pos = movie_reviews.paras(categories='pos')

    corpus = neg + pos
    labels = numpy.array([0] * len(neg) + [1] * len(pos)) # "0": "NEGATIVE", "1": "POSITIVE"

    if REMOVE_STOPWORDS:
        corpus = [
            [[word for word in sent if word not in stop_words] for sent in doc]
            for doc in corpus
        ]

    fold_pols = make_stratified_k_fold(corpus, labels)

    X_train, X_test, y_train, y_test = pick_k_fold(corpus, labels, fold_pols, k_fold)
    X_train = Tokenizer(X_train, build_pretrained=True, name_dataset='polarity')
    # The test split reuses the training vocabulary; unseen words map to 'unk'.
    X_test = Tokenizer(X_test, X_train.word2id, name_dataset= 'polarity')

    train_data = SA_Dataset(X_train.corpus2id, y_train)
    test_data = SA_Dataset(X_test.corpus2id, y_test)

    train_dataloader = torch.utils.data.DataLoader(train_data, BATCH_SIZE_POL, collate_fn=collate_fn, shuffle=True, generator=g)
    # The evaluation loader is not shuffled, as in prepare_polarity_filtered_data:
    # shuffling it draws from the shared generator on every evaluation pass and
    # makes the per-batch averaged metrics depend on the batch composition.
    test_dataloader = torch.utils.data.DataLoader(test_data, BATCH_SIZE_POL, collate_fn=collate_fn, shuffle=False, generator=g)

    print(f'Polarity [K_FOLD = {k_fold}] data loaded. Remove stop word: {REMOVE_STOPWORDS}')
    return train_dataloader, test_dataloader, X_train

def prepare_subjectivity_data(k_fold = 0):
    """Build the train/test DataLoaders for the subjectivity corpus.

    Args:
        k_fold: index of the stratified fold used as test split.

    Returns:
        ``(train_dataloader, test_dataloader, X_train)`` where ``X_train`` is
        the training ``Tokenizer`` (vocabulary and pre-trained weight matrix).
    """
    subj = subjectivity.sents(categories='subj')
    obj = subjectivity.sents(categories='obj')

    corpus = subj + obj
    labels = numpy.array([0] * len(subj) + [1] * len(obj))  # "0": "SUBJECTIVE", "1": "NEUTRAL"

    if REMOVE_STOPWORDS:
        corpus = [[word for word in sent if word not in stop_words] for sent in corpus]

    fold_subj = make_stratified_k_fold(corpus, labels)

    X_train, X_test, y_train, y_test = pick_k_fold(corpus, labels, fold_subj, k_fold)
    X_train = Tokenizer(X_train, build_pretrained=True, name_dataset='subjectivity')
    # The test split reuses the training vocabulary; unseen words map to 'unk'.
    X_test = Tokenizer(X_test, X_train.word2id, name_dataset= 'subjectivity')

    train_data = SA_Dataset(X_train.corpus2id, y_train)
    test_data = SA_Dataset(X_test.corpus2id, y_test)

    train_dataloader = torch.utils.data.DataLoader(train_data, BATCH_SIZE_SUBJ, collate_fn=collate_fn, shuffle=True, generator=g)
    # The evaluation loader is not shuffled, as in prepare_polarity_filtered_data:
    # shuffling it draws from the shared generator on every evaluation pass and
    # makes the per-batch averaged metrics depend on the batch composition.
    test_dataloader = torch.utils.data.DataLoader(test_data, BATCH_SIZE_SUBJ, collate_fn=collate_fn, shuffle=False, generator=g)

    print(f'Subjectivity [K_FOLD = {k_fold}] data loaded. Remove stop word: {REMOVE_STOPWORDS}')
    return train_dataloader, test_dataloader, X_train

## Train & Evaluation

In [101]:
class BiGRU(nn.Module):
    """Bidirectional GRU classifier with optional additive attention pooling.

    Args:
        out_size: number of output classes.
        hid_size: GRU hidden size per direction.
        emb_size: embedding dimension.
        no_layers: number of stacked GRU layers.
        vocab_len: number of rows of the embedding table.
        dropout_prob: dropout probability.
        attention: when True, timesteps are pooled with learned attention
            weights; otherwise the GRU outputs are simply summed.
        weights_mx_embed: pre-trained embedding matrix; required when
            ``use_pretrained_embed`` is True.
        use_pretrained_embed: replace the embedding weights with
            ``weights_mx_embed`` (frozen when ``LR_EMBEDDING`` is 0).
        pad_index: id of the padding token.

    Raises:
        ValueError: if pre-trained embeddings are requested without a matrix.
    """
    def __init__(self, out_size, hid_size, emb_size, no_layers, vocab_len, dropout_prob, attention = True, weights_mx_embed = None, use_pretrained_embed = False, pad_index = PAD_TOKEN):
        super().__init__()

        # embedding
        self.embedding = nn.Embedding(vocab_len, emb_size, padding_idx=pad_index)
        if use_pretrained_embed:
            if weights_mx_embed is None:
                # ValueError still derives from Exception, so existing handlers keep working.
                raise ValueError('Weights matrix for the embedding layer is not valid.')
            self.embedding.weight = nn.Parameter(weights_mx_embed)
            if LR_EMBEDDING == 0.:
                self.embedding.weight.requires_grad = False

        # GRU
        self.gru = nn.GRU(input_size=emb_size,hidden_size=hid_size,num_layers=no_layers, batch_first=True, bidirectional=True)

        # attention: one score per timestep, computed from both directions
        self.attention = nn.Linear(hid_size*2, 1) if attention else None

        # dropout layer
        self.dropout = nn.Dropout(dropout_prob)

        # linear layer
        self.fc = nn.Linear(hid_size*2, out_size) # x2 because we are dealing with a bidirectional model

    def _encode(self, x, x_lengths):
        """Embed ``x`` and run the GRU; return ``(padded_outputs, lengths)``."""
        embeds = self.embedding(x) # shape: Batch x Sequence x Feature since batch_first = True
        # Sequences are packed with their true lengths so padding is not processed.
        packed_input = pack_padded_sequence(embeds, x_lengths.cpu().numpy(), batch_first=True)
        packed_out, _ = self.gru(packed_input)
        return pad_packed_sequence(packed_out, batch_first=True)

    def forward(self, x, x_lengths):
        """Return the class logits for a batch sorted by decreasing length.

        Args:
            x: padded token ids, Batch x Sequence.
            x_lengths: true length of every sequence in ``x``.
        """
        states, lengths = self._encode(x, x_lengths)

        if self.attention is not None:
            scores = self.attention(self.dropout(states)) # Batch x Sequence x 1
            # Padded timesteps are excluded from the softmax. Otherwise they
            # take a share of the probability mass and the weights of the real
            # tokens depend on how much padding the batch happens to add.
            positions = torch.arange(states.size(1), device=states.device)
            pad_mask = positions.unsqueeze(0) >= lengths.to(states.device).unsqueeze(1)
            scores = scores.masked_fill(pad_mask.unsqueeze(-1), float('-inf'))
            states = states * torch.softmax(scores, dim=1)

        # dropout and fully connected layer
        context_vector = states.sum(dim=1)
        out = self.dropout(context_vector)
        return self.fc(out)

    def _attention_heatmap(self, x, x_lengths):
        """Return the raw (pre-softmax) attention scores, Batch x Sequence x 1.

        Raises:
            RuntimeError: if the model was built with ``attention=False``.
        """
        if self.attention is None:
            raise RuntimeError('This model was built without an attention layer.')
        states, _ = self._encode(x, x_lengths)
        return self.attention(self.dropout(states))

In [11]:
# Adapted code from the lab
def init_weights(mat):
    """Initialise the recurrent and linear layers found in ``mat``.

    Recurrent layers: every gate block of ``weight_ih`` gets Xavier-uniform
    values, every gate block of ``weight_hh`` gets an orthogonal matrix, and
    biases are set to 0. Linear layers: weights uniform in [-0.01, 0.01] and
    biases 0.01. Other modules (e.g. embeddings) are left untouched.
    """
    # PyTorch stacks the gate matrices along dim 0: 4 blocks for LSTM, 3 for
    # GRU and 1 for a plain RNN. Assuming 4 for every type makes the per-gate
    # initialisation straddle the real gate boundaries of a GRU.
    gate_count = {nn.LSTM: 4, nn.GRU: 3, nn.RNN: 1}

    for m in mat.modules():
        n_gates = gate_count.get(type(m))
        if n_gates is not None:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    rows = param.shape[0] // n_gates
                    for idx in range(n_gates):
                        torch.nn.init.xavier_uniform_(param[idx*rows:(idx+1)*rows])
                elif 'weight_hh' in name:
                    rows = param.shape[0] // n_gates
                    for idx in range(n_gates):
                        torch.nn.init.orthogonal_(param[idx*rows:(idx+1)*rows])
                elif 'bias' in name:
                    param.data.fill_(0)
        elif type(m) is nn.Linear:
            torch.nn.init.uniform_(m.weight, -0.01, 0.01)
            if m.bias is not None:
                m.bias.data.fill_(0.01)

In [12]:
def get_optimizer(model, lr_embedding = None):
    """Return an Adam optimizer over the parameters of ``model``.

    With ``lr_embedding`` set, parameters whose name starts with 'embedding'
    form a second parameter group that uses that learning rate; every other
    parameter, and everything when ``lr_embedding`` is None, uses ``LR``.
    """
    if lr_embedding is None:
        return torch.optim.Adam(model.parameters(), lr=LR)

    embedding_group, default_group = [], []
    for name, param in model.named_parameters():
        target = embedding_group if name.startswith('embedding') else default_group
        target.append(param)

    param_groups = [
        {'params': default_group},
        {'params': embedding_group, 'lr': lr_embedding},
    ]
    return torch.optim.Adam(param_groups, lr=LR)

In [13]:
def get_loss():
    """Return the loss used for training and evaluation (cross-entropy)."""
    criterion = torch.nn.CrossEntropyLoss()
    return criterion

In [14]:
def training_step(model, optimizer, train_loader):
    """Train ``model`` for one epoch and return its averaged metrics.

    Loss, accuracy and F1 are computed per batch and then averaged over the
    number of batches (an unweighted mean of the batch values).
    """
    model.train()
    cost_function = get_loss()

    loss_sum, acc_sum, f1_sum = 0., 0., 0.

    for x, x_lengths, labels in train_loader:
        x = x.to(DEVICE)
        x_lengths = x_lengths.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(x, x_lengths)
        loss = cost_function(outputs, labels)
        loss.backward()
        optimizer.step()

        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        pred_labels = torch.argmax(probabilities, dim=1)

        y_true = labels.cpu().detach().numpy()
        y_pred = pred_labels.cpu().detach().numpy()

        loss_sum += loss.item()
        acc_sum += accuracy_score(y_true, y_pred)
        f1_sum += f1_score(y_true, y_pred)

    n_batches = len(train_loader)
    return {"train/train_acc": (acc_sum/n_batches),
            "train/train_loss": loss_sum/n_batches,
            "train/train_f1": f1_sum/n_batches}

In [15]:
def evaluating_step(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` without computing gradients.

    Loss, accuracy and F1 are computed per batch and then averaged over the
    number of batches (an unweighted mean of the batch values).
    """
    model.eval()
    cost_function = get_loss()

    loss_sum, acc_sum, f1_sum = 0., 0., 0.

    with torch.no_grad():
        for x, x_lengths, labels in test_loader:
            x = x.to(DEVICE)
            x_lengths = x_lengths.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = model(x, x_lengths)
            loss = cost_function(outputs, labels)

            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            pred_labels = torch.argmax(probabilities, dim=1)

            y_true = labels.cpu().detach().numpy()
            y_pred = pred_labels.cpu().detach().numpy()

            loss_sum += loss.item()
            acc_sum += accuracy_score(y_true, y_pred)
            f1_sum += f1_score(y_true, y_pred)

    n_batches = len(test_loader)
    return {"test/test_acc": (acc_sum/n_batches),
            "test/test_loss": loss_sum/n_batches,
            "test/test_f1": f1_sum/n_batches}

In [16]:
def training_loop(wandb_run, train_data, test_data, vocab_len, weights_mx_embed):
    """Train a BiGRU with early stopping and keep the best checkpoint.

    Args:
        wandb_run: wandb run whose ``config`` provides the hyper-parameters.
        train_data: DataLoader of the training split.
        test_data: DataLoader of the evaluation split.
        vocab_len: size of the vocabulary (rows of the embedding table).
        weights_mx_embed: pre-trained embedding matrix, or None.

    Returns:
        ``(best_acc, best_f1)``: the metrics of the epoch with the highest
        test accuracy. That epoch's weights are the ones saved to disk.
    """
    config = wandb_run.config
    print('CONFIGS\n', yaml.dump(config._items, default_flow_style=False)) # pretty print of configs used

    run_epochs = 0
    best_acc = 0.
    best_loss = 0.
    best_f1 = 0.
    # NOTE(review): the budget is never reset after an improvement, so it counts
    # ALL non-improving epochs, not consecutive ones - confirm this is intended.
    patience = PATIENCE

    model = BiGRU(OUT_SIZE, config.hidden_size, config.embedding_size, config.n_layers, vocab_len, config.dropout_prob, config.attention, weights_mx_embed, config.pretrained_embedding)
    model.apply(init_weights)
    optimizer = get_optimizer(model, lr_embedding=config.lr_embedding)
    weights_path = get_complete_saved_path(WEIGHTS_PATH, config)
    model.to(DEVICE)

    for e in range(config.epochs):
        print(f'-- Epoch [{e+1}/{config.epochs}] --')
        train_metrics = training_step(model, optimizer, train_data)
        test_metrics = evaluating_step(model, test_data)
        wandb.log({**train_metrics, **test_metrics})
        print(f'Train -> \tLoss:{train_metrics["train/train_loss"]:.5f} \tAccuracy: {train_metrics["train/train_acc"]:.2f} \tF1-Score: {train_metrics["train/train_f1"]:.2f}')
        print(f'Test -> \tLoss:{test_metrics["test/test_loss"]:.5f} \tAccuracy: {test_metrics["test/test_acc"]:.2f} \tF1-Score: {test_metrics["test/test_f1"]:.2f}')

        # Updated on every epoch: when training ran to completion without
        # early stopping this used to stay 0 and was logged as such.
        run_epochs = e+1

        if (best_acc < test_metrics["test/test_acc"]):
            torch.save(model.state_dict(), weights_path)
            best_acc = test_metrics["test/test_acc"]
            best_loss = test_metrics["test/test_loss"]
            best_f1 = test_metrics["test/test_f1"]
        else: patience -= 1

        if patience < 0: # Early stopping with patience
            print(f'Early stopping with PATIENCE = {PATIENCE}. Model trained for {run_epochs}/{config.epochs} epochs')
            break

    print('Model saved in location ', weights_path)
    wandb.summary["test_best_acc"] = best_acc
    wandb.summary["test_best_loss"] = best_loss
    wandb.summary["test_best_f1"] = best_f1
    wandb.summary["real_run_epochs"] = run_epochs
    wandb.finish()

    return best_acc, best_f1

## Train subjectivity classifier

We now train and evaluate a subjectivity detector.

In [None]:
# Run configuration logged to wandb. training_loop reads the model
# hyper-parameters, "epochs", "lr_embedding", "dataset" and "model" from it;
# "lr", "optimizer" and "batch_size" are only logged (the code uses the
# LR / BATCH_SIZE_* constants directly and always builds Adam).
config={
        "dataset": "subjectivity",
        "model": "GRU",
        "batch_size": BATCH_SIZE_SUBJ,
        "epochs": EPOCHS,
        "lr": LR,
        "lr_embedding": LR_EMBEDDING,
        "optimizer": OPTIMIZER,
        "hidden_size": HID_SIZE,
        "embedding_size": EMB_SIZE,
        "n_layers": N_LAYER,
        "dropout_prob": DROPOUT,
        "attention": True,
        "pretrained_embedding": True,
        "remove_stopwords": REMOVE_STOPWORDS
    }

NAME_RUN = "subj-GRU-"

# `train_info_subj` (the training Tokenizer) is reused later by the sentence
# filter and by the subjectivity heat map, so this cell must run before them.
train_subj, test_subj, train_info_subj = prepare_subjectivity_data(k_fold=K_FOLD)
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, name=NAME_RUN + str(K_FOLD), config=config, mode=MODE_WANDB)
# The checkpoint is written to WEIGHTS_PATH + 'subjectivity/gru.pt'.
training_loop(run, train_subj, test_subj, len(train_info_subj.word2id), train_info_subj.pretrained_word2id) 

## Train no-filter sents polarity classifier

We now train and evaluate a polarity classifier on the full movie reviews, i.e. **without** filtering any sentence with the subjectivity detector.

In [None]:
# Run configuration logged to wandb. training_loop reads the model
# hyper-parameters, "epochs", "lr_embedding", "dataset" and "model" from it;
# "version", "lr", "optimizer" and "batch_size" are only logged.
config={
        "dataset": "polarity",
        "version": "no-filter",
        "model": "GRU-no-filter",
        "batch_size": BATCH_SIZE_POL,
        "epochs": EPOCHS,
        "lr": LR,
        "lr_embedding": LR_EMBEDDING,
        "optimizer": OPTIMIZER,
        "hidden_size": HID_SIZE,
        "embedding_size": EMB_SIZE,
        "n_layers": N_LAYER,
        "dropout_prob": DROPOUT,
        "attention": True,
        "pretrained_embedding": True,
        "remove_stopwords": REMOVE_STOPWORDS
    }

NAME_RUN = "pol-no-filter-GRU-pretrained-"

# `train_info_pols` holds the vocabulary of the unfiltered polarity model.
train_pols, test_pols, train_info_pols = prepare_polarity_data(k_fold=K_FOLD)
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, name=NAME_RUN + str(K_FOLD), config=config , mode=MODE_WANDB)
# The checkpoint is written to WEIGHTS_PATH + 'polarity/gru-no-filter.pt'.
training_loop(run, train_pols, test_pols, len(train_info_pols.word2id), train_info_pols.pretrained_word2id)

## Filter sentences using subjectivity classifier

We now use the subjectivity detector to filter the movie reviews dataset: only the sentences predicted as subjective are kept (the objective ones are removed), hopefully yielding better quality data to process.

In [142]:
import pandas

def remove_subj_sents(data, label, classifier):
    """Keep, for every document, only the sentences predicted as subjective.

    Despite its name, this function RETAINS the sentences that ``classifier``
    assigns to class 0 (subjective) and drops the others. Documents left with
    no sentence are omitted from the result.

    Args:
        data: iterable of documents, each a list of tokenised sentences.
        label: placeholder label stored in the temporary dataset (unused by
            the classifier).
        classifier: trained subjectivity model.

    Returns:
        The list of filtered documents.

    Side effects:
        Writes per-document sentence counts to a randomly named CSV file in
        the current working directory.
    """
    filtered = []

    len_doc = []
    len_subj = []
    len_sent_removed = []

    for doc in tqdm(data):
        # collate_fn re-orders every batch by decreasing length, so the
        # predictions do NOT come back in document order. Feeding the
        # sentences already sorted (stable, longest first) makes that
        # re-ordering a no-op; `order` then maps each prediction back to the
        # position of its sentence in the document.
        order = sorted(range(len(doc)), key=lambda i: len(doc[i]), reverse=True)
        sorted_doc = [doc[i] for i in order]

        pol_tokens = Tokenizer(sorted_doc, train_info_subj.word2id, name_dataset='subjectivity') # treated as a subjectivity dataset: each sample is a single sentence
        polarity_data = SA_Dataset(pol_tokens.corpus2id, numpy.array([label]*len(sorted_doc)))
        polarity_dl = torch.utils.data.DataLoader(polarity_data, BATCH_SIZE_SUBJ, collate_fn=collate_fn, shuffle=False, generator=g)

        classify_labels = filter_step(classifier, polarity_dl)

        keep = [False] * len(doc)
        for position, estimate in zip(order, classify_labels):
            keep[position] = int(estimate) == 0 # 0 = subj
        sents = [sent for sent, kept in zip(doc, keep) if kept]

        if len(sents) > 0:
            filtered.append(sents)

        len_doc.append(len(doc))
        len_subj.append(len(sents))
        len_sent_removed.append(len(doc)-len(sents))

    df = pandas.DataFrame({'doc':len_doc, 'subj-sents':len_subj, 'sents-removed':len_sent_removed})
    df.to_csv(f'{random.randint(0,1000)}.csv', index=False)

    return filtered

def filter_step(model, filtered_loader):
    """Return the predicted class of every sample yielded by ``filtered_loader``.

    The predictions are collected batch after batch, in the order in which the
    loader (and its collate function) delivers the samples. The labels
    contained in the batches are ignored.
    """
    model.eval()

    collected = []
    with torch.no_grad():
        for x, x_lengths, _ in filtered_loader:
            outputs = model(x.to(DEVICE), x_lengths.to(DEVICE))
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            collected.extend(torch.argmax(probabilities, dim=1))

    return collected

def filter_sentences():
    """Filter both polarity classes with the saved subjectivity classifier.

    Returns:
        ``(neg_sents, pos_sents)``: the negative and positive reviews reduced
        to the sentences kept by ``remove_subj_sents``.
    """
    classifier = BiGRU(OUT_SIZE, HID_SIZE, EMB_SIZE, N_LAYER, len(train_info_subj.word2id), DROPOUT)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    state_dict = torch.load(WEIGHTS_PATH + 'subjectivity/' + 'gru.pt', map_location=DEVICE)
    classifier.load_state_dict(state_dict)
    classifier.to(DEVICE)

    neg = movie_reviews.paras(categories='neg')
    pos = movie_reviews.paras(categories='pos')

    neg_sents = remove_subj_sents(neg, 0, classifier)
    pos_sents = remove_subj_sents(pos, 1, classifier)

    return neg_sents, pos_sents

In [43]:
def prepare_polarity_filtered_data(filter_dict, fold, k_fold):
    """Build the train/test DataLoaders for the filtered movie reviews.

    ``filter_dict`` provides the documents under 'corpus' and their labels
    under 'labels'; ``fold`` is the list of index pairs of the stratified split
    and ``k_fold`` selects the pair to use. Returns the two loaders and the
    training ``Tokenizer``.
    """
    corpus, labels = filter_dict['corpus'], filter_dict['labels']

    if REMOVE_STOPWORDS:
        corpus = [
            [[word for word in sent if word not in stop_words] for sent in doc]
            for doc in corpus
        ]

    X_train, X_test, y_train, y_test = pick_k_fold(corpus, labels, fold, k_fold)

    # The vocabulary is built on the training split and reused for the test split.
    X_train = Tokenizer(X_train, build_pretrained=True, name_dataset='polarity')
    X_test = Tokenizer(X_test, X_train.word2id, name_dataset= 'polarity')

    train_dataloader = torch.utils.data.DataLoader(
        SA_Dataset(X_train.corpus2id, y_train),
        BATCH_SIZE_POL, collate_fn=collate_fn, shuffle=True, generator=g)
    test_dataloader = torch.utils.data.DataLoader(
        SA_Dataset(X_test.corpus2id, y_test),
        BATCH_SIZE_POL, collate_fn=collate_fn, shuffle=False, generator=g)

    print(f'Filtered polarity [K_FOLD = {k_fold}] data loaded. Remove stop word: {REMOVE_STOPWORDS}')
    return train_dataloader, test_dataloader, X_train

We extract the sentences predicted as subjective from the ```movie_reviews``` dataset (discarding the objective ones) and save them in a ```.pkl``` file for easy access.

In [44]:
import pickle

# The filtered corpus is cached on disk because filtering runs the
# subjectivity classifier over every sentence of every review.
filtered_saved_path = WEIGHTS_PATH + 'filtered_polarity_sents.pkl'
dict_pols_filtered = {}

if not os.path.exists(filtered_saved_path):
    print('Creating .pkl with filtered sentences')

    neg_filtered, pos_filtered = filter_sentences()
    # Labels are rebuilt from the filtered lists because documents left
    # without any sentence are dropped by the filter.
    dict_pols_filtered = {'corpus': neg_filtered + pos_filtered, 'labels': numpy.array([0] * len(neg_filtered) + [1] * len(pos_filtered))}

    with open(filtered_saved_path, 'wb') as f:
        pickle.dump(dict_pols_filtered, f)
        print('Saved at location ', filtered_saved_path)

else:
    print('Using .pkl with filtered sentences from ', filtered_saved_path)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a cache that this notebook produced itself. A stale cache is also
    # reused silently if the subjectivity model is retrained.
    with open(filtered_saved_path, 'rb') as f:
        dict_pols_filtered = pickle.load(f)

## Train filter sents polarity classifier

We now train and evaluate a polarity classifier on the filtered movie reviews, i.e. **with** the objective sentences removed and only the subjective ones kept.

In [None]:
# Run configuration logged to wandb. training_loop reads the model
# hyper-parameters, "epochs", "lr_embedding", "dataset" and "model" from it;
# "version", "lr", "optimizer" and "batch_size" are only logged.
config={
        "dataset": "polarity",
        "version": "filter",
        "model": "GRU-filter",
        "batch_size": BATCH_SIZE_POL,
        "epochs": EPOCHS,
        "lr": LR,
        "lr_embedding": LR_EMBEDDING,
        "optimizer": OPTIMIZER,
        "hidden_size": HID_SIZE,
        "embedding_size": EMB_SIZE,
        "n_layers": N_LAYER,
        "dropout_prob": DROPOUT,
        "attention": True,
        "pretrained_embedding": True,
        "remove_stopwords": REMOVE_STOPWORDS
    }

NAME_RUN = "pol-filter-GRU-pretrained-"

# The folds are recomputed on the filtered corpus, which can contain fewer
# documents than the original one.
fold_filtered_pols = make_stratified_k_fold(dict_pols_filtered['corpus'], dict_pols_filtered['labels'])

# `train_info_filtered` holds the vocabulary of the filtered polarity model.
train_filtered, test_filtered, train_info_filtered = prepare_polarity_filtered_data(dict_pols_filtered, fold_filtered_pols, K_FOLD)
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, name=NAME_RUN + str(K_FOLD), config=config, mode=MODE_WANDB)
# The checkpoint is written to WEIGHTS_PATH + 'polarity/gru-filter.pt'.
training_loop(run, train_filtered, test_filtered, len(train_info_filtered.word2id), train_info_filtered.pretrained_word2id)

## Text attention heat map generator & visualization

Create and visualize the text attention heat map using two different methods:
- generate a heat map using ```seaborn``` library;
- generate LaTeX code to visualize the attention-weighted text (source code adapted from the following GitHub repository: https://github.com/jiesutd/Text-Attention-Heatmap-Visualization)

In [20]:
# Git command: 
# !git clone https://github.com/jiesutd/Text-Attention-Heatmap-Visualization.git

# @Author: Jie Yang
# @Date:   2019-03-29 16:10:23

## convert the text/attention list to latex code, which will further generates the text heatmap based on attention weights.
import numpy as np

# NOTE(review): not referenced anywhere in the visible code; apparently a leftover of the upstream script.
latex_special_token = ["!@#$%^&*()"]

def generate(text_list, attention_list, latex_file, color='red', rescale_value = False):
	"""Write a standalone LaTeX document that highlights words by attention.

	Every word of ``text_list`` is placed in a ``\\colorbox`` whose intensity is
	the matching entry of ``attention_list`` (a percentage of ``color``). With
	``rescale_value`` the weights are first passed through ``rescale``. The
	document is written to ``latex_file``, replacing any existing file.
	"""
	weights = rescale(attention_list) if rescale_value else attention_list
	n_words = len(text_list)
	words = clean_word(text_list)

	preamble = r'''\documentclass[varwidth]{standalone}
				\special{papersize=210mm,297mm}
				\usepackage{color}
				\usepackage{tcolorbox}
				\usepackage{CJK}
				\usepackage{adjustbox}
				\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
				\begin{document}
				\begin{CJK*}{UTF8}{gbsn}'''+'\n'
	closing = r'''\end{CJK*}
				\end{document}'''

	with open(latex_file,'w') as f:
		f.write(preamble)
		pieces = [r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{'''+"\n"]
		for idx in range(n_words):
			pieces.append("\\colorbox{%s!%s}{"%(color, weights[idx])+"\\strut " + words[idx]+"} ")
		pieces.append("\n}}}")
		f.write("".join(pieces)+'\n')
		f.write(closing)

def rescale(input_list):
	"""Linearly rescale the values to the range [0, 100] and return a list.

	An empty input gives an empty list. When all values are equal there is no
	range to stretch, so zeros are returned instead of dividing by zero (which
	would fill the result with NaN).
	"""
	values = np.asarray(input_list)
	if values.size == 0:
		return []
	lowest = np.min(values)
	span = np.max(values) - lowest
	if span == 0:
		return np.zeros(values.shape).tolist()
	return ((values - lowest)/span*100).tolist()

def clean_word(word_list):
	"""Return the words with LaTeX special characters escaped.

	All ten LaTeX specials are handled, including ``$`` and ``~``. The backslash,
	caret and tilde use their text commands, because ``\\\\`` is a line break and
	``\\^`` / ``\\~`` are accent commands rather than literal characters.
	"""
	# A translation table escapes in a single pass, so the characters introduced
	# by one replacement are never escaped a second time.
	escapes = str.maketrans({
		"\\": r"\textbackslash{}",
		"^": r"\textasciicircum{}",
		"~": r"\textasciitilde{}",
		"%": r"\%",
		"&": r"\&",
		"#": r"\#",
		"_": r"\_",
		"$": r"\$",
		"{": r"\{",
		"}": r"\}",
	})
	return [word.translate(escapes) for word in word_list]

In [137]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

def subj_generate_heat_map(sent, normalize = True, tex = True, png = True):
    """Visualise the attention of the subjectivity model over one sentence.

    Args:
        sent: the sentence, as a list of words.
        normalize: use min-max normalised scores for the .tex output and the
            score range as colour scale of the plot.
        tex: write a LaTeX heat map under ``HEATMAP_PATH``/subjectivity.
        png: show a seaborn heat map.
    """
    print('SUBJECTIVITY HEATMAP')
    classifier = BiGRU(OUT_SIZE, HID_SIZE, EMB_SIZE, N_LAYER, len(train_info_subj.word2id), DROPOUT)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    classifier.load_state_dict(torch.load(WEIGHTS_PATH + 'subjectivity/' + 'gru.pt', map_location=DEVICE))
    classifier.to(DEVICE)
    classifier.eval()

    ds = Tokenizer([sent], train_info_subj.word2id, build_pretrained=False, name_dataset='subjectivity')
    cd = SA_Dataset(ds.corpus2id, [0]) # dummy label, ignored
    dl = torch.utils.data.DataLoader(cd, collate_fn=collate_fn)

    x, x_length, _ = next(iter(dl))
    x = x.to(DEVICE)

    with torch.no_grad():
        att = classifier._attention_heatmap(x, x_length)
    # Raw attention scores of the single sentence: (1, seq, 1) -> (seq,)
    att = att.detach().cpu().numpy()
    att = att.squeeze(2).squeeze(0)

    assert len(sent) == len(att)
    subj_hp_path = os.path.join(HEATMAP_PATH, 'subjectivity')
    os.makedirs(subj_hp_path, exist_ok=True)

    print('Input sent:')
    print(" ".join([w for w in sent]))

    min_val = np.min(att)
    max_val = np.max(att)
    span = max_val - min_val
    # A one-word sentence (or constant scores) has no range to normalise over.
    normalized_vector = (att - min_val) / span if span > 0 else np.zeros_like(att)

    if tex:
        if normalize:
            generate(sent, normalized_vector*100, subj_hp_path + '/att_visualization_normalize.tex')
        else:
            generate(sent, att*100, subj_hp_path + '/att_visualization.tex')
        print('\nSaved .tex file located in ', subj_hp_path)
    if png:
        df = pd.DataFrame({'attention weights': att}, index = sent)
        if normalize:
            # Raw scores with the colour scale stretched over their own range.
            sns.heatmap(df, annot=False, vmin=min(att), vmax=max(att), square = True)
        else:
            sns.heatmap(df, annot=False, vmin=0, vmax=1, square = True)
        plt.show()
            
def pol_generate_heat_map(sent, filter = False, normalize = True, tex = True, png = True):
    """Visualise the attention of a polarity model over one document.

    Args:
        sent: the document, as a list of sentences (lists of words).
        filter: use the model trained on the filtered reviews instead of the
            one trained on the unfiltered reviews.
        normalize: use the min-max normalised scores instead of the raw ones.
        tex: write a LaTeX heat map under ``HEATMAP_PATH``/polarity.
        png: show a seaborn heat map.
    """
    print('POLARITY HEATMAP')
    # Each checkpoint must be paired with the vocabulary it was trained with:
    # the two models have different vocabularies, so the unfiltered checkpoint
    # cannot be loaded into a model sized for the filtered one.
    if filter:
        gru_type = 'gru-filter.pt'
        train_info = train_info_filtered
    else:
        gru_type = 'gru-no-filter.pt'
        train_info = train_info_pols

    classifier = BiGRU(OUT_SIZE, HID_SIZE, EMB_SIZE, N_LAYER, len(train_info.word2id), DROPOUT)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    classifier.load_state_dict(torch.load(WEIGHTS_PATH + 'polarity/' + gru_type, map_location=DEVICE))
    classifier.to(DEVICE)
    classifier.eval()

    ds = Tokenizer([sent], train_info.word2id, build_pretrained=False, name_dataset='polarity')
    cd = SA_Dataset(ds.corpus2id, [0]) # dummy label, ignored
    dl = torch.utils.data.DataLoader(cd, collate_fn=collate_fn)

    x, x_length, _ = next(iter(dl))
    x = x.to(DEVICE)

    with torch.no_grad():
        att = classifier._attention_heatmap(x, x_length)
    # Raw attention scores of the single document: (1, seq, 1) -> (seq,)
    att = att.detach().cpu().numpy()
    att = att.squeeze(2).squeeze(0)

    # The tokenizer flattens the document, so the words are flattened as well.
    sent = [w for d in sent for w in d]

    assert len(sent) == len(att)
    pols_hp_path = HEATMAP_PATH + 'polarity'
    os.makedirs(pols_hp_path, exist_ok=True)

    print('Input sent:')
    print(" ".join([w for w in sent]))

    # calculate quartiles
    q1, q3 = np.percentile(att, [25, 75])
    # calculate IQR
    iqr = q3 - q1
    # calculate lower and upper bounds
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    # clip outliers to the bounds so that they do not dominate the colour scale
    att[(att < lower_bound)] = lower_bound
    att[(att > upper_bound)] = upper_bound
    # normalize vector
    min_val = np.min(att)
    max_val = np.max(att)
    span = max_val - min_val
    # Constant scores have no range to normalise over.
    normalized_vector = (att - min_val) / span if span > 0 else np.zeros_like(att)

    if tex:
        if normalize:
            generate(sent, normalized_vector*100, pols_hp_path + '/att_visualization_normalize.tex')
        else:
            generate(sent, att*100, pols_hp_path + '/att_visualization.tex')
        print('\nSaved .tex file located in ', pols_hp_path)
    if png:
        if normalize:
            df = pd.DataFrame({'attention weights': normalized_vector}, index = sent)
            # The normalised values live in [0, 1]; the colour scale must use
            # that range, not the range of the raw scores.
            sns.heatmap(df, annot=False, vmin=0, vmax=1, square = True)
        else:
            df = pd.DataFrame({'attention weights': att}, index = sent)
            sns.heatmap(df, annot=False, vmin=0, vmax=1, square = True)
        plt.show()

In the project report I used as example:
- in Figure 1 the subjective sentence from ``subj[32]``;
- in Figure 2 some non filtered sentences extracted from ``neg[1]``, where we have objective and subjective sentences. 

Choose a sentence from the datasets and run it!

In [None]:
# Reload the raw corpora so that individual examples can be picked by index.
obj = subjectivity.sents(categories='obj')
subj = subjectivity.sents(categories='subj')

neg = movie_reviews.paras(categories='neg')
pos = movie_reviews.paras(categories='pos')

# Both calls load checkpoints from WEIGHTS_PATH and rely on the tokenizers
# created by the training cells above, so those cells must have been run.
subj_generate_heat_map(subj[32], normalize=True, tex=False)
pol_generate_heat_map(neg[1], filter=True, normalize=True, tex=True) 