In [1]:
import numpy as np
import random
import torch
import os
import spacy
from torchtext.vocab import GloVe, FastText
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import csv

pos_set = []
neg_set = []
with open("./Train dataset.csv", encoding='utf-8') as csvf:
    data = csv.DictReader(csvf)

    for rows in data:

        # Removing punctuations
        review = rows['review'].replace('<br />', " ", -1)

        if rows['sentiment'] == 'positive':
            pos_set.append(review)
        else:
            neg_set.append(review)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed = 42):
    '''
        For Reproducibility: Sets the seed of the entire notebook.
    '''

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
    # Sets a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(1)

In [3]:
print(len(pos_set), len(neg_set))

19885 20115


In [4]:
from torchtext.data import get_tokenizer

# Downloads GloVe and FastText
global_vectors = GloVe(name='840B', dim=300)

# ----------- Text Preprocessing -----------
nlp = spacy.load("en_core_web_md")

data_set = []
vocab = []

chars_to_remove = ['--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '.', ',', '<br />']

tokenizer = get_tokenizer("basic_english")

for line in pos_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 1))
    # Adds the extracted words to a list
    vocab.extend(tokens)


print("--- Positive Finished ---")

for line in neg_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 0))
    # Adds the extracted words to a list
    vocab.extend(tokens)


--- Positive Finished ---


In [5]:
#len(set(vocab))

# Stores all the unique words in the dataset and their frequencies
vocabulary = {}

# Calculates the frequency of each unique word in the vocabulary
for word in vocab:
    if word in vocabulary:
        vocabulary[word] += 1
    else:
        vocabulary[word] = 1

print("Number of unique words in the vocabulary: ", len(vocabulary))

Number of unique words in the vocabulary:  130959


In [6]:
# Stores the integer token for each unique word in the vocabulary
ids_vocab = {}

id = 0

# Assigns words in the vocabulary to integer tokens
for word, v in vocabulary.items():
    ids_vocab[word] = id
    id += 1

In [11]:
# Tokenization function
def tokenize(corpus, ids_vocab):
    """
        Converts words in the dataset to integer tokens
    """

    tokenized_corpus = []
    for line, sentiment in corpus:
        new_line = []
        for i, word in enumerate(line):
            if word in ids_vocab and (i == 0 or word != line[i-1]):
                new_line.append(ids_vocab[word])

        new_line = torch.Tensor(new_line).long()
        tokenized_corpus.append((new_line, sentiment))

    return tokenized_corpus

In [12]:
token_corpus = tokenize(data_set, ids_vocab)

In [18]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

# Creating the vocabulary from the tokens of words
counter = Counter(vocabulary)

"""
counter_filtered = {}

for k, v in counter.items():
    if v > 3:
        counter_filtered[k] = v
"""
        
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Adding <unk> token and default index
unk_token = '<unk>'

# Making default index same as index of unk_token
default_index = 0
v2 = Vocab(ordered_dict, specials=[unk_token])
v2.set_default_index(default_index)

print("Number of words in Vocabulary: ", v2.__len__())

ImportError: cannot import name 'vocab' from 'torchtext.vocab' (/raid/home/kawinm/miniconda3/lib/python3.9/site-packages/torchtext/vocab.py)

In [13]:
emb_dim = 300

embeds = torch.zeros(len(ids_vocab) + 1, emb_dim)

for token, idx in ids_vocab.items():
    embeds[idx] = global_vectors[token]

In [17]:
# Train-Valid split of 95-05
def split_indices(n, val_pct):

    # Determine size of Validation set
    n_val = int(val_pct * n)

    # Create random permutation of 0 to n-1
    idxs = np.random.permutation(n)
    return idxs[n_val:], idxs[:n_val]

train_pos_indices, val_pos_indices = split_indices(len(pos_set), 0.2)
train_neg_indices, val_neg_indices = split_indices(len(neg_set), 0.2)

train_indices = np.concatenate((train_pos_indices, train_neg_indices+len(pos_set)-1))
val_indices = np.concatenate((val_pos_indices, val_neg_indices+len(pos_set)-1))

from torch.nn.utils.rnn import pad_sequence

# ----------- Batching the data -----------
def collate_fn(instn):

    sentence = [x[0] for x in instn]
    sentence = pad_sequence(sentence, batch_first=True, padding_value=0)

    labels = torch.Tensor([x[1] for x in instn])

    return (sentence, labels)


batch_size = 128

train_sampler   = SubsetRandomSampler(train_indices)
train_loader    = DataLoader(token_corpus, batch_size, sampler=train_sampler, collate_fn=collate_fn)

val_sampler     = SubsetRandomSampler(val_indices)
val_loader      = DataLoader(token_corpus, batch_size, sampler=val_sampler, collate_fn=collate_fn)

for i in train_loader:
    print(i)
    break


(tensor([[20677,  1553,    25,  ...,     0,     0,     0],
        [  119,   974,    51,  ...,     0,     0,     0],
        [  119,   446,   321,  ...,     0,     0,     0],
        ...,
        [  787,    36,    51,  ...,     0,     0,     0],
        [  119,  3966,   143,  ...,     0,     0,     0],
        [    2,    32,   278,  ...,     0,     0,     0]]), tensor([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0.,
        1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
        1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
        1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
        0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1.,
        1., 0.]))


In [21]:
# ----------- Model -----------
class BILSTM(nn.Module):
    
    def __init__(self, embeds):
        super().__init__()

        self.embeddings = nn.Embedding.from_pretrained(embeds, padding_idx=0)

        self.lstm = nn.LSTM(input_size = 300, hidden_size = 200, num_layers =1, batch_first = True, bidirectional = True)

        self.dropout = nn.Dropout(0.25)
        self.lin1 = nn.Linear(400, 100)
        self.lin2 = nn.Linear(100, 1)

    def forward(self, xb, tsne = False):

        x = self.embeddings(xb)
        x, y = self.lstm(x)
        x = torch.cat((y[0][0, :, :], y[0][1, :, :]), dim = 1)
        x = x.squeeze(dim=0)
        x = self.lin1(x)

        if tsne == True:
            return x 

        x = F.relu(x)
        x = self.dropout(x)
        x = self.lin2(x)
        x = torch.sigmoid(x)
        return x

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


model = BILSTM(embeds)
model.to(device)
opt_c = torch.optim.Adagrad(model.parameters(), lr = 0.001)
# loss_fn_c = F.cross_entropy - Tried Cross Entropy with log_softmax output function - gave similar results
loss_fn_c = F.binary_cross_entropy


In [22]:
# ----------- Main Training Loop -----------
max_epoch = 25

best_test_acc = 0
for ep in range(max_epoch):

    epoch_loss = 0

    model.train()

    for xb, yb in tqdm(train_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn_c(y_hat.squeeze(), yb)

        loss.backward()

        opt_c.step()

        opt_c.zero_grad()

        epoch_loss += loss

    print("Epoch: ", ep+1, " Training Loss: ", epoch_loss.item())


    #----------- Validation -----------

    val_labels = []
    val_pred = []

    model.eval()

    val_epoch_loss = 0

    for xb, yb in tqdm(val_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn_c(y_hat.squeeze(), yb)

        val_epoch_loss += loss

        val_labels.extend(torch.round(yb).cpu().detach().numpy())
        val_pred.extend(y_hat.argmax(dim=1).cpu().detach().numpy())


    print("Validation loss: ", val_epoch_loss.item())
    print("Validation accuracy: ", accuracy_score(val_labels, val_pred)*100)


    if ep > 15 and prev_val_loss - val_epoch_loss.item() > 0.05:
        print("Saving Model")
        torch.save(model.state_dict(), "best_model.pt")
    
    prev_val_loss = val_epoch_loss.item()
    

100%|██████████| 250/250 [00:24<00:00, 10.18it/s]


Epoch:  1  Training Loss:  145.12107849121094


 76%|███████▌  | 48/63 [00:02<00:00, 20.54it/s]


RuntimeError: CUDA out of memory. Tried to allocate 1.29 GiB (GPU 0; 39.59 GiB total capacity; 19.48 GiB already allocated; 170.62 MiB free; 21.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
#https://pytorch.org/docs/stable/notes/faq.html#my-model-reports-cuda-runtime-error-2-out-of-memory