In [7]:
# Dataset: https://www.kaggle.com/datasets/dineshpiyasamara/sentiment-analysis-dataset?resource=download
# Mount Google Drive at /content/drive; the google.colab package is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import pickle
import re
from collections import defaultdict

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertConfig
from transformers import BertTokenizer

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint; the DataLoader collate function uses it
# to build the BERT inputs for every batch.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Vocabulary: looking up a key that is not present yet assigns it the next free integer id
# (the current size of the dict).
word2id = defaultdict(lambda: len(word2id))
# Registered first, in this order, so that '<pad>' gets id 0 and '<unk>' gets id 1.
PAD = word2id['<pad>']
UNK = word2id['<unk>']

In [None]:
def return_unk():
    """Return the id of the '<unk>' token.

    Installed as ``word2id.default_factory`` once the vocabulary has been built, so that
    later lookups of unseen words yield ``UNK`` instead of a new id.
    """
    # presumably a named function (not a lambda) so the vocabulary stays picklable — TODO confirm
    return UNK

In [None]:
def to_pickle(obj, path):
    """Serialize ``obj`` with pickle into the file at ``path``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(obj)
def load_pickle(path):
    """Read the file at ``path`` in binary mode and return the object pickled in it."""
    # Unpickling can execute arbitrary code; only point this at files you created yourself.
    with open(path, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [None]:
class MOSEI:
    """Turn raw sentences and labels into vocabulary-indexed training samples.

    Every sample has the layout ``((word_ids, tokens), label, segment)``:
    ``word_ids`` is a 1-D NumPy array of ids taken from the module-level ``word2id``
    vocabulary, ``tokens`` is the list of raw tokens of that one sentence, ``label``
    is the matching entry of ``y`` (stored unchanged) and ``segment`` is a fixed
    placeholder string.

    Construction has two side effects: the module-level ``word2id`` is frozen
    afterwards (unseen words then map to ``UNK``) and the sample list is pickled
    to ``save_path``.
    """

    def __init__(self, x, y, save_path='/content/drive/MyDrive/dataset/train.pkl'):
        """Build the sample list.

        Args:
            x: Sequence of sentences (strings). Tokens are obtained by splitting on
                single spaces.
            y: Sequence of labels aligned with ``x``.
            save_path: File the sample list is pickled to. ``None`` skips saving.

        Raises:
            ValueError: If ``x`` and ``y`` have different lengths.
        """
        if len(x) != len(y):
            raise ValueError(
                'x and y must have the same length, got %d and %d' % (len(x), len(y)))

        # place holder for the final train dataset
        self.train = train = []

        self.word2id = word2id

        for sentence, label in zip(x, y):
            # A fresh token list per sentence. Sharing one list across samples would
            # hand every sample the tokens of the whole corpus.
            actual_words = sentence.split(' ')
            words = np.asarray([word2id[word] for word in actual_words])
            segment = 'zuabcedi[8]'
            train.append(((words, actual_words), label, segment))

        # Freeze the vocabulary: from now on unknown words resolve to UNK.
        word2id.default_factory = return_unk

        # Save pickles
        if save_path is not None:
            to_pickle(train, save_path)

    def get_data(self):
        """Return ``(samples, word2id)`` as built by the constructor."""
        return self.train, self.word2id


In [None]:
class MSADataset(Dataset):
    """Torch ``Dataset`` over the sentiment-analysis CSV.

    Each usable row must carry at least three comma-separated fields, read as
    id, label and sentence text. The rows are converted into samples by ``MOSEI``.
    """

    def __init__(self, data_path='/content/drive/MyDrive/dataset/sentiment_analysis.csv',
                 skip_header=True):
        """Read the CSV once and build all samples in a single ``MOSEI`` pass.

        Args:
            data_path: Location of the CSV file.
            skip_header: Drop the first row. NOTE(review): assumes the file starts
                with a column-name row, as Kaggle CSV exports usually do — pass
                ``False`` if it does not.
        """
        x = []
        y = []
        # newline='' is what the csv module requires for correct handling of quoted fields.
        with open(data_path, newline='', encoding='utf-8') as csv_file:
            rows = csv.reader(csv_file)
            if skip_header:
                next(rows, None)
            for row in rows:
                if len(row) < 3:
                    # blank or malformed row: no label/sentence pair to use
                    continue
                label = row[1].strip()
                # An unquoted sentence containing commas is split into several fields;
                # join them back so the text is not truncated at its first comma.
                sentence = ','.join(row[2:]).strip()
                x.append(sentence)
                y.append(label)

        # Build once, after every row has been read. MOSEI freezes the shared vocabulary
        # and writes a pickle on each construction, so building per row would map almost
        # every word to <unk> and redo the whole conversion for each line.
        dataset = MOSEI(x, y)

        self.data, self.word2id = dataset.get_data()
        self.len = len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [None]:
def get_loader(shuffle=True, batch_size=1):
    """Build a ``DataLoader`` over ``MSADataset``.

    Args:
        shuffle: Whether the loader reshuffles the samples.
        batch_size: Number of samples per batch (defaults to 1).

    Returns:
        A ``DataLoader`` whose batches are the tuple ``(sentences, labels, lengths,
        bert_sentences, bert_sentence_types, bert_sentence_att_mask)``.
        ``sentences`` is padded with ``PAD`` and is time-major (``pad_sequence``
        default); the three BERT tensors are batch-first.
    """

    dataset = MSADataset()

    def label_to_tensor(label):
        """Convert one stored label into a tensor that ``torch.cat`` accepts."""
        if isinstance(label, np.ndarray):
            # array labels keep their dtype and shape
            return torch.from_numpy(label)
        # Labels read from the CSV are strings; torch.from_numpy rejects those.
        # NOTE(review): float32 is an assumption — cast to long downstream if the
        # labels are class indices.
        return torch.tensor([float(label)], dtype=torch.float32)

    def collate_fn(batch):
        '''
        Collate functions assume batch = [Dataset[i] for i in index_set]
        '''
        # for later use we sort the batch in descending order of length
        batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)

        # get the data out of the batch - use pad sequence util functions from PyTorch to pad things
        labels = torch.cat([label_to_tensor(sample[1]) for sample in batch], dim=0)
        sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)

        ## BERT-based features input prep

        SENT_LEN = sentences.size(0)

        # Create bert indices using tokenizer. Padding and truncation are requested
        # explicitly (pad_to_max_length is deprecated) so every row has exactly
        # SENT_LEN + 2 entries: the longest word count in the batch plus [CLS] and [SEP].
        # NOTE(review): a sentence whose sub-word count exceeds that limit is truncated —
        # confirm this is intended.
        texts = [" ".join(sample[0][1]) for sample in batch]
        encoded = bert_tokenizer(
            texts,
            max_length=SENT_LEN + 2,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt')

        # Bert things are batch_first
        bert_sentences = encoded["input_ids"]
        bert_sentence_types = encoded["token_type_ids"]
        bert_sentence_att_mask = encoded["attention_mask"]

        # lengths are useful later in using RNNs
        lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])

        return sentences, labels, lengths, bert_sentences, bert_sentence_types, bert_sentence_att_mask

    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn)

    return data_loader

In [None]:
# Build the shuffled training DataLoader; get_loader constructs MSADataset, so the CSV is
# read and converted here, not lazily during iteration.
train_data_loader = get_loader(shuffle=True)

In [None]:
# Configuration of the 'bert-base-uncased' checkpoint, with the hidden states of all layers
# requested in the model output.
bertconfig = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [None]:
# BERT encoder initialised from the pretrained 'bert-base-uncased' weights using `bertconfig`.
bertmodel = BertModel.from_pretrained('bert-base-uncased', config=bertconfig)

In [None]:
# For every batch: run BERT and average the token embeddings over the non-padding
# positions, giving one vector per sentence.
for sent, label, lengths, bert_sent, bert_sent_type, bert_sent_mask in train_data_loader:

  # Only features are read out here, so the forward pass does not need an autograd graph.
  # Remove the no_grad() context if BERT is meant to be fine-tuned through these outputs.
  with torch.no_grad():
    bert_output = bertmodel(input_ids=bert_sent,
                            attention_mask=bert_sent_mask,
                            token_type_ids=bert_sent_type)
  # element 0 of the model output is the last layer's hidden state for every token
  bert_output = bert_output[0]

  # masked mean: zero out padding positions, then divide by the number of real tokens
  masked_output = torch.mul(bert_sent_mask.unsqueeze(2), bert_output)
  mask_len = torch.sum(bert_sent_mask, dim=1, keepdim=True)
  bert_output = torch.sum(masked_output, dim=1, keepdim=False) / mask_len

  utterance_text = bert_output

  print(utterance_text.shape )