In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
data_path = r'/content/drive/MyDrive/dataset/sentiment_analysis.csv'

In [6]:
from collections import defaultdict
import numpy as np

In [7]:
word2id = defaultdict(lambda: len(word2id))
PAD=word2id['']

In [8]:
sentences=[]
labels=[]
for line in open(data_path):
  line=line.strip()
  splits = line.split(',')
  id=splits[0]
  if id=='id':
    continue
  label=np.asarray([float(splits[1])])
  sent=splits[2]
  labels.append(label)
  sentences.append(sent)

data=[]

for i in range(len(sentences)):
  words=[]
  actual_words=[]
  for word in sentences[i].split(' '):

    actual_words.append(word)
    words.append(word2id[word])
  words=np.asarray(words)
  label=labels[i]
  data.append(((words,actual_words),label))


In [9]:
train=data[:4000]
val=data[4000:6000]
test=data[6000:]
print(len(train),len(val),len(test))

4000 2000 1920


In [10]:
from torch.utils.data import Dataset

In [11]:
def get_data(mode):
  if mode=='train':
    return train
  elif mode=='val':
    return val
  elif mode=='test':
    return test
  else:
    print('Mode is not set properly')
    assert(False)

In [12]:
class SADataset(Dataset):
  def __init__(self, mode):
    self.data=get_data(mode)
    self.len= len(self.data)
  def __getitem__(self, index):
    return self.data[index]
  def __len__(self):
    return self.len

In [13]:
import torch

In [14]:
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer

In [15]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
from torch.utils.data import DataLoader

In [17]:
def get_loader(mode, shuffle=True):
  dataset=SADataset(mode)

  def collate_fn(batch):

    batch=sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    sentences=pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)

    SENT_LEN = sentences.size(0)

    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)

    bert_details=[]
    for sample in batch:
      text=" ".join(sample[0][1])

      encoded_bert_sent=bert_tokenizer.encode_plus(
                    text, max_length=SENT_LEN+2, add_special_tokens=True, pad_to_max_length=True)
      bert_details.append(encoded_bert_sent)

    bert_sentences = torch.LongTensor([sample["input_ids"] for sample in bert_details])
    bert_sentences_types = torch.LongTensor([sample["token_type_ids"] for sample in bert_details])
    bert_sentence_att_mask = torch.LongTensor([sample["attention_mask"] for sample in bert_details])

    return labels, bert_sentences, bert_sentences_types, bert_sentence_att_mask

  data_loader = DataLoader(dataset=dataset,
                           batch_size=64,
                           shuffle=shuffle,
                           collate_fn=collate_fn)

  return data_loader


In [18]:
train_data_loader = get_loader(mode='train', shuffle=True)

In [19]:
from transformers import BertConfig, BertModel

In [20]:
bertconfig = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
bertmodel = BertModel.from_pretrained('bert-base-uncased', config=bertconfig)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [21]:
def get_bert_output(bertmodel, bert_sent, bert_sent_type, bert_sent_mask):
  bert_output=bertmodel(input_ids=bert_sent,
                                attention_mask=bert_sent_mask,
                                token_type_ids=bert_sent_type)

  bert_output=bert_output[0]

  batch_size = bert_output.shape[0]

  masked_output=torch.mul(bert_sent_mask.unsqueeze(2), bert_output)
  mask_len=torch.sum(bert_sent_mask, dim=1, keepdim=True)
  bert_output=torch.sum(masked_output, dim=1, keepdim=False) / mask_len

  return bert_output

In [24]:
import torch.nn as nn

In [52]:
class LSTM(nn.Module):
  def __init__(self):
    super(LSTM, self).__init__()
    rnn = nn.LSTM
    hidden_size=16
    self.text_rnn1=rnn(768, hidden_size, bidirectional=False)
    self.text_rnn2=rnn(hidden_size, hidden_size, bidirectional=False)

    self.fc = nn.Linear(hidden_size, 2)

  def forward(self, x):
    _, (vec1, _) = self.text_rnn1(x)
    _, (vec2, _) = self.text_rnn2(vec1)
    vec2=torch.squeeze(vec2, dim=0)
    print('vec2:', vec2.shape)
    final_out = self.fc(vec2)
    print('f:', final_out.shape)

    return final_out

In [53]:
my_model=LSTM()

In [59]:
from torch import optim

In [61]:
loss_fn = nn.CrossEntropyLoss(reduction='mean')
optimizer = optim.Adam(filter(lambda p: p.requires_grad, my_model.parameters()), lr=0.001)

In [55]:
for batch in train_data_loader:
  label, bert_sent, bert_sent_type, bert_sent_mask = batch

  bert_output = get_bert_output(bertmodel, bert_sent, bert_sent_type, bert_sent_mask)

  bert_output = torch.unsqueeze(bert_output,dim=0)


  pred = my_model(bert_output)

  print(pred[:5], label[:5])
  assert(False)



vec2: torch.Size([64, 16])
f: torch.Size([64, 2])
tensor([[-0.1950,  0.1266],
        [-0.1930,  0.1353],
        [-0.1878,  0.1136],
        [-0.2020,  0.1368],
        [-0.2319,  0.1214]], grad_fn=<SliceBackward0>) tensor([1., 0., 1., 1., 0.], dtype=torch.float64)


AssertionError: 