In [66]:
"""
https://buomsoo-kim.github.io/attention/2020/03/26/Attention-mechanism-16.md/

sentiment scores:
very positive = 5
slightly positive = 4
neutral = 3
slightly negative = 2
very negative = 1

"""

'\nhttps://buomsoo-kim.github.io/attention/2020/03/26/Attention-mechanism-16.md/\n\nsentiment scores: \nvery positive = 5 \nslightly positive = 4 \nneutral = 3\nslightly negative = 2 \nvery negative = 1\n\n'

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import re

In [5]:
data = pd.read_csv("Twitter-sentiment-self-drive-DFE.csv", encoding = 'latin-1')
print(data.head())

    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  724227031     True      golden                 236               NaN   
1  724227032     True      golden                 231               NaN   
2  724227033     True      golden                 233               NaN   
3  724227034     True      golden                 240               NaN   
4  724227035     True      golden                 240               NaN   

  sentiment  sentiment:confidence  our_id sentiment_gold  \
0         5                0.7579   10001           5\n4   
1         5                0.8775   10002           5\n4   
2         2                0.6805   10003           2\n1   
3         2                0.8820   10004           2\n1   
4         3                1.0000   10005              3   

                               sentiment_gold_reason  \
0  Author is excited about the development of the...   
1  Author is excited that driverless cars will be...   
2  The author is ske

Preprocessing

In [6]:
NUM_INSTANCES = 3000
MAX_SENT_LEN = 10
tweets = []
sent_scores = []
unique_tokens = set()

for i in tqdm(range(NUM_INSTANCES)):
    rand_idx = np.random.randint(len(data))

    tweet = []
    sentences = data['text'].iloc[rand_idx].split(".")
    for sent in sentences:
        if len(sent) != 0:
            # Get only words
            sent = [x.lower() for x in re.findall(r"\w+", sent)]
            if len(sent) >= MAX_SENT_LEN:
                sent = sent[:MAX_SENT_LEN]
            else:
                for _ in range(MAX_SENT_LEN - len(sent)):
                    sent.append("<pad>")

            tweet.append(sent)
            unique_tokens.update(sent)
    tweets.append(tweet)
    if data['sentiment'].iloc[rand_idx] == "not_relevant":
        sent_scores.append(0)
    else:
        sent_scores.append(int(data["sentiment"].iloc[rand_idx]))

100%|██████████| 3000/3000 [00:00<00:00, 10509.95it/s]


In [7]:
print(len(tweets))
tweet_id = 14
print(tweets[tweet_id])
print(len(tweets[tweet_id]))
print(sent_scores[tweet_id])

3000
[['looks', 'like', 'the', 'google', 'self', 'driving', 'car', 'decided', 'to', 'drive'], ['pic', 'http', 't', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['co', 'xyssrdazge', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']]
3
3


Unique tokens

In [8]:
unique_tokens = list(unique_tokens)
print(len(unique_tokens))
print(unique_tokens[:20])

6275
['reasons', 'gladly', 'cbm', 'simpsons', 'wheel', 'wonder', 'brace', 'zl0mcvqzn5', 'n9cssuhgai', 'type', 'niche', 'y1cz3b9aul', 'technewsdaily', 'robin', 'whump', 'clarionledger', 'kickstarter', 'drudge_report', 'arduino', 'piece']


Numericalize each token

In [9]:
# encode each token into index
for i in tqdm(range(len(tweets))):
#for i in range(len(tweets)):
    for j in range(len(tweets[i])):
        tweets[i][j] = [unique_tokens.index(x) for x in tweets[i][j]]

100%|██████████| 3000/3000 [00:03<00:00, 874.94it/s]


In [10]:
print(tweets[tweet_id])
print(len(tweets[tweet_id]))

[[5931, 3665, 6075, 3459, 3668, 777, 2717, 4333, 865, 5950], [324, 893, 2970, 4182, 4182, 4182, 4182, 4182, 4182, 4182], [3244, 4370, 4182, 4182, 4182, 4182, 4182, 4182, 4182, 4182]]
3


Setting parameters

In [11]:
VOCAB_SIZE = len(unique_tokens)
NUM_CLASSES = len(set(sent_scores))
LEARNING_RATE = 1e-3
NUM_EPOCHS = 1#0
HIDDEN_SIZE = 16
EMBEDDING_DIM = 30
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Encoders

In [12]:
class wordEncoder(nn.Module):
  def __init__(self, vocab_size, hidden_size, embedding_dim):
    super(wordEncoder, self).__init__()
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.gru = nn.GRU(embedding_dim, hidden_size, bidirectional = True)

  def forward(self, word, h0):
    word = self.embedding(word).unsqueeze(0).unsqueeze(1)
    out, h0 = self.gru(word, h0)
    return out, h0

In [13]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f8e1fab4be0>

In [14]:
class sentEncoder(nn.Module):
  def __init__(self, hidden_size):
    super(sentEncoder, self).__init__()
    self.hidden_size = hidden_size
    self.gru = nn.GRU(hidden_size, hidden_size, bidirectional = True)

  def forward(self, sentence, h0):
    sentence = sentence.unsqueeze(0).unsqueeze(1)
    out, h0 = self.gru(sentence)
    return out, h0

### Hierarchical Attention Network

In [53]:
class HAN(nn.Module):
  def __init__(self, wordEncoder, sentEncoder, num_classes, device):
    super(HAN, self).__init__()
    self.wordEncoder = wordEncoder
    self.sentEncoder = sentEncoder
    self.device = device
    #self.softmax = nn.Softmax(dim=1)
    self.softmax = nn.Softmax(dim=0)

    # word-level attention
    self.word_attention = nn.Linear(self.wordEncoder.hidden_size*2, self.wordEncoder.hidden_size*2)
    self.u_w = nn.Linear(self.wordEncoder.hidden_size*2, 1, bias = False)

    # sentence-level attention
    self.sent_attention = nn.Linear(self.sentEncoder.hidden_size * 2, self.sentEncoder.hidden_size*2)
    self.u_s = nn.Linear(self.sentEncoder.hidden_size*2, 1, bias = False)

    # final layer
    self.dense_out = nn.Linear(self.sentEncoder.hidden_size*2, num_classes)
    self.log_softmax = nn.LogSoftmax(dim=0)

  def forward(self, document):
    word_attention_weights = []
    sentenc_out = torch.zeros((document.size(0), 2, self.sentEncoder.hidden_size)).to(self.device)
    # iterate on sentences
    h0_sent = torch.zeros(2, 1, self.sentEncoder.hidden_size, dtype = torch.float).to(self.device)
    for i in range(document.size(0)):
      sent = document[i]
      wordenc_out = torch.zeros((sent.size(0), 2, self.wordEncoder.hidden_size)).to(self.device)
      h0_word = torch.zeros(2, 1, self.wordEncoder.hidden_size, dtype = torch.float).to(self.device)
      # iterate on words
      for j in range(sent.size(0)):
        _, h0_word = self.wordEncoder(sent[j], h0_word)
        wordenc_out[j] = h0_word.squeeze()
      #print(wordenc_out)
      wordenc_out = wordenc_out.view(wordenc_out.size(0), -1)
      u_word = torch.tanh(self.word_attention(wordenc_out))
      #print()
      #print("u_word")
      #print(u_word)

      x = self.u_w(u_word)
      #print("~~~~~ x ~~~~~")
      #print(x)
      #print(x.shape)
      #word_weights = self.softmax(self.u_w(u_word))
      #aligned_weights_ = F.softmax(aligned_weights.unsqueeze(0))
      word_weights = self.softmax(x)

      #print()
      #print("word_weights")
      #print(word_weights)
      word_attention_weights.append(word_weights)

      sent_summ_vector = (u_word * word_weights).sum(axis=0)

      _, h0_sent = self.sentEncoder(sent_summ_vector, h0_sent)
      sentenc_out[i] = h0_sent.squeeze()
    sentenc_out = sentenc_out.view(sentenc_out.size(0), -1)
    u_sent = torch.tanh(self.sent_attention(sentenc_out))
    sent_weights = self.softmax(self.u_s(u_sent))
    doc_summ_vector = (u_sent * sent_weights).sum(axis=0)
    out = self.dense_out(doc_summ_vector)
    return word_attention_weights, sent_weights, self.log_softmax(out)

## Training

In [52]:
word_encoder = wordEncoder(VOCAB_SIZE, HIDDEN_SIZE, EMBEDDING_DIM).to(DEVICE)
sent_encoder = sentEncoder(HIDDEN_SIZE * 2).to(DEVICE)
model = HAN(word_encoder, sent_encoder, NUM_CLASSES, DEVICE).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.NLLLoss()
losses = []
weights = []

for i in tqdm(range(NUM_EPOCHS)):
    current_loss = 0
    for j in range(len(tweets[:50])):
        tweet, score = torch.tensor(tweets[j], dtype = torch.long).to(DEVICE), torch.tensor(sent_scores[j]).to(DEVICE)
        word_weights, sent_weights, output = model(tweet)
        optimizer.zero_grad()
        #current_loss += criterion(output.unsqueeze(0), score.unsqueeze(0))
        current_loss = criterion(output.unsqueeze(0), score.unsqueeze(0))
        current_loss.backward(retain_graph=True)
        optimizer.step()

    print(f"epoch {i+1}/{NUM_EPOCHS}, loss: {current_loss}")
    losses.append(current_loss.item()/(j+1))

100%|██████████| 1/1 [00:31<00:00, 31.50s/it]



word_weights
tensor([[0.0861],
        [0.0833],
        [0.1047],
        [0.1055],
        [0.1011],
        [0.1135],
        [0.1072],
        [0.1005],
        [0.0989],
        [0.0993]], grad_fn=<SoftmaxBackward>)

word_weights
tensor([[0.1048],
        [0.1181],
        [0.1057],
        [0.1000],
        [0.0972],
        [0.0958],
        [0.0951],
        [0.0946],
        [0.0944],
        [0.0943]], grad_fn=<SoftmaxBackward>)

word_weights
tensor([[0.0840],
        [0.1063],
        [0.0999],
        [0.0965],
        [0.1111],
        [0.1059],
        [0.1007],
        [0.1086],
        [0.1059],
        [0.0812]], grad_fn=<SoftmaxBackward>)

word_weights
tensor([[0.1049],
        [0.1004],
        [0.0984],
        [0.0981],
        [0.0985],
        [0.0991],
        [0.0996],
        [0.1000],
        [0.1004],
        [0.1006]], grad_fn=<SoftmaxBackward>)

word_weights
tensor([[0.0985],
        [0.0812],
        [0.0785],
        [0.0997],
        [0.1110],
        

In [82]:
with torch.no_grad():
    tweet, score = torch.tensor(tweets[50], dtype = torch.long).to(DEVICE), torch.tensor(sent_scores[j]).to(DEVICE)
    print(tweet)
    print("Class:", score.item())

    print("~~~ RESULTS ~~~")
    word_weights, sent_weights, output = model(tweet)
    print(output)
    print(word_weights)
    print(sent_weights)
    print("~~~ Prediction ~~~")
    _, idx = torch.max(output, 0)
    print("Class:",idx.item())

tensor([[3796, 1081, 4739, 5202, 4182, 4182, 4182, 4182, 4182, 4182],
        [5084, 3871, 5270,  711, 5173, 3668,  777,  968, 4182, 4182],
        [1615, 4962, 3188, 3808,  893, 2970, 4182, 4182, 4182, 4182],
        [3244,  293, 4182, 4182, 4182, 4182, 4182, 4182, 4182, 4182]])
Class: 3
~~~ RESULTS ~~~

word_weights
tensor([[0.0771],
        [0.0996],
        [0.0989],
        [0.0870],
        [0.0989],
        [0.1040],
        [0.1067],
        [0.1083],
        [0.1094],
        [0.1101]])

word_weights
tensor([[0.0886],
        [0.0913],
        [0.0833],
        [0.0881],
        [0.1001],
        [0.0974],
        [0.0992],
        [0.1079],
        [0.1194],
        [0.1247]])

word_weights
tensor([[0.0879],
        [0.0874],
        [0.0925],
        [0.0901],
        [0.0887],
        [0.0811],
        [0.1070],
        [0.1172],
        [0.1225],
        [0.1256]])

word_weights
tensor([[0.0721],
        [0.0747],
        [0.0947],
        [0.1023],
        [0.1061],
     

In [83]:
for t in tweet:
    t = t.numpy()
    sent = " ".join([unique_tokens[w] for w in t])
    print(sent)

if score.item() == idx.item():
    print(f"Correct: {score.item()}")
else:
    print(f"Truth: {score.item()}, Predicted:{idx.item()}")

testing a rover nasaames <pad> <pad> <pad> <pad> <pad> <pad>
it uses same tech as self driving cars <pad> <pad>
stateofnasa nasasocial nasa û_ http t <pad> <pad> <pad> <pad>
co pin2j8fusj <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Correct: 3
