#### for colab

In [None]:
# Colab-only setup, kept disabled: installs the dependencies and opens an SSH tunnel
# into the runtime via cloudflared. Uncomment only when running on Colab.
# NOTE(review): if this is re-enabled, do not keep a hardcoded SSH password in the
# notebook; read it from a secret or prompt for it instead.
# !pip install transformers
# !pip install colab-ssh --upgrade

# from colab_ssh import launch_ssh_cloudflared, init_git_cloudflared
# launch_ssh_cloudflared(password='0000')

In [None]:
# Colab-only setup, kept disabled: mounts Google Drive and defines GDRIVE_HOME, which
# the commented-out `research_root` lines in the "Load Dataset" cell refer to.
# from google.colab import drive

# # mount Google Drive
# drive.mount('/content/drive', force_remount=True)
# GDRIVE_HOME = '/content/drive/MyDrive'

## Experiment options

In [5]:
from easydict import EasyDict
import torch

# Central experiment configuration: every later cell reads its settings from `opt`.
opt = EasyDict(
    dataset_series='SemEval-16',  # SemEval-16, sentihood
    dataset_domain='restaurant',  # restaurant / laptop
    subtask='sub1',  # sub1: sentence, sub2: document(full review)
    num_classes=3,  # negative, positive, neutral, (+ conflict)
    max_length=200,
    model_name='bert_high_attention_top_k_lastid_rnn',  # bert_intermediate_base / bert_intermediate_att
    pos=True,
    lastid=True,
    valset_ratio=0.2,
    batch_size=16,
    num_layers=6,  # bert intermediate
    num_epochs=10,
    seed=42,
    log_step=100,
    patience=5,
    # Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

print(opt.device)

cpu


## Load Dataset

In [7]:
import os, sys
# research_root = os.path.join(GDRIVE_HOME, 'research')
# sys.path.append(research_root)

# Pick the train/test CSV files that belong to the configured dataset family.
if opt.dataset_series == 'SemEval-16':
    path = 'dataset/{}/semeval16_{}_{}.csv'.format(opt.dataset_series, opt.subtask, opt.dataset_domain)
    path_test = 'dataset/{}/semeval16_{}_{}_test.csv'.format(opt.dataset_series, opt.subtask, opt.dataset_domain)
elif opt.dataset_series == 'sentihood':
    path = 'dataset/{}/sentihood_train.csv'.format(opt.dataset_series)
    path_test = 'dataset/{}/sentihood_test.csv'.format(opt.dataset_series)
else:
    # Fail here with a clear message; otherwise `path` stays undefined and the
    # read below dies with an unrelated-looking NameError.
    raise ValueError(
        "Unsupported opt.dataset_series: {!r} (expected 'SemEval-16' or 'sentihood')".format(opt.dataset_series)
    )

import pandas as pd

# Both splits are read as-is; the CSVs carry a saved index column ("Unnamed: 0").
df_train = pd.read_csv(path)
df_test = pd.read_csv(path_test)

print('length of train set: {:,}'.format(len(df_train)))
print('length of test set: {:,}'.format(len(df_test)))

df_train.head()

length of train set: 2,507
length of test set: 859


Unnamed: 0,re_idx,idx,sentence,category,polarity,target,from,to
0,0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,negative,place,51,56
1,1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,negative,staff,75,80
2,2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,negative,,0,0
3,3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,negative,food,4,8
4,3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,negative,portions,52,60


In [9]:
from data_utils import Category_Classification_Dataset as Dataset
from transformers import BertTokenizer

# One shared tokenizer so the train and test sets are encoded identically.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Settings common to both splits.
# NOTE(review): pos_encoding is hard-coded to False although opt.pos is True - confirm which is intended.
dataset_kwargs = dict(tokenizer=tokenizer, max_length=opt.max_length, pos_encoding=False)

trainset = Dataset(df=df_train, **dataset_kwargs)
testset = Dataset(df=df_test, **dataset_kwargs)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

2,507 samples in this dataset
859 samples in this dataset


In [10]:
# Inspect one training example twice: first via get_sample(), then as the raw
# item the dataset returns when indexed.
sample_idx = 423
print(trainset.get_sample(sample_idx))
print('-' * 30)
print(trainset[sample_idx])

Sentence: I have reservations about the all you can eat deal, however -- the choices are fairly limited and you can probably order more food than you can eat for less than $18 by just going off the menu.
Aspect Category: FOOD#STYLE_OPTIONS
Polarity: negative
Input IDs: tensor([[  101,  1045,  2031, 17829,  2055,  1996,  2035,  2017,  2064,  4521,
          3066,  1010,  2174,  1011,  1011,  1996,  9804,  2024,  7199,  3132,
          1998,  2017,  2064,  2763,  2344,  2062,  2833,  2084,  2017,  2064,
          4521,  2005,  2625,  2084,  1002,  2324,  2011,  2074,  2183,  2125,
          1996, 12183,  1012,   102,  2833,  2806,  1035,  7047,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,    

In [11]:
from torch.utils.data import random_split
import random
import numpy as np

# Seed every random number generator in play so the split and the training run are repeatable.
SEED = opt.seed
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms
torch.backends.cudnn.benchmark = False  # no cuDNN autotuning, which may pick different kernels per run
os.environ['PYTHONHASHSEED'] = str(SEED)

# Carve the validation set out of the training data; the test set is used untouched.
val_ratio = opt.valset_ratio
num_val = int(len(trainset) * val_ratio)
num_train = len(trainset) - num_val
split_generator = torch.Generator().manual_seed(SEED)  # dedicated generator keeps the split independent of global RNG state
train_set, val_set = random_split(trainset, [num_train, num_val], generator=split_generator)
#train_asp_idx, val_asp_idx = random_split(train_asp_idxs, [num_train, num_val], generator=torch.Generator().manual_seed(SEED)) # get aspect index
test_set = testset

split_sizes = (len(train_set), len(val_set), len(test_set))
print('Ratio of datasets: {} : {} : {}'.format(*split_sizes))

Ratio of datasets: 2006 : 501 : 859


In [12]:
from torch.utils.data import DataLoader

# One loader per split; only the training loader reshuffles its samples each epoch.
train_loader, val_loader, test_loader = (
    DataLoader(dataset=split, batch_size=opt.batch_size, shuffle=reshuffle)
    for split, reshuffle in ((train_set, True), (val_set, False), (test_set, False))
)

In [13]:
# Pull a single batch to eyeball its structure. The built-in next() is the Python 3
# iterator protocol; the `.next()` method is a Python 2 leftover that current
# PyTorch DataLoader iterators no longer provide.
sample_batch = next(iter(train_loader))
sample_batch

{'input_ids': tensor([[[  101,  1996,  3295,  ...,     0,     0,     0]],
 
         [[  101,  2022,  2469,  ...,     0,     0,     0]],
 
         [[  101,  1996, 24857,  ...,     0,     0,     0]],
 
         ...,
 
         [[  101,  1996, 14163,  ...,     0,     0,     0]],
 
         [[  101,  2326,  2003,  ...,     0,     0,     0]],
 
         [[  101,  2204, 20861,  ...,     0,     0,     0]]]),
 'attention_masks': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         ...,
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]]),
 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         ...,
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]]]),
 'labels': tensor([1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1

## Model

1. pair 단어들 (102번 사이)과 첫 문장 단어들 간의 attention score 합을 기준으로 top-k개 단어 선별
    - 그 단어들의 mean pool
    - 그 단어들을 rnn layer에?


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertForSequenceClassification

class Bert_Att_Scores(nn.Module):
    """BERT classifier that pools the sentence tokens most attended to by the aspect tokens.

    For every sample, the last-layer attention (averaged over heads) of the aspect
    positions is summed, the ``top_k`` highest scoring positions inside the
    sentence span are selected, and their last hidden states are pooled
    ('sum', 'mean'/'average' or 'concat') and fed to a linear classifier.

    Args:
        opt: options object; ``opt.num_classes`` and ``opt.device`` are read.
        embed_dim: size of BERT's hidden states (input width of the classifier).
        fc_hidden_dim: stored on the instance; not used by the current layers.
        top_k: number of highest-attention tokens to keep per sample.
        att_pooling: how the selected hidden states are combined.
    """

    def __init__(self, opt, embed_dim=768, fc_hidden_dim=128, top_k=3, att_pooling='mean'):
        super(Bert_Att_Scores, self).__init__()
        self.num_classes = opt.num_classes
        self.embed_dim = embed_dim
        self.fc_hidden_dim = fc_hidden_dim
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.top_k = top_k
        self.att_pooling = att_pooling
        if att_pooling == 'concat':
            # Concatenation keeps all k vectors side by side, so the classifier input is k times wider.
            self.fc1 = nn.Linear((self.embed_dim)*top_k, self.num_classes)
        else:
            self.fc1 = nn.Linear(self.embed_dim, self.num_classes)
        # NOTE(review): dropout is created but never applied in forward() - confirm whether that is intended.
        self.dropout = nn.Dropout(p=0.5)
        #self.fc2 = nn.Linear(self.fc_hidden_dim, self.num_classes)
        self.device = opt.device

    def forward(self, input_ids, att_mask, token_ids, pos_ids, last_ids):
        """Compute class logits for a batch.

        Args:
            input_ids, att_mask, token_ids: forwarded to BERT as ``input_ids``,
                ``attention_mask`` and ``token_type_ids``.
            pos_ids: one entry per sample; positions whose value is 0 are taken
                as the aspect tokens (presumably a relative-position encoding -
                verify against the dataset). Each entry is flattened, so both
                ``(seq,)`` and ``(1, seq)`` per-sample shapes are accepted.
            last_ids: one integer (or one-element tensor) per sample: the last
                position that may be selected as a top-k token.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            IndexError: if a sample has no position marked with 0 in ``pos_ids``.
        """
        # Aspect positions of each sample. The mask must be built from the
        # sample's own row: running nonzero() on the whole batch yields batch
        # indices, not token positions.
        asp_ids = [
            (sample_pos.reshape(-1) == 0).nonzero(as_tuple=True)[0].tolist()
            for sample_pos in pos_ids
        ]

        # Only the attentions and the last hidden state are consumed below.
        # `encoder_hidden_states` is a cross-attention tensor input, not an
        # output switch, so it is deliberately not passed.
        output_dict = self.bert(input_ids, attention_mask=att_mask, token_type_ids=token_ids,
                                output_attentions=True, return_dict=True)

        # Last-layer attention averaged over heads: (batch, heads, seq, seq) -> (batch, seq, seq).
        in_batch_atts = output_dict.attentions[-1].mean(dim=1)

        top_k_idx = list()
        for att, asp, last in zip(in_batch_atts, asp_ids, last_ids):
            if not asp:
                raise IndexError('sample has no aspect position (no 0 entry in pos_ids)')
            last = int(last)
            # Sum the attention rows of the contiguous aspect span -> one score per token, shape (seq,).
            scores = att[asp[0]:(asp[-1] + 1), :].sum(dim=0)
            # Rank positions 1..last (position 0, [CLS], is skipped); +1 maps the
            # slice-relative indices back to sequence positions.
            idxs = torch.sort(scores[1:last + 1], descending=True).indices[:self.top_k] + 1
            top_k_idx.append(idxs)

        # Pool the hidden states of the selected tokens and classify.
        hids = output_dict.last_hidden_state
        output = self.get_k_hiddens(last_hiddens=hids, idx_list=top_k_idx, pooling=self.att_pooling)
        output = self.fc1(output)
        return output

    def get_k_hiddens(self, last_hiddens, idx_list, pooling='mean'):
        '''
        Gather the selected hidden states of every sample and pool them.

        @args
        last_hiddens: bert last hidden states, one (seq, hidden) slice per sample
        idx_list: top_k_idxs, one index tensor per sample
        pooling: how get final rep. vectors, 'sum', 'mean' (alias 'average'), 'concat'

        @returns
        tensor of shape (batch_size, hidden), or (batch_size, hidden*k) for 'concat'

        @raises
        ValueError: if pooling is not one of the supported modes
        '''
        if pooling not in ('sum', 'mean', 'average', 'concat'):
            # Reject unknown modes up front instead of failing later in torch.cat on an empty list.
            raise ValueError("Unknown pooling mode: {!r} (expected 'sum', 'mean', 'average' or 'concat')".format(pooling))

        final = list()
        for idx, hid in zip(idx_list, last_hiddens):
            selected = hid[idx, :]  # hidden states of the chosen tokens
            if pooling == 'sum':
                pooled = torch.sum(selected, dim=0).unsqueeze(0)
            elif pooling == 'concat':
                pooled = selected.reshape(1, -1)
            else:  # 'mean' / 'average'
                pooled = torch.mean(selected, dim=0).unsqueeze(0)
            final.append(pooled)
        final = torch.cat(final, dim=0)  # stack the per-sample rows into one batch tensor
        return final