In [1]:
# Print a hello message: install cowsay, show the OS banner through it, then show GPU status
!sudo apt install cowsay -y
!echo "Currently on $(cat /etc/issue)" | /usr/games/cowsay
!nvidia-smi

# When running on Google Colab, switch to the Google Drive working directory.
# A missing "datasets" folder in the current directory is used as the signal that
# we are not inside the project folder yet.
import os,sys,logging
if not os.path.exists("datasets"):
  os.chdir("./drive/MyDrive/ABSA")

# Create the global (root) logger: INFO level, echoed to stdout.
# NOTE(review): re-running this cell attaches one more StreamHandler each time,
# which would duplicate every log line — confirm whether that matters here.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

# Install dependencies. The grep only lets through lines that START with
# "already satisfied".
# NOTE(review): presumably meant to filter pip's "Requirement already satisfied" lines — verify the pattern.
!pip install -r requirements.txt | grep ^"already satisfied"
!/usr/games/cowsay -f vader  All python packages installed!

Reading package lists... Done
Building dependency tree       
Reading state information... Done
cowsay is already the newest version (3.03+dfsg2-4).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
 __________________________________
/ Currently on Ubuntu 18.04.5 LTS  \
|                                  |
\ \l                               /
 ----------------------------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\
                ||----w |
                ||     ||
Sun Nov 29 13:23:46 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                 

In [2]:
# 定义工具函数，类
import numpy as np
import argparse
from transformers import BertModel,BertTokenizer
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# Set the global random seed
def set_random_seed(seed):
    """Seed NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator below.

    The previous version ignored ``seed`` and read the global ``opt.seed``
    instead, so it failed with NameError before ``opt`` existed and could not
    be called with any other value.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this presumably only affects processes launched afterwards — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
# Normalize a sequence to a fixed length
def normal_sequence(sequence, maxlen, dtype='int64', value=0):
    """Return ``sequence`` truncated or right-padded to exactly ``maxlen`` items.

    Args:
        sequence: sliceable sequence of numbers.
        maxlen: length of the returned array.
        dtype: NumPy dtype of the returned array.
        value: fill value used for the padding positions.

    Returns:
        A 1-D NumPy array of length ``maxlen``.
    """
    clipped = np.asarray(sequence[:maxlen], dtype=dtype)
    padded = np.full(maxlen, value, dtype=dtype)
    # Copy the kept items to the front; everything after stays at `value`.
    padded[:len(clipped)] = clipped
    return padded
# Wraps the BERT tokenizer and adds normalization to a fixed length
class MyTokenizer:
    """BertTokenizer wrapper that returns fixed-length id arrays."""

    def __init__(self, max_seq_len, pretrained_bert_name):
        self.max_seq_len = max_seq_len
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)

    def text_to_sequence(self, text):
        """Tokenize ``text``, map tokens to ids and pad/truncate to ``max_seq_len``.

        Text that produces no tokens is represented by the single id 0.
        """
        tokens = self.tokenizer.tokenize(text)
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(ids) == 0:
            ids = [0]
        return normal_sequence(ids, self.max_seq_len)
# Processes the raw data file into a dataset usable by BERT
class MyDataset(Dataset):
    """In-memory dataset of BERT sentence-pair inputs built from a text file.

    The file is read as records of three consecutive lines:
      1. the sentence, with the aspect term replaced by ``$T$``
      2. the aspect term
      3. the polarity as an integer

    Each item is a dict with keys ``concat_bert_indices``,
    ``concat_segments_indices`` and ``polarity``.

    Args:
        fname: path of the data file.
        tokenizer: object exposing ``text_to_sequence(text)`` and ``max_seq_len``.
    """

    def __init__(self, fname, tokenizer):
        # `with` releases the file handle even if parsing below raises
        # (the previous explicit close() was skipped on any error).
        with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            lines = fin.readlines()

        all_data = []
        for i in range(0, len(lines), 3):
            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
            aspect = lines[i + 1].lower().strip()
            polarity = lines[i + 2].strip()

            text_indices = tokenizer.text_to_sequence(text_left + " " + aspect + " " + text_right)
            aspect_indices = tokenizer.text_to_sequence(aspect)
            # Non-zero ids are counted as real tokens (assumes id 0 is only padding — TODO confirm).
            aspect_len = np.sum(aspect_indices != 0)
            # Shift the label by one so it is non-negative
            # (presumably -1/0/1 in the file -> 0/1/2 — verify against the data).
            polarity = int(polarity) + 1

            text_len = np.sum(text_indices != 0)
            concat_bert_indices = tokenizer.text_to_sequence('[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]")
            # Segment 0 covers [CLS] + sentence + first [SEP]; segment 1 covers aspect + last [SEP].
            concat_segments_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1)
            concat_segments_indices = normal_sequence(concat_segments_indices, tokenizer.max_seq_len)

            data = {
                'concat_bert_indices': concat_bert_indices,
                'concat_segments_indices': concat_segments_indices,
                'polarity': polarity,
            }

            all_data.append(data)

        self.data = all_data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
# BERT-based network made of three layers: bert + dropout + fully-connected
class Network(nn.Module):
    """Classifier head on top of a BERT encoder.

    Args:
        bert: the encoder module; called as ``bert(ids, token_type_ids=...)``.
        opt: options object providing ``dropout``, ``bert_dim`` and
            ``polarities_dim``.
    """

    def __init__(self, bert, opt):
        super(Network, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(opt.dropout)
        self.dense = nn.Linear(opt.bert_dim, opt.polarities_dim)

    # Forward pass
    def forward(self, inputs):
        """Return classification logits.

        Args:
            inputs: sequence whose first element holds the token ids and whose
                second element holds the segment (token type) ids.
        """
        text_bert_indices, bert_segments_ids = inputs[0], inputs[1]
        bert_outputs = self.bert(text_bert_indices, token_type_ids=bert_segments_ids)
        # Take the second output by position instead of tuple-unpacking: when the
        # encoder returns a dict-like output object, unpacking iterates its KEYS
        # and would hand a string to dropout. Integer indexing works for both a
        # plain tuple and such an output object.
        pooled_output = bert_outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.dense(pooled_output)
        return logits
# Main class: builds the model and data, and drives training, validation and inference
class Instructor:
    """Owns the tokenizer, model and datasets and runs the training loop.

    Args:
        opt: options namespace. Attributes read by this class: ``max_seq_len``,
            ``pretrained_bert_name``, ``model_class``, ``device``,
            ``dataset_file``, ``valset_ratio``, ``inputs_cols``, ``initializer``,
            ``optimizer``, ``lr``, ``l2reg``, ``batch_size``, ``num_epoch``,
            ``log_step`` and ``patience``.
    """

    def __init__(self, opt):
        # Create the BERT-based model
        self.opt = opt
        self.tokenizer = MyTokenizer(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)

        # Load the training set once. A random slice of it becomes the validation
        # set, while training deliberately keeps using the FULL set (so the
        # validation samples are also trained on). Previously the file was parsed
        # and tokenized a second time to obtain the same full set.
        full_trainset = MyDataset(opt.dataset_file['train'], self.tokenizer)
        assert 0 < opt.valset_ratio < 1
        valset_len = int(len(full_trainset) * opt.valset_ratio)
        _, self.valset = random_split(full_trainset, (len(full_trainset) - valset_len, valset_len))
        self.trainset = full_trainset

        # Report GPU memory when running on CUDA
        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))

        self._print_args()

    def _print_args(self):
        """Log the parameter counts of the model and every option in ``opt``."""
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in self.model.parameters():
            n_params = p.numel()
            if p.requires_grad:
                n_trainable_params += n_params
            else:
                n_nontrainable_params += n_params
        logger.info('> n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))
        logger.info('> training arguments:')
        for arg in vars(self.opt):
            logger.info('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))

    def _reset_params(self):
        """Re-initialize every trainable parameter outside the BERT encoder."""
        for child in self.model.children():
            if isinstance(child, BertModel):  # keep the pretrained bert params
                continue
            for p in child.parameters():
                if not p.requires_grad:
                    continue
                if len(p.shape) > 1:
                    self.opt.initializer(p)
                else:
                    # 1-D parameters get a uniform init scaled by their length.
                    stdv = 1. / np.sqrt(p.shape[0])
                    torch.nn.init.uniform_(p, a=-stdv, b=stdv)

    def _train(self, criterion, optimizer, train_data_loader, val_data_loader):
        """Train with early stopping on validation accuracy.

        Returns:
            Path of the last checkpoint saved (the one with the best validation
            accuracy), or None if no epoch improved on the initial accuracy of 0.
        """
        max_val_acc = 0
        max_val_f1 = 0
        max_val_epoch = 0
        global_step = 0
        path = None
        for i_epoch in range(self.opt.num_epoch):
            logger.info('>' * 100)
            logger.info('epoch: {}'.format(i_epoch))
            n_correct, n_total, loss_total = 0, 0, 0
            # switch model to training mode
            self.model.train()
            for i_batch, batch in enumerate(train_data_loader):
                global_step += 1
                # clear gradient accumulators
                optimizer.zero_grad()

                inputs = [batch[col].to(self.opt.device) for col in self.opt.inputs_cols]
                outputs = self.model(inputs)
                targets = batch['polarity'].to(self.opt.device)

                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
                n_total += len(outputs)
                loss_total += loss.item() * len(outputs)
                if global_step % self.opt.log_step == 0:
                    train_acc = n_correct / n_total
                    train_loss = loss_total / n_total
                    logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))

            val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
            logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1))
            if val_acc > max_val_acc:
                max_val_acc = val_acc
                max_val_epoch = i_epoch
                os.makedirs('state_dict', exist_ok=True)
                path = 'state_dict/val_acc_{}'.format(round(val_acc, 4))
                torch.save(self.model.state_dict(), path)
                logger.info('>> saved: {}'.format(path))
            if val_f1 > max_val_f1:
                max_val_f1 = val_f1
            if i_epoch - max_val_epoch >= self.opt.patience:
                # Routed through the logger so the message also reaches any
                # file handler attached to it.
                logger.info('>> early stop.')
                break

        return path

    def _evaluate_acc_f1(self, data_loader):
        """Return ``(accuracy, macro-F1)`` of the model over ``data_loader``.

        Raises:
            ValueError: if ``data_loader`` yields no samples.
        """
        n_correct, n_total = 0, 0
        targets_per_batch, outputs_per_batch = [], []
        # switch model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i_batch, t_batch in enumerate(data_loader):
                t_inputs = [t_batch[col].to(self.opt.device) for col in self.opt.inputs_cols]
                t_targets = t_batch['polarity'].to(self.opt.device)
                t_outputs = self.model(t_inputs)

                n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
                n_total += len(t_outputs)

                targets_per_batch.append(t_targets)
                outputs_per_batch.append(t_outputs)

        if n_total == 0:
            raise ValueError('cannot evaluate on an empty data loader')

        # Concatenate once at the end instead of re-copying on every batch.
        t_targets_all = torch.cat(targets_per_batch, dim=0)
        t_outputs_all = torch.cat(outputs_per_batch, dim=0)

        acc = n_correct / n_total
        f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average='macro')
        return acc, f1

    def _encode_pair(self, text_left, aspect, text_right):
        """Build the fixed-length token-id and segment-id arrays for one sample."""
        sentence = text_left + " " + aspect + " " + text_right
        text_indices = self.tokenizer.text_to_sequence(sentence)
        aspect_indices = self.tokenizer.text_to_sequence(aspect)
        # Non-zero ids are counted as real tokens (assumes id 0 is only padding — TODO confirm).
        aspect_len = np.sum(aspect_indices != 0)
        text_len = np.sum(text_indices != 0)
        concat_bert_indices = self.tokenizer.text_to_sequence('[CLS] ' + sentence + ' [SEP] ' + aspect + " [SEP]")
        # Segment 0 covers [CLS] + sentence + first [SEP]; segment 1 covers aspect + last [SEP].
        concat_segments_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1)
        concat_segments_indices = normal_sequence(concat_segments_indices, self.tokenizer.max_seq_len)
        return {
            'concat_bert_indices': concat_bert_indices,
            'concat_segments_indices': concat_segments_indices,
        }

    def evaluate(self, text, aspect):
        """Predict class probabilities for ``aspect`` inside ``text``.

        Args:
            text: the full sentence, containing the aspect term.
            aspect: the aspect term; the sentence is split around its first
                (case-insensitive) occurrence.

        Returns:
            NumPy array of shape (1, n_classes) with softmax probabilities.
        """
        aspect = aspect.lower().strip()
        text_left, _, text_right = [s.strip() for s in text.lower().partition(aspect)]
        data = self._encode_pair(text_left, aspect, text_right)

        # Add the batch dimension explicitly.
        t_inputs = [torch.as_tensor(data[col], device=self.opt.device).unsqueeze(0) for col in self.opt.inputs_cols]

        # Inference must not depend on the caller having disabled autograd or
        # switched to eval mode: without no_grad() the .numpy() call fails on a
        # tensor that requires grad, and in train mode dropout stays active.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                t_outputs = self.model(t_inputs)
                t_probs = F.softmax(t_outputs, dim=-1).cpu().numpy()
        finally:
            # Leave the model in whatever mode the caller had it.
            self.model.train(was_training)

        return t_probs

    def run(self):
        """Train the model, then reload the best checkpoint into ``self.model``."""
        # Loss and Optimizer
        _params = filter(lambda p: p.requires_grad, self.model.parameters())
        optimizer = self.opt.optimizer(_params, lr=self.opt.lr, weight_decay=self.opt.l2reg)
        train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True)
        val_data_loader = DataLoader(dataset=self.valset, batch_size=self.opt.batch_size, shuffle=False)
        self._reset_params()

        # Train and remember where the best model was stored
        best_model_path = self._train(nn.CrossEntropyLoss(), optimizer, train_data_loader, val_data_loader)

        # _train returns None when no checkpoint was ever written; loading None
        # would fail, so keep the current weights in that case.
        if best_model_path is None:
            logger.warning('>> no checkpoint was saved; keeping the current weights')
            return

        # Reload the best model into this instance
        self.model.load_state_dict(torch.load(best_model_path))
        logger.info(best_model_path)

In [3]:
# Argument specification — hyper-parameter tuning happens here.
# The string below is parsed as if it were the command line; anything not listed
# falls back to the defaults declared on the parser.
my_args = "--seed 2020 --lr 1e-5 --pretrained_bert_name bert-ada"

parser = argparse.ArgumentParser()
parser.add_argument('--lr', default=2e-5, type=float)
parser.add_argument('--dropout', default=0.1, type=float)
parser.add_argument('--l2reg', default=0.01, type=float)
parser.add_argument('--num_epoch', default=20, type=int)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--log_step', default=10, type=int)
parser.add_argument('--bert_dim', default=768, type=int)
# NOTE(review): the default uses underscores; if this is meant to be the hub
# model id rather than a local directory it may need to be 'bert-base-uncased' — confirm.
parser.add_argument('--pretrained_bert_name', default='bert_base_uncased', type=str)
parser.add_argument('--max_seq_len', default=85, type=int)
parser.add_argument('--polarities_dim', default=3, type=int)
parser.add_argument('--patience', default=5, type=int)
parser.add_argument('--seed', default=1234, type=int, help='set seed for reproducibility')
parser.add_argument('--valset_ratio', default=0.1, type=float, help='set ratio between 0 and 1 for validation support')

opt = parser.parse_args(args=my_args.split())
# Set the random seed so that repeated training runs give the same result
set_random_seed(opt.seed)
# Options that are Python objects rather than command-line values are attached directly
opt.model_class = Network
opt.dataset_file = {'train': './datasets/train.txt'}
opt.inputs_cols = ['concat_bert_indices', 'concat_segments_indices']
opt.initializer = torch.nn.init.xavier_uniform_
opt.optimizer = torch.optim.Adam
opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build tokenizer, model and datasets (loads the pretrained weights named above)
ins = Instructor(opt)
!/usr/games/cowsay "Model Set Up~ Start Training!"

OSError: Unable to load weights from pytorch checkpoint file for 'bert-ada' at 'bert-ada/pytorch_model.bin'If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. 

In [None]:
# Training run. Skip this cell when you only want to load a saved model and evaluate it.
from time import strftime, localtime
if not os.path.exists('logs'):
    os.mkdir('logs')
# Log file name = timestamp followed by the raw argument string (spaces included)
log_file = './logs/{}{}.log'.format(strftime("%y%m%d-%H%M", localtime()),my_args)
# From here on the global logger also writes to that file.
logger.addHandler(logging.FileHandler(log_file))

ins.run()

In [None]:
# Re-seed so results are the same whether we just trained or only loaded a model
set_random_seed(opt.seed)

# Load the current best model. Training is not always re-run, so the checkpoint
# is named by hand here.
ins.model.load_state_dict(torch.load('state_dict/leader-0.9092'))
ins.model = ins.model.to(opt.device)
ins.model.eval()
torch.autograd.set_grad_enabled(False)

# Generate the predictions.
# The loop treats a line containing "$T$" as a sentence and every other
# non-empty line as an aspect term for the most recent sentence.
# `with` closes both files even if a prediction raises midway.
with open('./datasets/test.txt', 'r', encoding='utf-8') as rfile, \
        open('./datasets/181220010.txt', 'w', encoding='utf-8') as wfile:
    last_line = None
    for line in rfile:
        line = line.strip(' \n')
        if not line:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        if "$T$" in line:
            last_line = line
            continue
        if last_line is None:
            raise ValueError('aspect line {!r} appears before any sentence line'.format(line))
        # Split on the exact placeholder: splitting on a bare "$" picked the wrong
        # pieces whenever the sentence contained another dollar sign.
        text_left, _, text_right = last_line.partition("$T$")
        new_line = text_left + line + text_right
        print(new_line)
        # argmax gives the class index 0..2; subtract 1 to shift it back down
        # (presumably to the -1/0/1 labels used in the data files — verify).
        segmentation = ins.evaluate(new_line, line).argmax(axis=-1) - 1
        wfile.write("{}\n".format(segmentation[0]))
