In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
!git clone https://github.com/p208p2002/albert-zh-for-pytorch-transformers.git albert

fatal: destination path 'albert' already exists and is not an empty directory.


In [5]:
!wget https://raw.githubusercontent.com/p208p2002/taipei-QA-BERT/master/Taipei_QA_new.txt

--2022-09-10 09:54:52--  https://raw.githubusercontent.com/p208p2002/taipei-QA-BERT/master/Taipei_QA_new.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 880722 (860K) [text/plain]
Saving to: ‘Taipei_QA_new.txt’


2022-09-10 09:54:53 (15.3 MB/s) - ‘Taipei_QA_new.txt’ saved [880722/880722]



In [6]:
!mkdir trained_model

In [7]:
import torch
from torch.utils.data import TensorDataset
import pickle

In [8]:
import sys 
sys.path.append('.')
sys.path

['/content',
 '/env/python',
 '/usr/lib/python37.zip',
 '/usr/lib/python3.7',
 '/usr/lib/python3.7/lib-dynload',
 '',
 '/usr/local/lib/python3.7/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.7/dist-packages/IPython/extensions',
 '/root/.ipython',
 '.']

In [9]:
# from albert.albert_zh import AlbertConfig, AlbertTokenizer, AlbertForSequenceClassification

In [10]:
def use_model(model_name, config_file_path, model_file_path, vocab_file_path, num_labels):
    # 選擇模型並加載設定
    if(model_name == 'bert'):
        from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
        model_config, model_class, model_tokenizer = (BertConfig, BertForSequenceClassification, BertTokenizer)
        config = model_config.from_pretrained(config_file_path,num_labels = num_labels)
        model = model_class.from_pretrained(model_file_path, from_tf=bool('.ckpt' in 'bert-base-chinese'), config=config)
        tokenizer = model_tokenizer(vocab_file=vocab_file_path)
        return model, tokenizer
    elif(model_name == 'albert'):
        from albert.albert_zh import AlbertConfig, AlbertTokenizer, AlbertForSequenceClassification
        model_config, model_class, model_tokenizer = (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
        config = model_config.from_pretrained(config_file_path,num_labels = num_labels)
        model = model_class.from_pretrained(model_file_path, config=config)
        tokenizer = model_tokenizer.from_pretrained(vocab_file_path)
        return model, tokenizer

In [11]:
def compute_accuracy(y_pred, y_target):
    # 計算正確率
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [12]:
def to_bert_ids(tokenizer,q_input):
    # 將文字輸入轉換成對應的id編號
    return tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q_input)))

In [13]:
def make_dataset(input_ids, input_masks, input_segment_ids, answer_lables):
    all_input_ids = torch.tensor([input_id for input_id in input_ids], dtype=torch.long)
    all_input_masks = torch.tensor([input_mask for input_mask in input_masks], dtype=torch.long)
    all_input_segment_ids = torch.tensor([input_segment_id for input_segment_id in input_segment_ids], dtype=torch.long)
    all_answer_lables = torch.tensor([answer_lable for answer_lable in answer_lables], dtype=torch.long)    
    return TensorDataset(all_input_ids, all_input_masks, all_input_segment_ids, all_answer_lables)

In [14]:
def split_dataset(full_dataset, split_rate=0.8):  
    train_size = int(split_rate * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
    return train_dataset,test_dataset

In [15]:
class DataDic(object):
    def __init__(self, answers):
        self.answers = answers #全部答案(含重複)
        self.answers_norepeat = sorted(list(set(answers))) # 不重複
        self.answers_types = len(self.answers_norepeat) # 總共多少類
        self.ans_list = [] # 用於查找id或是text的list
        self._make_dic() # 製作字典
    
    def _make_dic(self):
        for index_a,a in enumerate(self.answers_norepeat):
            if a != None:
                self.ans_list.append((index_a,a))

    def to_id(self,text):
        for ans_id,ans_text in self.ans_list:
            if text == ans_text:
                return ans_id

    def to_text(self,id):
        for ans_id,ans_text in self.ans_list:
            if id == ans_id:
                return ans_text

    @property
    def types(self):
        return self.answers_types
    
    @property
    def data(self):
        return self.answers

    def __len__(self):
        return len(self.answers)

In [16]:
def convert_data_to_feature(tokenizer, train_data_path):
    with open(train_data_path,'r',encoding='utf-8') as f:
        data = f.read()
    qa_pairs = data.split("\n")

    questions = []
    answers = []
    for qa_pair in qa_pairs:
        qa_pair = qa_pair.split()
        try:
            a,q = qa_pair
            questions.append(q)
            answers.append(a)
        except:
            continue
    
    assert len(answers) == len(questions)
    
    ans_dic = DataDic(answers)
    question_dic = DataDic(questions)

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        bert_ids = to_bert_ids(tokenizer,q)
        if(len(bert_ids)>max_seq_len):
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
        # print(tokenizer.convert_ids_to_tokens(tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))))
    
    print("最長問句長度:",max_seq_len)
    assert max_seq_len <= 512 # 小於BERT-base長度限制

    # 補齊長度
    for q in q_tokens:
        while len(q)<max_seq_len:
            q.append(0)
    
    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))
        # print (ans_dic.to_id(a))
    
    # BERT input embedding
    answer_lables = a_labels
    input_ids = q_tokens
    input_masks = [[1]*max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0]*max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {'input_ids':input_ids,
                    'input_masks':input_masks,
                    'input_segment_ids':input_segment_ids,
                    'answer_lables':answer_lables,
                    'question_dic':question_dic,
                    'answer_dic':ans_dic}
    
    output = open('trained_model/data_features.pkl', 'wb')
    pickle.dump(data_features,output)
    return data_features

In [17]:
!pip install transformers==2.3.0
from torch.utils.data import DataLoader
import torch
from transformers import AdamW

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==2.3.0
  Downloading transformers-2.3.0-py3-none-any.whl (447 kB)
[K     |████████████████████████████████| 447 kB 3.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 33.0 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.24.70-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 65.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 48.3 MB/s 
Collecting botocore<1.28.0,>=1.27.70
  Downloading botocore-1.27.70-py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 41.7 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloa

In [None]:
if __name__ == "__main__":    
    # BERT
#     model_setting = {
#         "model_name":"bert", 
#         "config_file_path":"bert-base-chinese", 
#         "model_file_path":"bert-base-chinese", 
#         "vocab_file_path":"bert-base-chinese-vocab.txt",
#         "num_labels":149  # 分幾類 
#     }    

    # ALBERT
    model_setting = {
        "model_name":"albert", 
        "config_file_path":"albert/albert_tiny/config.json", 
        "model_file_path":"albert/albert_tiny/pytorch_model.bin", 
        "vocab_file_path":"albert/albert_tiny/vocab.txt",
        "num_labels":149 # 分幾類
    }    

    #
    model, tokenizer = use_model(**model_setting)
    
    # setting device    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("using device",device)
    model.to(device)

    #    
    data_feature = convert_data_to_feature(tokenizer,'Taipei_QA_new.txt')
    input_ids = data_feature['input_ids']
    input_masks = data_feature['input_masks']
    input_segment_ids = data_feature['input_segment_ids']
    answer_lables = data_feature['answer_lables']
    
    #
    full_dataset = make_dataset(input_ids = input_ids, input_masks = input_masks, input_segment_ids = input_segment_ids, answer_lables = answer_lables)
    train_dataset, test_dataset = split_dataset(full_dataset, 0.9)
    train_dataloader = DataLoader(train_dataset,batch_size=16,shuffle=True)
    test_dataloader = DataLoader(test_dataset,batch_size=16,shuffle=True)    

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6, eps=1e-8)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    model.zero_grad()
    for epoch in range(30):
        running_loss_val = 0.0
        running_acc = 0.0
        for batch_index, batch_dict in enumerate(train_dataloader):
            model.train()
            batch_dict = tuple(t.to(device) for t in batch_dict)
            outputs = model(
                batch_dict[0],
                # attention_mask=batch_dict[1],
                labels = batch_dict[3]
                )
            loss,logits = outputs[:2]
            loss.sum().backward()
            optimizer.step()
            # scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            
            # compute the loss
            loss_t = loss.item()
            running_loss_val += (loss_t - running_loss_val) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(logits, batch_dict[3])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # log
            print("epoch:%2d batch:%4d train_loss:%2.4f train_acc:%3.4f"%(epoch+1, batch_index+1, running_loss_val, running_acc))
        
        running_loss_val = 0.0
        running_acc = 0.0
        for batch_index, batch_dict in enumerate(test_dataloader):
            model.eval()
            batch_dict = tuple(t.to(device) for t in batch_dict)
            outputs = model(
                batch_dict[0],
                # attention_mask=batch_dict[1],
                labels = batch_dict[3]
                )
            loss,logits = outputs[:2]
            
            # compute the loss
            loss_t = loss.item()
            running_loss_val += (loss_t - running_loss_val) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(logits, batch_dict[3])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # log
            print("epoch:%2d batch:%4d test_loss:%2.4f test_acc:%3.4f"%(epoch+1, batch_index+1, running_loss_val, running_acc))
    
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained('trained_model')

using device cpu
最長問句長度: 159


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1174.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


epoch: 1 batch:   1 train_loss:5.0477 train_acc:0.0000
epoch: 1 batch:   2 train_loss:5.0710 train_acc:0.0000
epoch: 1 batch:   3 train_loss:5.0022 train_acc:0.0000
epoch: 1 batch:   4 train_loss:5.0045 train_acc:0.0000
epoch: 1 batch:   5 train_loss:4.9994 train_acc:0.0000
epoch: 1 batch:   6 train_loss:5.0095 train_acc:0.0000
epoch: 1 batch:   7 train_loss:5.0309 train_acc:0.0000
epoch: 1 batch:   8 train_loss:5.0294 train_acc:0.0000
epoch: 1 batch:   9 train_loss:5.0261 train_acc:0.0000
epoch: 1 batch:  10 train_loss:5.0397 train_acc:0.0000
epoch: 1 batch:  11 train_loss:5.0396 train_acc:0.0000
epoch: 1 batch:  12 train_loss:5.0388 train_acc:0.0000
epoch: 1 batch:  13 train_loss:5.0361 train_acc:0.0000
epoch: 1 batch:  14 train_loss:5.0383 train_acc:0.0000
epoch: 1 batch:  15 train_loss:5.0360 train_acc:0.0000
epoch: 1 batch:  16 train_loss:5.0395 train_acc:0.0000
epoch: 1 batch:  17 train_loss:5.0330 train_acc:0.3676
epoch: 1 batch:  18 train_loss:5.0286 train_acc:0.6944
epoch: 1 b