In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer
# from IPython.display import clear_output
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification


In [2]:
from transformers import BertForPreTraining
lm_path = './bert_wwm_pretrain_tbrain/'
tokenizer = BertTokenizer.from_pretrained(lm_path)
model = BertForPreTraining.from_pretrained(lm_path)

In [3]:
class TrainDataset(Dataset):
    def __init__(self, input_dict, y ):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.y = y
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        y = self.y[idx]

        return inputid , tokentype , attentionmask, y
    
    def __len__(self):
        return len(self.input_ids)
    

In [21]:
import pandas as pd
train_df = pd.read_csv('./tbrain/train_name_2word_all.csv')
train_df.columns = ['name','ans']

In [22]:
x = train_df['name'].tolist()
y = train_df['ans'].tolist()

x = np.array(x)
y = np.array(y)

In [23]:
print(len(x))
print(len(y))

13230
13230


In [24]:
input_dict = tokenizer.batch_encode_plus(x, 
                                         add_special_tokens=True,
                                         max_length=4,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt')
BATCH_SIZE = 32
trainset = TrainDataset(input_dict,y)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [25]:
print(len(input_dict['input_ids']))

13230


In [26]:
from transformers import BertForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'
print("device:", device)

lm_path = './bert_wwm_pretrain_tbrain/'
NUM_LABELS = 2
tokenizer = BertTokenizer.from_pretrained(lm_path)
model = BertForSequenceClassification.from_pretrained(lm_path,num_labels=NUM_LABELS)
model = model.to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

EPOCHS = 3  # 幸運數字
for epoch in range(EPOCHS):
    running_loss = 0.0
    total = 0
    correct = 0
    for data in trainloader:
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors, 
                            labels=labels)
        loss = outputs[0]
        
        
        pred = outputs[1]
        total += pred.size()[0]
        pred = torch.argmax(pred,dim=-1)
        correct += (pred==labels).sum().item()

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        
    torch.save(model.state_dict(),'./TB_multispan/Bert_wwm_name_model_2words_all_' + str(epoch) + '.pkl')
    print('[epoch %d] loss: %.3f' %(epoch + 1, running_loss))
    print(correct/total)



device: cpu


Some weights of the model checkpoint at ./bert_wwm_pretrain_tbrain/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

[epoch 1] loss: 14.843
0.9863189720332578
[epoch 2] loss: 3.263
0.9979591836734694
[epoch 3] loss: 2.187
0.9988662131519275


In [57]:
# testing

class TestDataset(Dataset):
    def __init__(self, input_dict ):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]

        return inputid , tokentype , attentionmask
    
    def __len__(self):
        return len(self.input_ids)
    
    
def two_words_is_name(name):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = 'cpu'
    print("device:", device)


    lm_path = './bert_wwm_pretrain_tbrain/'
    tokenizer = BertTokenizer.from_pretrained(lm_path)


    input_dict = tokenizer.encode_plus(name, 
                                         add_special_tokens=True,
                                         max_length=4,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt')
    BATCH_SIZE = 1
    testset = TestDataset(input_dict)
    testloader = DataLoader(testset, batch_size=BATCH_SIZE)


    model = BertForSequenceClassification.from_pretrained(lm_path,num_labels=NUM_LABELS)
    check_point = './TB_multispan/Bert_wwm_name_model_2words_all_2.pkl'
    model.load_state_dict(torch.load(check_point))
    model = model.to(device)
    model.eval()


    with torch.no_grad():
        for data in testloader:
            tokens_tensors, segments_tensors, \
            masks_tensors = [t.to(device) for t in data]
            optimizer.zero_grad()
            outputs = model(input_ids=tokens_tensors, 
                                token_type_ids=segments_tensors, 
                                attention_mask=masks_tensors)
            pred = torch.argmax(outputs[0][0] , dim = 0)
            return pred.item()
        
        
        
        



In [63]:
two_words_is_name('陳男')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


device: cpu


Some weights of the model checkpoint at ./bert_wwm_pretrain_tbrain/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

0