In [16]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as F
from transformers import BertTokenizer
from transformers import AdamW ##新ㄉ 好像比較好
from transformers import RobertaTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

In [2]:
column = ['type','title','text']
train = pd.read_csv('./data_after_sep/train.tsv',sep='\t',names=column)
test =pd.read_csv('./data_after_sep/dev.tsv',sep='\t',names=column)

In [3]:
train_x = train['text']
train_y = train['type']
train_x = np.array(train_x.tolist())
train_y = np.array(train_y)
print(train_x.shape)
print(train_y.shape)

(25546,)
(25546,)


In [4]:
test_x = test['text']
test_y = test['type']
test_x = np.array(test_x.tolist())
test_y = np.array(test_y)
print(test_x.shape)
print(test_y.shape)

(5000,)
(5000,)


In [5]:
MAX_LENGTH = 512

tokenizer = BertTokenizer.from_pretrained('./chinese_roberta_wwm/')
input_dict = tokenizer.batch_encode_plus(train_x,
                                         add_special_tokens = True,
                                         max_length = MAX_LENGTH,
                                         return_special_tokens_mask = True,
                                         pad_to_max_length = True,
                                         return_tensors = 'pt')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
class TrainDataset(Dataset):
    def __init__(self,input_dict,y):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.y = y
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        y = self.y[idx]
        return inputid , tokentype , attentionmask , y
    
    def __len__(self):
        return len(self.input_ids)
    
class TestDataset(Dataset):
    def __init__(self, input_dict):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
       
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        return inputid , tokentype , attentionmask 
    
    def __len__(self):
        return len(self.input_ids)
    

In [7]:
BATCH_SIZE = 4
trainset = TrainDataset(input_dict,train_y)
trainloader = DataLoader(trainset,batch_size = BATCH_SIZE)

In [8]:
test_dict = tokenizer.batch_encode_plus(test_x,
                                         add_special_tokens = True,
                                         max_length = MAX_LENGTH,
                                         return_special_tokens_mask = True,
                                         pad_to_max_length = True,
                                         return_tensors = 'pt')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
TEST_BATCH_SIZE = 64
testset = TrainDataset(test_dict,test_y)
testloader = DataLoader(testset,batch_size = TEST_BATCH_SIZE)

In [11]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    predictions_withoutmax = None
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            tokens, segments, masks = data[:3]
            outputs = model(input_ids=tokens, 
                            token_type_ids=segments, 
                            attention_mask=masks)
            
            logits = outputs[0]
            after_softmax = F.softmax(logits.data, dim=1)
            _, pred = torch.max(after_softmax, 1)
            
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc

In [20]:
from torch.autograd import Variable
from transformers import BertForSequenceClassification
# from transformers import RobertaForSequenceClassification
NUM_LABELS = 7
model = BertForSequenceClassification.from_pretrained('./chinese_roberta_wwm/',num_labels = NUM_LABELS)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
model.train()

type_weight = torch.FloatTensor([4.0160352e+00, 4.8995013e+00, 1.0133280e+01, 9.4230911e+00,
7.3134841e+00, 1.2293551e+01, 8.0637626e+00]).to(device)
type_loss_func = F.CrossEntropyLoss(weight=type_weight)
optimizer = AdamW(model.parameters(),lr = 2e-5)

EPOCHS = 5
for epoch in range(EPOCHS):
    step = 0
    running_loss = 0
    total = 0
    correct = 0
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors, 
                            labels=labels)
#         loss = outputs[0]
        loss =0.0
        pred = outputs[1]
        weight_loss = type_loss_func(pred,labels)
        loss += weight_loss
        total += pred.size()[0]
        pred = torch.argmax(pred,dim=-1)
        correct += (pred==labels).sum().item()

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    torch.save(model.state_dict(),'./roberta_weight'+str(epoch)+'.pkl')
    prediction, acc = get_predictions(model, trainloader, compute_acc=True)
    test_pred,test_acc = get_predictions(model, testloader, compute_acc=True)
    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))
    print(test_acc)

Some weights of the model checkpoint at ./chinese_roberta_wwm/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

cuda:0


RuntimeError: CUDA error: device-side assert triggered

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
model.load_state_dict(torch.load('./roberta_state_dict1.pkl'))
model.eval()
prediction, acc = get_predictions(model, testloader, compute_acc=True)

print(acc)

cuda:0


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.76 GiB total capacity; 100.07 MiB already allocated; 10.56 MiB free; 104.00 MiB reserved in total by PyTorch)

In [14]:
print(acc)

0.7712
