In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification


In [2]:
# Load the train / test / dev splits from tab-separated files.
# `names=` supplies the column labels, so the first line of each file is read as data.
# 'type' is used as the class label and 'text' as the model input in the cells below.
column_names = ['type','title','text']
dftrain = pd.read_csv('./data_after_sep/train.tsv',sep='\t',names=column_names)
dftest = pd.read_csv('./data_after_sep/test.tsv',sep='\t',names=column_names)
# NOTE(review): dfdev is loaded here but not referenced by any visible cell.
dfdev = pd.read_csv('./data_after_sep/dev.tsv',sep='\t',names=column_names)

In [3]:
# Directory holding the pretrained checkpoint; the same path is used for the
# tokenizer here and for the classification model in the training cell.
pretrain_model_path = './chinese_wwm_pytorch/'
tokenizer = BertTokenizer.from_pretrained(pretrain_model_path)

In [4]:
class TrainDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``input_dict`` must provide ``'input_ids'``, ``'token_type_ids'`` and
    ``'attention_mask'``, each indexable by sample position. ``y`` holds the
    labels and is indexed with the same position.
    """

    def __init__(self, input_dict, y ):
        self.input_ids, self.token_type_ids, self.attention_mask = (
            input_dict['input_ids'],
            input_dict['token_type_ids'],
            input_dict['attention_mask'],
        )
        self.y = y

    def __len__(self):
        # The sample count is taken from the input ids alone.
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for ``idx``."""
        columns = (self.input_ids, self.token_type_ids, self.attention_mask, self.y)
        return tuple(column[idx] for column in columns)
    
train_texts = dftrain['text'].tolist()

# Tokenize every training text to a fixed length of 512 tokens (special tokens
# included): longer texts are truncated, shorter ones are padded.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`; it
# belongs to the same tokenizer API generation as the `truncation=True`
# argument already used here, and pads to `max_length` exactly as before.
train_input_dict = tokenizer.batch_encode_plus(train_texts,
                                               add_special_tokens=True,
                                               max_length=512,
                                               truncation=True,
                                               return_special_tokens_mask=True,
                                               padding='max_length',
                                               return_tensors='pt')

In [5]:
# Wrap the encoded training texts and their labels in a shuffled DataLoader.
BATCH_SIZE = 8
# Labels come from the 'type' column, in the same row order as the encoded texts.
train_y = np.array(dftrain['type'].tolist())
trainset = TrainDataset(train_input_dict,train_y)
trainloader = DataLoader(trainset , batch_size = BATCH_SIZE , shuffle = True)


In [6]:
test_texts = dftest['text'].tolist()

# Tokenize the test texts with the same settings as the training texts.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
# and pads to `max_length` exactly as before.
test_input_dict = tokenizer.batch_encode_plus(test_texts,
                                              add_special_tokens=True,
                                              max_length=512,
                                              truncation=True,
                                              return_special_tokens_mask=True,
                                              padding='max_length',
                                              return_tensors='pt')

test_y = np.array(dftest['type'].tolist())
testset = TrainDataset(test_input_dict,test_y)
# Evaluation does not depend on sample order, so the loader is not shuffled;
# this keeps batches aligned with the rows of dftest.
testloader = DataLoader(testset , batch_size = BATCH_SIZE , shuffle = False)

In [7]:
def get_test_acc(model , testloader):
    """Return the classification accuracy of ``model`` over ``testloader``.

    Args:
        model: a ``torch.nn.Module`` called with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``labels`` keyword
            arguments, whose output holds the logits at index 1.
        testloader: iterable of batches, each a 4-sequence of tensors
            ``(input_ids, token_type_ids, attention_mask, labels)``.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when the loader yields no
        samples (instead of raising ``ZeroDivisionError``).

    The model is switched to eval mode for the duration of the call and then
    put back into whatever mode it was in before, so calling this between
    training epochs no longer leaves dropout disabled for the next epoch.
    """
    # Use the device the model already lives on rather than a notebook-global
    # `device`, so the function does not depend on cell execution order.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    total = 0
    correct = 0
    try:
        with torch.no_grad():
            for data in testloader:
                tokens_tensors, segments_tensors, \
                masks_tensors, labels = [t.to(run_device) for t in data]

                # forward pass; labels are passed so the logits stay at index 1
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors,
                                labels=labels)

                pred = torch.argmax(outputs[1],dim=-1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
    finally:
        # Restore the caller's train/eval mode even if a batch raised.
        model.train(was_training)

    return correct/total if total else 0.0
    
    

In [None]:
# Fine-tune a BERT sequence classifier on the training split and report the
# test accuracy after every epoch.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
num_labels = 7
model = BertForSequenceClassification.from_pretrained(pretrain_model_path,num_labels = num_labels)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 3 
for epoch in range(EPOCHS):
    # get_test_acc() at the end of each epoch puts the model in eval mode, so
    # training mode has to be re-enabled at the start of every epoch; otherwise
    # every epoch after the first would train with dropout disabled.
    model.train()
    running_loss = 0.0  # sum of per-batch losses over the epoch (not an average)
    total = 0
    correct = 0
    for (i,data) in enumerate(trainloader):
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # forward pass: with labels supplied, index 0 is the loss and index 1 the logits
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        # running training accuracy, measured on the batches seen so far
        pred = torch.argmax(outputs[1].detach(),dim=-1)
        total += labels.size(0)
        correct += (pred == labels).sum().item()

        running_loss += loss.item()
        # i is zero-based, so show i+1 to let the counter end at len(trainloader)
        print(f'\rEpoch [{epoch+1}/{EPOCHS}] {i+1}/{len(trainloader)} Loss: {running_loss:.4f} Acc : {(correct/total):.3f}', end='')

    # finish the in-place progress line before printing the evaluation result
    print()
    test_acc = get_test_acc(model , testloader)
    print(f'Epoch [{epoch+1}/{EPOCHS}] Test acc: {test_acc:.4f}')
#     torch.save(model.state_dict(),'123.pkl')

        




device: cuda:0


Some weights of the model checkpoint at ./chinese_wwm_pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

Epoch [1/3] 3193/3194 Loss: 1056.3622 Acc : 0.899Epoch [1/3] Test acc: 0.8252
Epoch [2/3] 3193/3194 Loss: 535.8695 Acc : 0.944Epoch [2/3] Test acc: 0.8438
Epoch [3/3] 1348/3194 Loss: 117.8977 Acc : 0.972

In [None]:
# Reload the whole pickled model object and switch it to inference mode.
# 'bert_cotraining.pkl' is the file written by torch.save(model, ...) in the
# co-training cell further down, so on a fresh kernel this cell only works if
# that file already exists from an earlier run.
# SECURITY: torch.load unpickles arbitrary Python objects; only load files you
# produced yourself.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles like this one; confirm against the installed version.
model = torch.load('bert_cotraining.pkl')
model.eval()

In [None]:
# Encode the test split and run the loaded model over it.
# NOTE(review): `TestDataset` is defined in a later cell and `get_predictions`
# is not defined in any visible cell; both must exist in the kernel before
# this cell is run.
column_names = ['type','title','text']
dftest = pd.read_csv('./data_after_sep/test.tsv',sep='\t',names=column_names)
texts = dftest['text'].tolist()
# Fixes relative to the previous call:
#   * `return_special_tokens_masks` was a misspelling of `return_special_tokens_mask`
#   * `truncation=True` is now explicit, so texts longer than max_length are
#     cut deliberately instead of via the tokenizer's warning fallback
#   * `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
input_dict = tokenizer.batch_encode_plus(texts,
                                         add_special_tokens=True,
                                         max_length=510,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         padding='max_length',
                                         return_tensors='pt')


BATCH_SIZE = 64
testset = TestDataset(input_dict)
# not shuffled, so predictions stay aligned with the rows of dftest
testloader = DataLoader(testset, batch_size=BATCH_SIZE)
predictions = get_predictions(model, testloader)


In [None]:
# Move the predictions to host memory as a NumPy array. `.detach()` is the
# supported way to drop the autograd graph; the legacy `.data` attribute
# bypasses autograd's safety checks.
pred = predictions.detach().cpu().numpy()

In [None]:
# Reduce each row to the index of its largest entry (presumably the predicted class id).
# NOTE(review): this rebinds `pred` from the 2-D array to a 1-D index array, so
# re-running this cell without re-running the previous one fails (axis 1 no longer exists).
pred = np.argmax(pred, axis=1)
pred

In [None]:
# Fraction of predictions that equal the reference labels, printed as a percentage.
# NOTE(review): `testans` is not defined in any visible cell; presumably it is the
# array of test labels (e.g. dftest['type']). Confirm before running.
accuracy = (pred == testans).mean()
print('Your test accuracy is %.6f' % (accuracy * 100))

In [None]:
# Load the tokenizer and the BertForPreTraining variant of the model from the
# local checkpoint directory. This rebinds the notebook-global `tokenizer`
# and `model` used by the cells above.
from transformers import BertForPreTraining

# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# model = BertForPreTraining.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained('./chinese_wwm_pytorch/')
model = BertForPreTraining.from_pretrained('./chinese_wwm_pytorch/')

In [None]:
# Read two further TSV files with the same three-column layout and stack them
# row-wise into df_combine. Judging by the names, df_unlabel holds unlabeled
# articles and df_all the labeled ones; confirm against the data files.
column_names = ['type','title','text']
df_unlabel = pd.read_csv('./udn_for_mct.tsv',sep='\t',names=column_names)
df_all = pd.read_csv('./all_after_mapping.tsv',sep='\t',names=column_names)
li = [df_unlabel,df_all]
# pd.concat keeps each frame's original index, so index values repeat in df_combine.
df_combine = pd.concat(li)

In [None]:
class TrainDataset(Dataset):
    """Labeled dataset built from a tokenizer encoding and a label sequence.

    NOTE(review): a class with the same name and behavior is already defined
    in an earlier cell of this notebook; this definition shadows it.

    ``input_dict`` is read for the keys ``'input_ids'``, ``'token_type_ids'``
    and ``'attention_mask'``; ``y`` supplies one label per sample.
    """

    def __init__(self, input_dict, y ):
        ids = input_dict['input_ids']
        segments = input_dict['token_type_ids']
        mask = input_dict['attention_mask']
        self.input_ids = ids
        self.token_type_ids = segments
        self.attention_mask = mask
        self.y = y

    def __len__(self):
        """Number of samples, measured on the input ids."""
        n_samples = len(self.input_ids)
        return n_samples

    def __getitem__(self,idx):
        """Return the 4-tuple (input ids, token type ids, attention mask, label) at ``idx``."""
        return (
            self.input_ids[idx],
            self.token_type_ids[idx],
            self.attention_mask[idx],
            self.y[idx],
        )
    

In [None]:
class TestDataset(Dataset):
    """Unlabeled counterpart of the training dataset: encodings only, no labels.

    ``input_dict`` is read for the keys ``'input_ids'``, ``'token_type_ids'``
    and ``'attention_mask'``, each indexable by sample position.
    """

    def __init__(self, input_dict):
        self.input_ids, self.token_type_ids, self.attention_mask = (
            input_dict['input_ids'],
            input_dict['token_type_ids'],
            input_dict['attention_mask'],
        )

    def __len__(self):
        # The sample count is taken from the input ids alone.
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the 3-tuple ``(input_ids, token_type_ids, attention_mask)`` at ``idx``."""
        columns = (self.input_ids, self.token_type_ids, self.attention_mask)
        return tuple(column[idx] for column in columns)

In [None]:

# BATCH_SIZE = 4
# pretrainset = PretrainDataset(input_dict)
# pretrainloader = DataLoader(pretrainset, batch_size=BATCH_SIZE)

# Encode every text in df_all and wrap it, with its labels, in a DataLoader.
texts = df_all['text'].tolist()
# Fixes relative to the previous call:
#   * `return_special_tokens_masks` was a misspelling of `return_special_tokens_mask`
#   * `truncation=True` is now explicit, so texts longer than max_length are
#     cut deliberately instead of via the tokenizer's warning fallback
#   * `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
input_dict = tokenizer.batch_encode_plus(texts,
                                         add_special_tokens=True,
                                         max_length=510,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         padding='max_length',
                                         return_tensors='pt')
y = df_all['type'].values
BATCH_SIZE = 4
trainset = TrainDataset(input_dict,y)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)

In [None]:
# Run the model over `pretrain_dataloader` for EPOCHS passes and write the
# model to './bert_wwm_pretrain_on_news' after each pass.
# NOTE(review): `pretrain_dataloader` is not defined in any visible cell. The
# cell above builds `trainloader`, whose items are 4-tuples and would not fit
# the 3-way unpacking below.
# NOTE(review): the forward output is discarded. There is no loss, no
# backward() and no optimizer in this loop, so as written no parameters are
# updated before save_pretrained() is called. Confirm the intended objective.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
EPOCHS = 5  # lucky number
for epoch in range(EPOCHS):
    for data in pretrain_dataloader:   
        # each batch is unpacked as (input ids, token type ids, attention mask)
        tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data]
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, )
#     torch.save(model, 'model_bert_pretrain_on_udn.pkl')
    # the output directory is written again after every epoch
    model.save_pretrained('./bert_wwm_pretrain_on_news')

    print('epoch:', epoch+1)

In [None]:
def pick_high_confidence_data(result, baseline=0.85):
    """Select the rows of ``result`` whose largest score reaches ``baseline``.

    Args:
        result: 2-D ``torch.Tensor`` with one row per sample and one column
            per class. Presumably class probabilities, given the 0.85
            default threshold; confirm against the caller.
        baseline: minimum value the row maximum must reach (inclusive) for
            the row to be selected. Defaults to the previously hard-coded 0.85.

    Returns:
        ``(indices, labels)``: two 1-D ``int64`` NumPy arrays of equal length
        holding the selected row positions and, for each, the column index of
        its maximum. Both are empty integer arrays when nothing qualifies, so
        they can be passed straight to ``np.take`` / ``np.delete`` and
        concatenated with integer labels. (An empty result used to come back
        as float64, which ``np.take`` rejects as an index array.)
    """
    print(result.shape)

    if result.shape[0] == 0:
        empty = np.array([], dtype=np.int64)
        return empty, empty.copy()

    # Row-wise maximum and the column where it occurs, for all rows at once.
    max_vals, max_idx = torch.max(result, dim=1)

    # Compare in float64 on the host so the threshold test matches a plain
    # Python-float comparison of each value against `baseline`.
    scores = max_vals.detach().cpu().to(torch.float64).numpy()
    classes = max_idx.detach().cpu().numpy()
    keep = scores >= baseline

    return np.flatnonzero(keep).astype(np.int64), classes[keep].astype(np.int64)

In [None]:
# Self-training loop: fine-tune on the labeled texts, label the unlabeled pool
# with the model, move the confidently predicted texts into the training set,
# and repeat until the pool is nearly exhausted.
# NOTE(review): `get_predictions` is not defined in any visible cell; it must
# exist in the kernel and return one row of scores per unlabeled text.
from transformers import BertForSequenceClassification

column_names = ['type','title','text']
df_unlabel = pd.read_csv('./udn_for_mct.tsv',sep='\t',names=column_names)
df_all = pd.read_csv('./all_after_mapping.tsv',sep='\t',names=column_names)
li = [df_unlabel,df_all]
df_combine = pd.concat(li)


NUM_LABELS = 7

tokenizer = BertTokenizer.from_pretrained('./chinese_wwm_pytorch/')
model = BertForSequenceClassification.from_pretrained('./chinese_wwm_pytorch/',num_labels=NUM_LABELS)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

BATCH_SIZE = 4
EPOCHS = 2  # lucky number
MIN_UNLABELED = 300  # stop once fewer unlabeled texts than this remain


def _encode_texts(texts):
    """Tokenize an array of texts to fixed-length (510) PyTorch tensors.

    One helper replaces four copies of the same call. Compared with those
    copies: the misspelt `return_special_tokens_masks` kwarg is corrected,
    truncation is explicit, `padding='max_length'` replaces the deprecated
    `pad_to_max_length=True`, and the NumPy array is converted to a plain
    list of str, which is the input type the tokenizer documents.
    """
    return tokenizer.batch_encode_plus(np.asarray(texts).tolist(),
                                       add_special_tokens=True,
                                       max_length=510,
                                       truncation=True,
                                       return_special_tokens_mask=True,
                                       padding='max_length',
                                       return_tensors='pt')


# Working copies that grow / shrink from round to round.
traintexts = np.array(df_all['text'].tolist())
train_y = np.array(df_all['type'].values)
unlabeltexts = np.array(df_unlabel['text'].tolist())

model = model.to(device)

while True:
    # Both sets change every round, so both are re-encoded every round.
    train_input_dict = _encode_texts(traintexts)
    unlabel_input_dict = _encode_texts(unlabeltexts)

    trainset = TrainDataset(train_input_dict,train_y)
    # Shuffle: pseudo-labeled texts are appended at the end of the arrays, so
    # an unshuffled loader would always present them last, in one run.
    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

    model.train()
    # A fresh optimizer (and fresh Adam state) is created for every round.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(EPOCHS):
        running_loss = 0.0  # sum of per-batch losses over the epoch
        for data in trainloader:
            tokens_tensors, segments_tensors, \
            masks_tensors, labels = [t.to(device) for t in data]

            optimizer.zero_grad()
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)

            # with labels supplied, index 0 of the output is the loss
            loss = outputs[0]
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        torch.save(model, 'bert_cotraining.pkl')
        print('[epoch %d] loss: %.3f' %(epoch + 1, running_loss))

    unlabelset = TestDataset(unlabel_input_dict)
    unlabelloader = DataLoader(unlabelset, batch_size=32)

    ans_matrix = get_predictions(model, unlabelloader,False)

    idx , y = pick_high_confidence_data(ans_matrix)

    torch.save(model, 'bert_cotraining.pkl')

    # Nothing passed the confidence threshold: the pool cannot shrink this
    # round, so stop rather than indexing with an empty selection.
    if len(idx) == 0:
        print('no high-confidence samples selected; stopping co-training')
        break

    # Move the confidently predicted texts, with their predicted labels,
    # from the unlabeled pool into the training set.
    unlabel_be_chosen = np.take(unlabeltexts, idx, 0)
    unlabeltexts = np.delete(unlabeltexts,idx,axis=0)
    traintexts = np.concatenate((traintexts,unlabel_be_chosen))
    train_y = np.concatenate((train_y,y))

    if unlabeltexts.shape[0] < MIN_UNLABELED:
        break


    
    
    
    
    
    
    

    
        
        

In [None]:
# Debug check: shape of the score matrix produced in the last co-training round.
print(ans_matrix.shape)

In [None]:
# Debug check: shape of the encoded input ids for the remaining unlabeled texts.
unlabel_input_dict['input_ids'].shape
