In [36]:
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification
from transformers import BertTokenizer



In [2]:
columns_name = ['type','title','text']
dftrain = pd.read_csv('./data_after_sep/train.tsv',sep = '\t',names = columns_name)
dftest = pd.read_csv('./data_after_sep/test.tsv',sep = '\t',names = columns_name)
dfdev = pd.read_csv('./data_after_sep/dev.tsv',sep = '\t',names = columns_name)


model_path = './bert_pretrain_news/'
tokenizer = BertTokenizer.from_pretrained(model_path)


In [3]:
class TrainDataset(Dataset):
    def __init__(self, input_dict, y):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.y = y
    def __getitem__(self, idx):
        input_id = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        y = self.y[idx]
        return input_id, tokentype, attentionmask, y
    def __len__(self):
        return len(self.input_ids)

In [4]:
train_x = dftrain['text'].tolist()
train_input_dict = tokenizer.batch_encode_plus(train_x,
                                              add_special_tokens = True,
                                              max_length = 512,
                                              truncation = True,                ##是否截斷
                                              return_special_tokens_mask = True,
                                              pad_to_max_length = True,
                                              return_tensors = 'pt')
TRAIN_BATCH_SIZE = 64
train_y = np.array(dftrain['type'].tolist())       ##np.array
trainset = TrainDataset(train_input_dict, train_y) ##trainset參數如init
trainloader = DataLoader(trainset, batch_size = TRAIN_BATCH_SIZE, shuffle = True)  

In [5]:
BATCH_SIZE = 64
test_x = dftest['text'].tolist()
test_input_dict = tokenizer.batch_encode_plus(test_x,
                                             add_special_tokens = True,
                                             max_length = 512,
                                             truncation = True,
                                             return_special_tokens_mask = True,
                                             pad_to_max_length = True,
                                             return_tensors = 'pt')
test_y = np.array(dftest['type'].tolist())
testset = TrainDataset(test_input_dict, test_y)
testloader = DataLoader(testset, batch_size = BATCH_SIZE, shuffle = True)


In [6]:
a = np.zeros((64,7))
b = np.zeros((64,7))
c = 

In [130]:
def get_model_pred(model_list,testloader):
    count = 0
    pred_list = []
    ans_list = []
    with torch.no_grad():
        pred_concat = np.array([25546,7]) ##存三個分別的pred concat
        for model in model_list:
            pred = []
            for data in testloader:
                token_tensors,segment_tensors,masks_tensors,labels = [t.to(device) for t in data]
                outputs = model(input_ids = token_tensors,
                                token_type_ids = segment_tensors,
                                attention_mask = masks_tensors,
                                labels = labels)
                for i in range(labels.size()[0]):
                    if(count == 0):  ##labels拿一次就好
                        ans_list.append(labels[i].to("cpu").numpy())
#                     result = torch.softmax(outputs[1][i],dim=-1)
                    pred.append(outputs[1][i].to("cpu").numpy())
            pred = np.array(pred) ## 25546x7
            if(count ==0):
                pred_concat = pred
            else:
                pred_concat = np.concatenate((pred_concat,pred),axis = 1)
            count += 1
    return pred_concat,ans_list

In [8]:
NUM_LABELS = 7
device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")
model = BertForSequenceClassification.from_pretrained(model_path,num_labels = NUM_LABELS)
model = model.to(device)

model.load_state_dict(torch.load('./BERT_for_xgboost_0.pkl'))
model.eval()
model2 = BertForSequenceClassification.from_pretrained(model_path,num_labels = NUM_LABELS)
model2 = model2.to(device)

model2.load_state_dict(torch.load('./BERT_for_xgboost_1.pkl'))
model2.eval()
model3 = BertForSequenceClassification.from_pretrained(model_path,num_labels = NUM_LABELS)
model3 = model3.to(device)

model3.load_state_dict(torch.load('./BERT_for_xgboost_2.pkl'))
model3.eval()
model_list = [model,model2,model3]
# model_list = [model]

Some weights of the model checkpoint at ./bert_pretrain_news/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

In [97]:
model_list = [model,model2,model3]

In [131]:
train_pred,train_ans = get_model_pred(model_list,trainloader)

In [132]:
train_ans = np.array(train_ans)
train_pred = np.array(train_pred)
print(train_ans.shape)
print(train_pred.shape)

(25546,)
(25546, 21)


In [133]:
test_pred,test_ans = get_model_pred(model_list,testloader)
test_ans = np.array(test_ans)
test_pred = np.array(test_pred)
print(test_ans.shape)
print(test_pred.shape)

(5000,)
(5000, 21)


In [175]:
xgb_model = XGBClassifier(max_depth = 5,learning_rate = 0.1,n_estimators=160,objective='multi:softmax',num_class=7,min_child_weight=5)
xgb_model.fit(train_pred,train_ans)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=160, n_jobs=0, num_class=7, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [176]:
pred_ans = xgb_model.predict(test_pred)


In [177]:
print(pred_ans)
count = 0 
for i in range(5000):
    if(pred_ans[i]==test_ans[i]):
        count+=1
print(count/5000)
# print(test_y)

[5 4 1 ... 4 1 4]
0.8464


In [152]:
xg_train = xgb.DMatrix(train_pred, label=train_ans)
xg_test = xgb.DMatrix(test_pred, label=test_ans)
param = {}

param['objective'] = 'multi:softmax'
param['eta'] = 0.1
param['max_depth'] = 5
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 7

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist )

pred = bst.predict( xg_test )
print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_ans[i] for i in range(len(test_ans))) / float(len(test_ans)) ))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-merror:0.02059	test-merror:0.15480
[1]	train-merror:0.01930	test-merror:0.15600
[2]	train-merror:0.01903	test-merror:0.15580
[3]	train-merror:0.01934	test-merror:0.15640
[4]	train-merror:0.01934	test-merror:0.15640
[5]	train-merror:0.01891	test-merror:0.15580
[6]	train-merror:0.01906	test-merror:0.15480
[7]	train-merror:0.01898	test-merror:0.15420
[8]	train-merror:0.01863	test-merror:0.15440
[9]	train-merror:0.01848	test-merror:0.15480
[10]	train-merror:0.01828	test-merror:0.15500
[11]	train-merror:0.01805	test-merror:0.15480
[12]	train-merror:0.01793	test-merror:0.15460
[13]	train-merror:0.01769	test-merror:0.15380
[14]	train-merror:0.01742	test-merror:0.15400
[15]	train-merror:0.01722	test-merror: