In [1]:
import sys
sys.path.append('..')

import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset

from utils import *
from dataset import *
from preprocess import *
from wrapper import *
from models import BertWithNER, AutoModelWithNER

# device = torch.device("cpu")
device = torch.device("cuda:0")

In [2]:
train_df = pd.read_csv('../data/train.csv', sep='\t', index_col='id')
test_df = pd.read_csv('../data/test.csv', sep='\t', index_col='id')

model_name = 'hfl/chinese-macbert-base'
ner_model_name = 'uer/roberta-base-finetuned-cluener2020-chinese'

test_dataset_config = {
    'model_name':model_name,
    'aux_model_name':ner_model_name,
    'maxlength':128,
    'train_val_split':-1,
    'test':True, 
    'remove_username':False,
    'remove_punctuation':False, 
    'to_simplified':False, 
    'emoji_to_text':False, 
    'device':device,
}

test = DatasetWithAuxiliaryEmbeddings(df=test_df.reset_index(), **test_dataset_config)
test.tokenize()
test.construct_dataset()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\holaj\AppData\Local\Temp\jieba.cache
Loading model cost 0.460 seconds.
Prefix dict has been built successfully.
  indexed_value = torch.tensor(value[index]).squeeze()


In [3]:
checkpoints = [
    '../ner_run/fold0/checkpoint-5092/pytorch_model.bin', 
    # '../ner_run_best/fold1/checkpoint-7425/pytorch_model.bin', 
    # '../ner_run_best/fold2/checkpoint-9900/pytorch_model.bin', 
    # '../ner_run_best/fold3/checkpoint-7425/pytorch_model.bin', 
    # '../ner_run_best/fold4/checkpoint-4950/pytorch_model.bin', 
    # '../ner_run_best/fold5/checkpoint-9900/pytorch_model.bin', 
    # '../ner_run_best/fold6/checkpoint-9900/pytorch_model.bin', 
    # '../ner_run_best/fold7/checkpoint-9900/pytorch_model.bin', 
]

# model = BertWithNER(bert_model=model_name, ner_model=ner_model_name)
model = AutoModelWithNER(model=model_name, ner_model=ner_model_name)
state_dict = torch.load(checkpoints[0], map_location=device)
# for key in list(state_dict.keys()):
#     state_dict[key.replace('bert', 'base_model')] = state_dict.pop(key)
model.load_state_dict(state_dict)

Some weights of the model checkpoint at hfl/chinese-macbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at uer/roberta-base-finetuned-cluener2020-chinese were not used when initializi

<All keys matched successfully>

In [4]:
output_tensors = []

for cp in checkpoints:
    # model = BertWithNER(bert_model=model_name, ner_model=ner_model_name)
    model = AutoModelWithNER(model=model_name, ner_model=ner_model_name)
    state_dict = torch.load(cp, map_location=device)
    # for key in list(state_dict.keys()):
    #     state_dict[key.replace('bert', 'base_model')] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    model.cuda()

    logits = []
    dataloader = DataLoader(test.dataset['train'].with_format('torch'), batch_size=4)

    for batch in dataloader:
        inputs = {k:v.to(device) for k,v in batch.items()
                if k in test.tokenizer.model_input_names or k == 'auxiliary_input_ids'}
        with torch.no_grad():
            output = model(**inputs)
        logits.append(output['logits'])

    del model
    output_tensors.append(torch.concat(logits))

Some weights of the model checkpoint at hfl/chinese-macbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at uer/roberta-base-finetuned-cluener2020-chinese were not used when initializi

In [8]:
previous_submission = pd.read_csv('../submissions/submission-v3-0.75624.csv', delimiter='\t')

data = test_df
data['prediction'] = torch.argmax(output_tensors[0].cpu(), 1)
data['comp'] = previous_submission.label.map(np.int)

data = data[['comp', 'prediction', 'text']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data[data.comp != data.prediction])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data['comp'] = previous_submission.label.map(np.int)


Unnamed: 0_level_0,comp,prediction,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,1,唐诗宋词是我国浩如烟海的古代文化中一块瑰丽的瑰宝，所以我们要好好珍惜，将它们传承下去。
2,0.0,1,这次迎新活动举办得非常成功，参加活动的人数超过1000人。
7,0.0,1,胆小者宣扬退缩论，是出于他们的懒惰本质所决定的。
9,0.0,1,在这本书中，描绘了一场盛大的穆斯林葬礼。
12,1.0,0,关于长城，民间一直都有这样的说法“不到长城非好汉”。
17,0.0,1,小李同志鼓励大家踊跃投票，可结果投票的人也就只有27人左右。
18,1.0,0,我国生产的石油，长期不能自给。
21,0.0,1,只要有坚持不懈、勤奋努力，任何困难都打不倒你。
22,1.0,0,厦门的彩虹沙滩很宽很美，小红很羡慕生活在海滩的人，因为他们出门就可以赶海。
24,1.0,0,每次到姥姥家，她都会做上一桌美味的食物，而我都会吃好多，直到把肚皮撑得溜圆。


In [22]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    # Place all input tensors on the same device as the model
    inputs = {k:v.to(device) for k,v in batch.items()
              if k in train.tokenizer.model_input_names or k == 'auxiliary_input_ids'}

    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output['logits'], axis=-1)
        loss = cross_entropy(output['logits'], batch["label"].to(device),
                             reduction="none")
    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": loss.cpu().numpy(),
            "predicted_label": pred_label.cpu().numpy()}

In [23]:
train.dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'auxiliary_input_ids'])
eval_outputs = train.dataset["val"].map(forward_pass_with_label, batched=True, batch_size=16)

  0%|          | 0/7 [00:00<?, ?ba/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [9]:
train.dataset.set_format('torch')
train.dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'auxiliary_input_ids', 'labels'],
        num_rows: 100
    })
})