In [70]:
import mindspore as ms
from mindformers import BertForTokenClassification, BertTokenizer
import os, sys
import pandas as pd
from mindspore.dataset import SequentialSampler, GeneratorDataset, text
# Directory that is added to the import path so project-local packages can be
# imported by later cells. Set CLBA_PROJECT_ROOT to point at a checkout that
# lives elsewhere; the default keeps the original hard-coded location.
_project_root = os.environ.get(
    "CLBA_PROJECT_ROOT",
    "/home/ganleilei/workspace/CleanLabelTextualBackdoorAttackMindspore",
)
# Membership guard keeps sys.path free of duplicate entries when the cell is re-run.
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Restrict the process to CUDA device 0, then select MindSpore's GPU backend.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
ms.context.set_context(device_target="GPU")

In [None]:
# The model and its tokenizer are both restored from the same pretrained name.
_tokcls_checkpoint = 'tokcls_bert_base_chinese'
model = BertForTokenClassification.from_pretrained(_tokcls_checkpoint)
tokenizer = BertTokenizer.from_pretrained(_tokcls_checkpoint)

In [45]:
class Iterable:
    """Random-access source of pre-tokenized ``(text, label)`` examples.

    All texts are tokenized once, in a single batched call, when the object is
    constructed. Indexing then returns the per-example features in the order
    ``(input_ids, token_type_ids, attention_mask, label)``.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        max_length: Length every sequence is padded/truncated to (default 30,
            the value previously hard-coded).
        tokenizer: Optional callable tokenizer to reuse. When omitted, a
            ``BertTokenizer`` is loaded from ``'bert_base_uncased'``; passing a
            shared instance avoids reloading it for every split.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, data, max_length=30, tokenizer=None):
        # Materialize once so one-shot iterables work and texts/labels stay aligned.
        pairs = list(data)
        self.texts = [sentence for sentence, _ in pairs]    # raw input strings
        self.labels = [label for _, label in pairs]         # labels, same order as texts

        # Only load the default tokenizer when the caller did not inject one.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert_base_uncased')
        self.tokenizer = tokenizer

        tokenize_out = self.tokenizer(
            self.texts,
            max_length=max_length,
            padding='max_length',
            return_tensors='ms',
            truncation=True,
        )
        # Keep whatever `.numpy()` yields for each field; rows are indexed per example.
        # NOTE(review): presumably arrays of shape (len(data), max_length) — confirm.
        self.input_ids = tokenize_out['input_ids'].numpy()
        self.token_type_ids = tokenize_out['token_type_ids'].numpy()
        self.attention_mask = tokenize_out['attention_mask'].numpy()

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for example ``idx``."""
        return (
            self.input_ids[idx],
            self.token_type_ids[idx],
            self.attention_mask[idx],
            self.labels[idx],
        )

In [None]:
def read_data(file_path):
    """Load a tab-separated file as a list of ``(sentence, label)`` pairs.

    The first line of the file is consumed as the header row (pandas default).
    For every remaining row, the first column is taken as the sentence and the
    second column is converted with ``int``.

    Args:
        file_path: Path to the TSV file.

    Returns:
        list: ``(sentence, int_label)`` tuples in file order.

    Raises:
        ValueError: If a label value cannot be converted to ``int``.
    """
    # keep_default_na=False: without it pandas turns sentences such as "null",
    # "NA" or "n/a" into float NaN, which later breaks tokenization.
    frame = pd.read_csv(file_path, sep='\t', keep_default_na=False)
    return [(row[0], int(row[1])) for row in frame.values.tolist()]


def get_all_data(base_path):
    """Read the train, dev and test splits stored under ``base_path``.

    Args:
        base_path: Directory containing ``train.tsv``, ``dev.tsv`` and ``test.tsv``.

    Returns:
        tuple: ``(train_data, dev_data, test_data)``, each the result of
        ``read_data`` on the corresponding file.
    """
    loaded = []
    for file_name in ('train.tsv', 'dev.tsv', 'test.tsv'):
        split_path = os.path.join(base_path, file_name)
        loaded.append(read_data(split_path))
    train_data, dev_data, test_data = loaded
    return train_data, dev_data, test_data

# Read the three clean splits from disk, then wrap each one in an Iterable
# (which tokenizes the split's texts on construction).
clean_train_data, clean_dev_data, clean_test_data = get_all_data("../data/clean_data/sst-2")
clean_train_dataset = Iterable(clean_train_data)
clean_dev_dataset = Iterable(clean_dev_data)
clean_test_dataset = Iterable(clean_test_data)


In [47]:
batch_size = 2

# Column order must match the tuple returned by Iterable.__getitem__.
_COLUMN_NAMES = ('input_ids', 'token_type_ids', 'attention_mask', 'label')


def _as_generator_dataset(source, shuffle):
    """Wrap ``source`` in a GeneratorDataset with the shared column names.

    This cell rebinds the ``clean_*_dataset`` names to their wrapped versions,
    so re-running it would otherwise wrap an already-wrapped dataset. Objects
    that are already a GeneratorDataset are returned unchanged.
    """
    if isinstance(source, GeneratorDataset):
        return source
    return GeneratorDataset(source, column_names=list(_COLUMN_NAMES), shuffle=shuffle)


clean_train_dataset = _as_generator_dataset(clean_train_dataset, shuffle=True)
clean_dev_dataset = _as_generator_dataset(clean_dev_dataset, shuffle=True)
clean_test_dataset = _as_generator_dataset(clean_test_dataset, shuffle=False)

# Group consecutive examples into batches of `batch_size`.
train_loader_clean = clean_train_dataset.batch(batch_size)
dev_loader_clean = clean_dev_dataset.batch(batch_size)
test_loader_clean = clean_test_dataset.batch(batch_size)

In [None]:
# Project-local model wrapper; presumably importable because of the sys.path
# entry added in the notebook's first cell — verify when running elsewhere.
from models.model_ms import BERT
# Rebinds `model`, replacing the BertForTokenClassification instance created earlier
# in the notebook.
# NOTE(review): the meaning of the positional arguments (2, 2) is defined by BERT's
# signature in models/model_ms.py, which is not visible here. hidden_dim=756 is close
# to, but not equal to, the 768 hidden size of BERT-base — confirm it is intentional.
model = BERT("bert_base_uncased", 2, 2, hidden_dim=756)

In [None]:
def _report_forward(input_ids, attention_mask, token_type_ids):
    """Print input types and shapes, run one forward pass, print output shapes.

    Args:
        input_ids: Token-id tensor.
        attention_mask: Attention-mask tensor.
        token_type_ids: Segment-id tensor.

    Returns:
        The value returned by ``model.construct`` (indexed with ``[0]`` and ``[1]``
        for the shape report).
    """
    named_inputs = (input_ids, attention_mask, token_type_ids)
    for tensor in named_inputs:
        print(type(tensor))
    print()
    for tensor in named_inputs:
        print(tensor.shape)
    print()

    output = model.construct(input_ids, attention_mask, token_type_ids)
    print(output[0].shape)
    print(output[1].shape)
    return output


# NOTE(review): `tokenizer` was loaded from 'tokcls_bert_base_chinese' while `model`
# was built from "bert_base_uncased" — confirm the vocabulary mismatch is intended.
data = tokenizer(["我在杭州华为工作。", "你们好！"], return_tensors='ms', max_length=128, padding='max_length')

# Batches are yielded in the dataset's column order:
# input_ids, token_type_ids, attention_mask, label. Unpack in that order so the
# mask and segment ids are not swapped. next(iter(...)) fetches only the first
# batch instead of materializing the whole training set.
input_ids, token_type_ids, attention_mask, label = next(iter(train_loader_clean.create_tuple_iterator()))
output = _report_forward(input_ids, attention_mask, token_type_ids)

print()
# Same check on the hand-written sentences tokenized above.
input_ids, attention_mask, token_type_ids = data['input_ids'], data['attention_mask'], data['token_type_ids']
output = _report_forward(input_ids, attention_mask, token_type_ids)