In [55]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import sys
from typing import Optional, List, Union, Dict, Tuple
import numpy as np
from dataclasses import dataclass, field
import commentjson

from transformers import (
    HfArgumentParser
)

from datasets import load_dataset
from transformers import BertTokenizerFast, AutoTokenizer
import jittor as jt
from jittor.dataset import Dataset, DataLoader

In [56]:

# Read the configuration file
@dataclass
class ModelArguments:
    """Model-related settings parsed from the configuration."""

    # Directory holding the tokenizer files; stays None when not configured.
    tokenizer_dir: Optional[str] = field(default=None, metadata={"help": "The local dir of tokenizer"})

@dataclass
class DataArguments:
    """Dataset-related settings parsed from the configuration."""

    # Location of the dataset file(s); stays None when not configured.
    dataset_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the dataset"}
    )
    # Token budget per sentence; used as the tokenizer's max_length.
    max_seq_len: int = field(
        default=32,
        metadata={
            # The adjacent literals are concatenated, so the first one must end
            # with a space to keep "longer than" from running together.
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated"
        }
    )
    
@dataclass
class TrainingArguments:
    """Training-loop settings; ``mode`` is validated right after construction.

    Raises:
        ValueError: from ``__post_init__`` when ``mode`` is neither
            "supervised" nor "unsupervised" (this includes the default None).
    """

    # Training regime selector; checked in __post_init__.
    mode: Optional[str] = field(
        default=None,
        metadata={"help": 'The mode of the training. It must be "supervised" or "unsupervised".'},
    )
    batch_size: int = field(default=64, metadata={"help": "batch size"})
    epoch: int = field(default=10, metadata={"help": "epoch"})

    def __post_init__(self):
        """Reject any mode other than the two supported ones."""
        if self.mode in ("supervised", "unsupervised"):
            return
        raise ValueError("mode must be supervised or unsupervised")

# One parser covering all three argument dataclasses; parse_dict returns one
# populated instance per dataclass, in the order they are listed here.
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))

# The configuration is read with commentjson (the file has a .jsonc extension).
config_path = "../config.jsonc"
with open(config_path, mode='r', encoding='utf-8') as config_file:
    config_data = commentjson.load(config_file)

(model_args, data_args, training_args) = parser.parse_dict(config_data)

In [57]:
# Load the data
if training_args.mode not in ("supervised", "unsupervised"):
    raise ValueError('The mode must be "supervised" or "unsupervised".')

# Supervised data is read with the "csv" builder, unsupervised data with the
# "text" builder; both read from the configured dataset path.
dataset = load_dataset(
    "csv" if training_args.mode == "supervised" else "text",
    data_files=data_args.dataset_path,
)



Found cached dataset csv (/home/aiuser/.cache/huggingface/datasets/csv/default-bfbbd435381e1fe7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 1/1 [00:00<00:00, 183.17it/s]


In [58]:

# Load the tokenizer from the directory named in the config, using local files
# only. The hub-based alternative would be
# AutoTokenizer.from_pretrained("bert-base-uncased").
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_dir,
    local_files_only=True,
)

class MyDataset(Dataset):
    """Dataset that tokenizes every sentence of the "train" split up front.

    In "unsupervised" mode each entry's "text" column is tokenized and each
    item is that same encoding returned twice. In "supervised" mode the
    "sent0", "sent1" and "hard_neg" columns are tokenized and each item is the
    resulting triple.

    Args:
        dataset: Mapping with a "train" entry that is iterable and sized; each
            element must expose the columns listed above for the chosen mode.
        tokenizer: Callable used to encode one sentence at a time.
        data_args: Object providing ``max_seq_len``.
        training_args: Object providing ``mode``.

    Raises:
        ValueError: If ``mode`` is neither "unsupervised" nor "supervised".
    """

    def __init__(self, dataset, tokenizer, data_args, training_args):
        super().__init__()
        self.dataset = dataset["train"]
        self.tokenizer = tokenizer
        self.mode = training_args.mode
        self.max_seq_len = data_args.max_seq_len

        # Encodings are cached per column so __getitem__ is a plain lookup.
        self.sent0 = []
        self.sent1 = []
        self.sent2 = []
        if self.mode == "unsupervised":
            for entry in self.dataset:
                self.sent0.append(self._tokenize_sentence(entry["text"]))
        elif self.mode == "supervised":
            for entry in self.dataset:
                # Encode the whole triple before storing any part of it, so the
                # three lists never get out of step if one encoding fails.
                anchor = self._tokenize_sentence(entry["sent0"])
                positive = self._tokenize_sentence(entry["sent1"])
                hard_negative = self._tokenize_sentence(entry["hard_neg"])
                self.sent0.append(anchor)
                self.sent1.append(positive)
                self.sent2.append(hard_negative)
        else:
            raise ValueError("mode must be unsupervised or supervised")

    def _tokenize_sentence(self, sentence):
        """Encode one sentence, truncated/padded to ``max_seq_len``, as NumPy."""
        # NOTE(review): with return_tensors="np" and a single sentence, each
        # field presumably has shape (1, max_seq_len) - confirm with tokenizer.
        return self.tokenizer(
            sentence,
            max_length=self.max_seq_len,
            truncation=True,
            padding="max_length",
            return_tensors="np",
        )

    def __len__(self):
        """Return the number of entries in the underlying "train" split."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the cached encodings for ``index``.

        Returns:
            A pair holding the same encoding twice in "unsupervised" mode, or
            the (sent0, sent1, hard_neg) encodings in "supervised" mode.

        Raises:
            ValueError: If ``mode`` is neither "unsupervised" nor "supervised".
        """
        if self.mode == "unsupervised":
            return self.sent0[index], self.sent0[index]
        elif self.mode == "supervised":
            return self.sent0[index], self.sent1[index], self.sent2[index]
        else:
            raise ValueError("mode must be unsupervised or supervised")

# Tokenize the whole training split once, then batch it with the configured
# batch size.
training_dataset = MyDataset(
    dataset=dataset,
    tokenizer=tokenizer,
    data_args=data_args,
    training_args=training_args,
)
training_dataloader = DataLoader(
    training_dataset,
    batch_size=training_args.batch_size,
)

In [66]:

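# input_ids: the vocabulary index of each token.
# token_type_ids: used to separate sentences; all 0 in this single-sentence task.
# attention_mask: distinguishes real tokens from padding (1 = real token, 0 = padding).
# Each field has shape (batch_size, 1, max_seq_len).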
# for batch_idx, (y1, y2, y3) in enumerate(training_dataloader):
#     print(y1['attention_mask'][0])
#     break

jt.Var([[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]], dtype=int32)


In [8]:
import jittor as jt

# Load the pretrained BERT checkpoint via the jittorhub:// URI and print
# whatever jt.load returns.
params = jt.load(
    'jittorhub://pretrained_bert.bin'
)
print(params)


/home/aiuser/.cache/jittor/jt1.3.9/g++9.4.0/py3.7.12/Linux-5.15.0-1x4e/IntelRXeonRGolx7a/305f/default/cu12.2.140_sm_86/checkpoints/pretrained_bert.bin


b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18\x00\x00\x00pretrained_bert/data.pkl\x80\x02X\x96\x00\x00\x00/home/aiuser/.cache/jittor/jt1.3.9/g++9'
