In [2]:
import sys
sys.path.append("..")
import thirdparty
from thirdparty import *
from datasets import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification, DistilBertConfig
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModel,AutoConfig,AutoModelForSequenceClassification
from safetensors.torch import load_file
from transformers import pipeline

# 语言模型

In [3]:
text = ["中国的首都是北京"]
model_dir="/root/autodl-tmp/model/"
bert_tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir,"bert-base-chinese"))
bert_model = AutoModel.from_pretrained(os.path.join(model_dir,"bert-base-chinese"))
bart_tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir,"bart4csc-base-chinese"))
bart_model = AutoModel.from_pretrained(os.path.join(model_dir,"bart4csc-base-chinese"))
gpt_tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir,"gpt2"))
gpt_model = AutoModel.from_pretrained(os.path.join(model_dir,"gpt2"))

  return self.fget.__get__(instance, owner)()


In [4]:
print ("bert")
inputs = bert_tokenizer(text, return_tensors='pt',padding=True)['input_ids']
print (inputs.tolist())
print (bert_tokenizer.decode(inputs.tolist()[0]))
print ("bart")
inputs = bart_tokenizer(text, return_tensors='pt',padding=True)['input_ids']
print (inputs.tolist())
print (bart_tokenizer.decode(inputs.tolist()[0]))
print ("gpt")
inputs = gpt_tokenizer(text, return_tensors='pt')['input_ids']
print (inputs.tolist())
print (gpt_tokenizer.decode(inputs.tolist()[0]))
#['input_ids'] 所有语言模型共有的，['token_type_ids']表示字符属于第几句话，是BERT和BART特有的
#，['attention_mask'] 是否开启掩码 0 代表被遮盖 1 代表不遮盖
#分词与模型配套使用,不同分词器分出的结果是不一样的

bert
[[101, 704, 1744, 4638, 7674, 6963, 3221, 1266, 776, 102]]
[CLS] 中 国 的 首 都 是 北 京 [SEP]
bart
[[101, 704, 1744, 4638, 7674, 6963, 3221, 1266, 776, 102]]
[CLS] 中 国 的 首 都 是 北 京 [SEP]
gpt
[[40792, 32368, 121, 21410, 165, 99, 244, 32849, 121, 42468, 44293, 245, 12859, 105]]
中国的首都是北京


In [5]:
text = ["关云长温酒斩华雄"]
inputs = bert_tokenizer(text, return_tensors='pt',padding=True)['input_ids']
output = bert_model(inputs)
print(output.pooler_output.shape)

torch.Size([1, 768])


In [6]:
# last_hidden_state：模型最后一层的隐藏状态，常用于各种下游任务。
# pooler_output：池化层的输出，通常用于句子级别的任务（如分类）。
# past_key_values：缓存的键和值，用于加速生成任务。
# hidden_states：每一层的隐藏状态，用于深入分析或特殊任务。
# attentions：注意力权重，显示模型的注意力机制。
# cross_attentions：跨注意力权重，在某些架构（如 Transformer 编码器-解码器）中使用。

# 生成数据

In [7]:
label_index={"体育":0,"娱乐":1,"家居":2,"彩票":3,"房产":4,"教育":5,"时尚":6 , "时政":7 , "星座":8 , "游戏":9,  "社会":10,"科技":11}
def read_imdb_split(path):
    split_dir = Path(path)
    texts = []
    labels = []
    for label_dir in label_index.keys():
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text(encoding='utf-8', errors='ignore'))
            labels.append(label_index[label_dir])
    return texts, labels
#读取原始数据
train_texts, train_labels = read_imdb_split('/root/autodl-tmp/data/text-classification/train_data')

In [10]:
print(len(train_texts))
# 观察文本和标签数据
index=80998
print(train_texts[index])
print(train_labels[index])

109966
《异形大战铁血战士》配置要求公布
　　由Rebellion开发、世嘉发行的科幻射击游戏《异形VS铁血战士》(Aliens vs. Predator)现已确定发售日期。根据世嘉的《异形VS铁血战士》官方网页的信息，本作将于2010年2月19日上市，对应PC、PS3和Xbox 360平台。
　　这部根据电影改编的游戏也同样存在着三方势力，一方是人类海军陆战队，一方是异形，还有一方是铁血战士，游戏中玩家可以选择的是海军陆战队或者是铁血战士甚至是异形。本作提供单人模式和多人模式。
　　最低配置要求：
　　系统：Windows 7/ XP/Vista
　　内存：1 GB System RAM (XP)/ 2 GB System RAM (Vista)
　　处理器：3.2 GHz Intel Pentium 4/Athlon 64 3000+ 或更高
　　显卡：支持DirectX 9.0c 128 MB RAM 显存(NVIDIA 6600 或更高， ATI X1600 或更高)
　　推荐配置要求：
　　系统：Windows 7/ XP/Vista
　　处理器：Intel Core 2 Duo E6400 或更高
　　内存：2 GB System RAM
　　显卡：支持DirectX 9.0c 512 MB RAM (NVIDIA 8800 系列， ATI HD2900 PRO 或更高)


9


In [17]:
#转化成标准格式
train_dataset=Dataset.from_dict({'inputs':train_texts,'labels':train_labels},split="train")

In [18]:
# 观察文本和标签数据
index = 96500
print(train_dataset[index])

{'inputs': '女孩边大呼救命边跑到13楼跳下身亡\n\u3000\u3000据都市一时间报道 长沙八一路的天佑大厦前天发生一起意外，一名24岁的女子从13楼坠楼。目击者称这名女子本来住在十五楼，事发时一边大叫救命，一边跑到十三楼的一家私家菜房，然后纵身跳下。\n\u3000\u3000据死者父亲介绍，前天早上曾接到女儿的电话，电话那边很急，“要我们赶快过来。”\n\u3000\u3000据私家菜房的老板说，“当时这个女孩冲进来，借我们的电话向她爸爸求救，结果话没说完一下子冲到窗边就跳下去了。”目前，私家菜房的厨师已到公安部门配合调查。而死者在天佑大厦的开房记录也被警方带走。\n\u3000\u3000酒店方面表示，由于刚开张不久，酒店没有启动监控设备，目前警方正在调查这起坠楼事件。\n\n', 'labels': 10}


# 进行分词操作

In [19]:
model_name="distilbert-base-uncased"# bert-base-chinese
tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir,model_name))
def tokenize_dataset(tokenizer, dataset, max_len):
    #把读取出来的文本数据，转成标准输入格式
    def convert_to_features(example_batch):
        src_texts = []
        trg_texts = []
        for terms in zip(example_batch['inputs'],example_batch['labels']):
            src_texts.append(terms[0])
            trg_texts.append(terms[1])
        input_encodings = tokenizer.batch_encode_plus(src_texts,truncation=True,padding='max_length',max_length=max_len)
        encodings = {'input_ids': input_encodings['input_ids'],'labels': trg_texts}
        return encodings
    dataset = dataset.map(convert_to_features, batched=True)
    dataset = dataset.remove_columns(['inputs'])
    return dataset
train_dataset=tokenize_dataset(tokenizer, train_dataset,100)

Map:   0%|          | 0/109966 [00:00<?, ? examples/s]

In [21]:
index = 88500
print(train_dataset[index])
print(tokenizer.decode(train_dataset[index]['input_ids']))

{'labels': 9, 'input_ids': [101, 1639, 100, 100, 100, 100, 1016, 1640, 21469, 2278, 100, 100, 1772, 100, 100, 100, 100, 100, 100, 1864, 19413, 1772, 100, 100, 1639, 100, 100, 100, 100, 1016, 1640, 100, 100, 21469, 2278, 1000, 100, 100, 1006, 18695, 1007, 1000, 1989, 100, 21469, 2278, 100, 1740, 100, 1802, 100, 100, 1989, 100, 100, 100, 100, 1862, 1018, 100, 1802, 100, 1881, 100, 1810, 1957, 1006, 2380, 3927, 1007, 1635, 1802, 100, 1782, 1006, 6671, 1007, 1635, 100, 100, 100, 1006, 13858, 1007, 1796, 1810, 100, 1006, 7328, 1007, 1989, 100, 100, 100, 100, 1940, 100, 5385, 100, 102]}
[CLS] 《 [UNK] [UNK] [UNK] [UNK] 2 》 dlc [UNK] [UNK] 公 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 日 ea 公 [UNK] [UNK] 《 [UNK] [UNK] [UNK] [UNK] 2 》 [UNK] [UNK] dlc " [UNK] [UNK] ( retaliation ) " ， [UNK] dlc [UNK] 一 [UNK] 地 [UNK] [UNK] ， [UNK] [UNK] [UNK] [UNK] 新 4 [UNK] 地 [UNK] 林 [UNK] 大 道 ( park avenue ) 、 地 [UNK] 区 ( transit ) 、 [UNK] [UNK] [UNK] ( shipyard ) 和 大 [UNK] ( compound ) ， [UNK] [UNK] [UNK] [UNK] 花 [UNK]

# 定义模型

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(model_dir,model_name),num_labels=len(label_index))

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/model/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 训练

In [25]:
training_args = TrainingArguments(
    output_dir='./result/text-classification',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=256,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,               # strength of weight decay
    logging_dir='./log/text-classification',            # directory for storing logs
    logging_steps=100,            
    save_steps=500,                  # Save checkpoints every 500 steps
    save_total_limit=2,              # Only keep the last 2 checkpoints
)
trainer = Trainer(
    model=model,                         # the instantiated   Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
100,0.6451
200,0.6178
300,0.5535
400,0.5091
500,0.4606
600,0.4186
700,0.4157
800,0.3885
900,0.3421
1000,0.3132


TrainOutput(global_step=1290, training_loss=0.42683168455611825, metrics={'train_runtime': 560.7596, 'train_samples_per_second': 588.306, 'train_steps_per_second': 2.3, 'total_flos': 8536820954414400.0, 'train_loss': 0.42683168455611825, 'epoch': 3.0})

In [26]:
model.save_pretrained("./finetune-model/bert/")
torch.save(model,"./finetune-model/bert/pytorch_model.bin")

# 测试

In [27]:
test_texts, test_labels = read_imdb_split('/root/autodl-tmp/data/text-classification/test_data')

In [33]:
device=get_device()
classifier = pipeline("sentiment-analysis",model=torch.load("./finetune-model/bert/pytorch_model.bin").to(device),tokenizer=os.path.join(model_dir,model_name)) 
right=count=0
index_label=dict([[a,b] for b,a in label_index.items()])
i = 0
for text,label in zip(test_texts,test_labels):
    #预测出来的结果
    result=classifier(text[0:512])
    label2=index_label[int(result[0]['label'].split("_")[1])]
    #真实结果
    label=index_label[label]
    if label2==label:
        right+=1
    i+=1
    if(i%100==0):
        print("index{:5d}, 预测标签{}, 实际标签 {}".format(i,label2,label))
    count+=1
print ("正确率",right/count)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


index  100, 预测标签家居, 实际标签 体育
index  200, 预测标签娱乐, 实际标签 娱乐
index  300, 预测标签家居, 实际标签 家居
index  400, 预测标签彩票, 实际标签 彩票
index  500, 预测标签房产, 实际标签 房产
index  600, 预测标签教育, 实际标签 教育
index  700, 预测标签时尚, 实际标签 时尚
index  800, 预测标签时政, 实际标签 时政
index  900, 预测标签时尚, 实际标签 星座
index 1000, 预测标签游戏, 实际标签 游戏
index 1100, 预测标签社会, 实际标签 社会
index 1200, 预测标签科技, 实际标签 科技
正确率 0.8608333333333333
