In [1]:
import torch
import pandas as pd
# 进行csv的生成
import random
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import re
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
# from transformers import Trainer
from transformers import TrainingArguments
from torch.nn.utils.rnn import pad_sequence
torch.cuda.set_device(0)

#从本地文件提取文件路径(要用/)，标签，保存至info_array数组中，到处为motion.csv文件
dataset_dir='./train'
dataset_posdir='./train/pos'
dataset_negdir='./train/neg'
classes=os.listdir(dataset_posdir)

info_array=[]
col=['index','score','label','filepath']
# 设置需要的数据
for filename in os.listdir(dataset_posdir):
    filepath=dataset_posdir+'/'+filename
    filename_without_extension = os.path.splitext(os.path.basename(filepath))[0]
    split_name=filename_without_extension.split('_')
    index=split_name[0]
    score=split_name[1]
    # label='pos'
    label=1
    info_array.append([index,score,label,filepath])
for filename in os.listdir(dataset_negdir):
    filepath=dataset_negdir+'/'+filename
    filename_without_extension = os.path.splitext(os.path.basename(filepath))[0]
    split_name=filename_without_extension.split('_')
    index=split_name[0]
    score=split_name[1]
    # label='neg'
    label=0
    info_array.append([index,score,label,filepath])
    
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

numpy_info_array = np.array(info_array)  
df=pd.DataFrame(numpy_info_array,columns=col)
df.to_csv('./motion.csv',encoding='utf-8')

# 重写dataset数据集部分，定义数据格式
class motionDataset(Dataset):
    def __init__(self,dataset_dir,csv_path):
        self.csv_path=csv_path
        self.df=pd.read_csv(self.csv_path,encoding='utf-8')
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        # 获取目标文本
        # memo_content储存一个文件中的文本，保存数据类型是字符串，做了去除除单词符号外元素的处理
        if idx >= len(self.df):
            raise IndexError("Index out of range.")
        memo_filepath=self.df['filepath'][idx]
        with open(memo_filepath,'r',encoding='utf-8')as f:
            memo_content=f.read()
        memo_content=re.sub(r'[^\w\s]', '', memo_content)  
        
        encoding = tokenizer.encode_plus(
            memo_content,
            add_special_tokens=True,
            max_length=500,
            return_tensors='pt',
            padding='max_length',
            truncation=True
        )
        
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        token_type_ids = encoding['token_type_ids']

        # 将类别保存为y_train，数据类型为字符串
        y_train = self.df['label'][idx]
        
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_type_ids': token_type_ids, 'label': y_train}
train_ds=motionDataset('./train','./motion.csv')
len(train_ds)

25000

In [2]:
# 对训练集做分割处理，便于后续训练测试   random_split
from torch.utils.data import random_split
num_sample=len(train_ds)
train_percent=0.8
train_num=int(train_percent*num_sample)
test_num=num_sample-train_num
train_ds1,train_ds2=random_split(train_ds,[train_num,test_num])
print(len(train_ds1),len(train_ds2))

# 批处理函数，由于各个文本长不同，分别使用0进行填充，同时label转换成tensor类型数据
def collate_fn(data):
    # 按input_ids长度进行降序排序，提高填充效率
    data.sort(key=lambda x: len(x['input_ids']), reverse=True)

    max_length = 500  # 设置一个最大长度，根据模型的最大长度进行调整，同时把数据处理成模型需要数据的形状

    input_ids = [item['input_ids'][:, :max_length] for item in data]  # 截断或填充到相同的长度
    attention_mask = [item['attention_mask'][:, :max_length] for item in data]
    token_type_ids = [item['token_type_ids'][:, :max_length] for item in data] 
    labels = [item['label'] for item in data]

    # Pad sequences after sorting   batch_first表示张量输出维度是第一个维度   padding_value表示用0填充     squeeze表示去除不需要的维度
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0).squeeze(1)
    padded_attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0).squeeze(1)
    padded_token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0).squeeze(1)

    # Convert labels to one-dimensional tensor，后续模型需要long类型的数据，将label转化为long的tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # return {'input_ids': padded_input_ids, 'attention_mask': padded_attention_mask,
    #         'token_type_ids': padded_token_type_ids, 'label': labels}   要直接返回新数据名称
    return padded_input_ids, padded_attention_mask, padded_token_type_ids, labels      
    
# 重写dataloader，定义批次大小，collate_fn函数，打乱
train1_dataloader=DataLoader(train_ds1,batch_size=64,shuffle=True,collate_fn=collate_fn)
train2_dataloader=DataLoader(train_ds2,batch_size=64,shuffle=True,collate_fn=collate_fn)

# def collate_fn(data):

20000 5000


In [3]:
# 加载字典和分词工具
from transformers import BertModel

#加载预训练模型，迁移学习，使用一个在大规模文本数据上预训练的模型
pretrained = BertModel.from_pretrained('bert-base-uncased').to("cuda:0")

#不训练,不需要计算梯度：使用fun_tuning,冻结预训练模型参数，只训练下游任务模型
for param in pretrained.parameters():
    param.requires_grad_(False)

In [4]:
#定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self,pretrained):
        # 这里要把pretrained嵌入
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)
        # 全连接神经网络，单层网络模型，二分类

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
            # 用训练模型做计算，将抽取的特征放到全连接神经网络中

        out = self.fc(out.last_hidden_state[:, 0])
        # 取零和bert设计有关

        out = out.softmax(dim=1)

        return out


model = Model(pretrained)
if torch.cuda.is_available():
    model = model.to("cuda:0")
# 模型实例化
for param in model.parameters():
    print(param.device)

cuda:0
cuda:0


In [None]:
from transformers import AdamW
torch.cuda.set_device("cuda:0")
#训练
optimizer = AdamW(model.parameters(), lr=0.00005)
# 交叉熵计算损失，做梯度下降处理
criterion = torch.nn.CrossEntropyLoss().to("cuda:0")
model.train()
num_epochs = 10

for epoch in range(num_epochs):
    print(epoch)
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train1_dataloader):
        input_ids=input_ids.to("cuda:0")   
        attention_mask=attention_mask.to("cuda:0")   
        token_type_ids=token_type_ids.to("cuda:0")   
        labels=labels.to("cuda:0")
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)   
        optimizer.zero_grad()
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)

        if i == 312:
            print(i, loss.item(), accuracy)
            break

0




312 0.6149067878723145 0.734375
1


In [None]:
test_dataloader=DataLoader(test_ds,batch_size=250,shuffle=True,collate_fn=collate_fn)
def test():
    model.eval()
    correct = 0
    total = 0

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(test_dataloader):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

In [None]:
#附1，dataset部分
        
        # tokens=tokenizer.tokenize(memo_content)  可以分割文本，做分词处理：eg：unpretraining=>un+##pretrain+##ing
        # tokens=['CLS']+tokens+['SEP']

        # 文本长度处理1：
        
        # # 设置目标向量的长度
        # target_length = 500
        # # 如果当前向量长度不足，使用 ['PAD'] 进行填充
        # if current_length < target_length:
        #     padding_tokens = ['[PAD]'] * (target_length - current_length)
        #     tokens += padding_tokens
        # token_ids = tokenizer.convert_tokens_to_ids(tokens)   tokenizer.convert_tokens_to ids将单词映射成不同id
        # token_ids = torch.tensor(token_ids).unsqueeze(0)     降维，unsqueeze把获得的token_ids转化为需要的形状
        # attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]    把文本部分设为1，空白部分设为0
        # attention_mask = torch.tensor(attention_mask).unsqueeze(0)
        # 把tokens_id和attention_mask转化为tensor向量
        # # 将该数据保存为x_tarin
        # y_train=self.df['label'][idx]
        # 将类别保存为y_train，数据类型为字符串
        # return tokens,token_ids,attention_mask,y_train
        # return {'token_ids':token_ids,'attention_mask':attention_mask,'label':y_train  以字典序格式返回需要的数据
        
        # 分词与编码：使用BertTokenizer的encode_plus方法获取token_ids、attention_mask和token_type_ids

# 附2 调试部分，打印获取一个批次的attention_mask.token_ids,token_type_ids    将collate_fn返回值修改之后报错？
# for i, batch in enumerate(train_dataloader):           
#     input_ids = batch['input_ids']
#     attention_mask = batch['attention_mask']
#     token_type_ids = batch['token_type_ids']
#     y_train = batch['label']

#     print(input_ids)
#     print(attention_mask)
#     print(token_type_ids)
#     print(y_train)
#     break
# print(len(train_dataloader))
# input_ids.shape, attention_mask.shape,token_type_ids.shape,y_train.shape

# 附3  数据转化
# 1.
# attention_mask=[1 if i!='[PAD]' else 0 for i in tokens]
# attention_mask
# 2.
# tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
# tokens_ids
# 3.
# numpy_info_array = np.array(info_array)  
# print(numpy_info_array.shape)
# 4.
# df=pd.DataFrame(numpy_info_array,columns=col)
# df.to_csv('./motion.csv',encoding='utf-8')
# 5.
# tokens_ids=torch.tensor(tokens_ids).unsqueeze(0)
# attention_mask=torch.tensor(attention_mask).unsqueeze(0)

# 附4  使用transformers从所有的编码器层中抽取嵌入表示。
# from transformers import BertModel, BertTokenizer
# import torch
# import torch.nn as nn
# model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# sentence = 'I love Paris'
# tokens = tokenizer.tokenize(sentence)
# tokens = ['[CLS]'] + tokens + ['[SEP]']
# tokens = tokens + ['[PAD]'] + ['[PAD]']
# attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]
# token_ids = tokenizer.convert_tokens_to_ids(tokens)
# token_ids = torch.tensor(token_ids).unsqueeze(0)
# attention_mask = torch.tensor(attention_mask).unsqueeze(0)
# last_hidden_state, pooler_output, hidden_states = model(token_ids, attention_mask = attention_mask, return_dict = False)
# 参数：last_hidden_state包含所有标记的嵌入表示，但是仅来自最后一个编码器层(encoder 12)
#       pooler_output代表从最后的编码器层得到的[CLS]标记对应的嵌入表示，但进一步地通过一个线性和tanh激活函数(BertPooler)处理。
#       hidden_states包含从所有编码器层得到的所有标记的嵌入表示
# 1.调用模型2.切分句子3.加前后缀4.提取attention_mask5.每个标记映射到对应的id6.转化成tensor
# class BertPooler(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
#         self.activation = nn.Tanh()

#     def forward(self, hidden_states):
#         # We "pool" the model by simply taking the hidden state corresponding
#         # to the first token.
#         first_token_tensor = hidden_states[:, 0]
#         # 线性
#         pooled_output = self.dense(first_token_tensor)
#         # tanh
#         pooled_output = self.activation(pooled_output)
#         return pooled_output
# last_hidden_state.shape
# # 它包含最后的编码器层得到的[CLS]标记对应的嵌入表示。我们打印它的形状：
# pooler_output.shape
# len(hidden_states)
# hidden_states。它是一个包含13个值的元组，保存了从输入层h0到最后一个编码器层h12的所有嵌入表示：
# 这样我们就可以得到所有编码器层的标记对应的嵌入表示
# hidden_states[0].shape