In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.tokenization_roberta import RobertaTokenizer

import pandas as pd
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class multi_mental_health(Dataset):
    """Map-style dataset of posts and class ids read from a CSV file.

    The CSV must provide a ``post`` column and a ``class_id`` column; a
    ``title`` column is additionally required when ``add_title`` is true.
    Indexing yields a ``(text, label)`` pair.

    Args:
        data_path: Location of the CSV, passed straight to ``pandas.read_csv``.
        add_title: When true, each text is the title immediately followed by
            the post body, and the combined text is also stored in a new
            ``title_post`` column of ``self.data``. When false, the text is
            the post body alone.

    Attributes:
        data: DataFrame returned by ``pandas.read_csv`` (plus ``title_post``
            when ``add_title`` is true).
        posts: List with one text per CSV row.
        labels: List with one ``class_id`` value per CSV row.
    """

    def __init__(self, data_path, add_title=True):
        super().__init__()
        self.data = pd.read_csv(data_path)

        # Empty CSV fields come back from read_csv as NaN (a float). Adding a
        # float to a str raises TypeError, so missing text is normalised to ''
        # before any concatenation or hand-off to a tokenizer.
        post_text = self.data['post'].fillna('').astype(str)

        if add_title:
            title_text = self.data['title'].fillna('').astype(str)
            # Vectorised concatenation replaces the former row-wise apply().
            # NOTE(review): title and post are joined with no separator, which
            # is what the previous implementation did - confirm that is intended.
            self.data['title_post'] = title_text + post_text
            self.posts = self.data['title_post'].tolist()
        else:
            self.posts = post_text.tolist()

        self.labels = self.data['class_id'].tolist()

    def __len__(self):
        """Return the number of rows (one label per row)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair stored at ``index``."""
        return self.posts[index], self.labels[index]


In [3]:
def building_dataloader_mental_health(train_path, tokenizer, batch_size, pad_size, device, add_title,
                                      shuffle=True, drop_last=True):
    """Build a DataLoader that yields tokenized batches from a posts CSV.

    Args:
        train_path: CSV path forwarded to ``multi_mental_health``.
        tokenizer: Object exposing ``batch_encode_plus``; it is called with
            ``return_tensors='pt'`` and must return a mapping of tensors.
        batch_size: Number of samples per batch.
        pad_size: Length every sequence is truncated / padded to
            (``padding='max_length'``).
        device: Device the encoded tensors and the labels are moved to inside
            the collate function.
        add_title: Forwarded to ``multi_mental_health``.
        shuffle: Forwarded to ``DataLoader``. Defaults to ``True``, the value
            that used to be hard-coded.
        drop_last: Forwarded to ``DataLoader``. Defaults to ``True``, the
            value that used to be hard-coded; pass ``False`` when every
            sample must be visited (e.g. when extracting embeddings).

    Returns:
        A ``DataLoader`` whose batches are ``(inputs, labels, posts)`` tuples.
    """

    def collate_fn(data):
        """Turn a list of ``(post, label)`` samples into one model-ready batch.

        Args:
            data: List of ``(post, label)`` pairs drawn from the dataset.

        Returns:
            ``(inputs, labels, posts)``: the tokenizer output with every
            tensor moved to ``device``, the labels as a long tensor on
            ``device``, and the untouched list of post texts.
        """
        posts = [sample[0] for sample in data]
        labels = [sample[1] for sample in data]

        # Encode the whole batch at once; every row is truncated / padded to
        # exactly pad_size tokens.
        inputs = tokenizer.batch_encode_plus(batch_text_or_text_pairs=posts,
                                             truncation=True,
                                             padding='max_length',
                                             max_length=pad_size,
                                             return_tensors='pt')

        # input_ids: the token ids produced by the encoding.
        # attention_mask: 0 at padded positions, 1 everywhere else.
        for key in inputs:
            inputs[key] = inputs[key].to(device)

        labels = torch.tensor(labels, dtype=torch.long, device=device)

        return (inputs, labels, posts)

    dataset_train = multi_mental_health(train_path, add_title)

    train_loader = DataLoader(dataset=dataset_train,
                              batch_size=batch_size,
                              collate_fn=collate_fn,
                              shuffle=shuffle,
                              drop_last=drop_last)

    return train_loader

In [4]:
# Location of the pretrained RoBERTa checkpoint (placeholder value).
roberta_path = 'roberta地址'

# Pick the device once and share it between the data loader and the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained(roberta_path)
loader_embed = building_dataloader_mental_health(
    train_path='./train_self_harm.csv',
    tokenizer=tokenizer,
    batch_size=12,
    pad_size=512,
    device=device,
    add_title=False,
)

roberta = RobertaModel.from_pretrained(roberta_path)
roberta.to(device)

# Smoke test: encode only the first batch, keep it in `y` for inspection,
# and keep its pooled output in `embeddings`.
for x in loader_embed:
    y = x
    with torch.no_grad():
        model_output = roberta(**x[0])
        embeddings = model_output.pooler_output
    break

Some weights of the model checkpoint at roberta地址 were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Shape of the pooled output computed for the first batch. The saved output
# below reads [12, 768]; the leading 12 matches the batch_size=12 used above.
embeddings.shape

torch.Size([12, 768])

In [9]:
# Display the first batch kept in `y`: tokenizer tensors, label tensor and
# the raw post texts.
# NOTE(review): this writes raw post text into the saved notebook output;
# consider clearing the output or showing only shapes before sharing.
y

({'input_ids': tensor([[    0,   787,   295,  ...,     1,     1,     1],
         [    0,  5179,   139,  ...,     1,     1,     1],
         [    0,   787,  8978,  ...,     1,     1,     1],
         ...,
         [    0,  4070,   122,  ...,     1,     1,     1],
         [    0, 15183,  4832,  ...,     1,     1,     1],
         [    0,   118,    56,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')},
 tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0], device='cuda:0'),
 [" @ n1ghtw1ng ( gosh i'm replying to you a lot this morning ! ) can i just say i'm glad i'm not the only one who says 'you' when i'm trying to comfort/pep talk myself. i could relate to some of your thoughts on gift giving and sitting alone in restaurants - your positives

In [15]:
# Display the batch kept in `y` again.
# NOTE(review): the saved output below is a 2-tuple (inputs, labels), while
# collate_fn returns (inputs, labels, posts) - the output looks stale and was
# presumably produced by a different version of the code; re-run from a fresh
# kernel to confirm.
y

({'input_ids': tensor([[    0, 24810, 29965,  ...,     1,     1,     1],
         [    0,  4070,   122,  ...,     1,     1,     1],
         [    0,   605, 42557,  ...,     1,     1,     1],
         ...,
         [    0, 17232, 19417,  ...,     1,     1,     1],
         [    0, 17232, 27785,  ...,     1,     1,     1],
         [    0,  2050,  3628,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')},
 tensor([0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0'))