# 加载数据

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertForMaskedLM
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader,RandomSampler,DistributedSampler
import os
from random import shuffle

In [2]:
# Number of examples per mini-batch; passed to the DataLoader and also used to
# derive the gradient-accumulation window further down in the notebook.
batch_size = 16
# Length (in tokens) that every text is padded or truncated to by the tokenizer.
max_length = 512


def load_data(path):
    """Split a UTF-8 text file into paragraphs separated by blank lines.

    A "blank line" is a line consisting of exactly one newline character;
    lines that contain only spaces are treated as ordinary content.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of strings, one per paragraph, in file order. Each paragraph
        keeps the newline characters of the lines it was built from. Runs of
        consecutive blank lines never produce empty paragraphs.
    """
    paragraphs = []
    pending = []  # lines of the paragraph currently being collected
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line == '\n':
                # A separator closes the current paragraph, if there is one.
                if pending:
                    paragraphs.append("".join(pending))
                    pending = []
            else:
                pending.append(raw_line)
    # The file may end without a trailing blank line.
    if pending:
        paragraphs.append("".join(pending))
    return paragraphs
    
# Load the corpus.
# df = load_data('./data.txt')  # alternative source: a plain-text file with blank-line-separated paragraphs

data_frame = pd.read_csv('../Toshuoge/test_node_output/test_corpus_data.csv', encoding='utf-8')


# NOTE: despite its name, `df` is a plain Python list holding the values of the
# 'context' column, not a DataFrame. Later cells consume it under this name.
df = data_frame['context'].to_list()
# Last expression of the cell: displays the number of texts loaded.
len(df)




454

In [4]:
# Data preparation: turn unlabeled text into masked-language-modeling (MLM) tensors.
def preprocess_mlm_data(text_data, tokenizer, max_length, mlm_probability=0.15):
    """Tokenize raw texts and apply random [MASK] corruption for MLM training.

    Args:
        text_data: List of raw strings to tokenize.
        tokenizer: Tokenizer that is callable with
            ``(texts, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            and exposes ``cls_token_id``, ``sep_token_id``, ``pad_token_id`` and
            ``mask_token_id``.
        max_length: Every sequence is padded/truncated to this many tokens.
        mlm_probability: Chance that a non-special token is replaced by the
            mask token. Defaults to 0.15, the rate that was previously
            hard-coded.

    Returns:
        The tokenizer output with two changes:
          * ``input_ids`` has the selected positions overwritten in place with
            ``tokenizer.mask_token_id``;
          * a new ``labels`` entry holds the original token id at masked
            positions and -100 everywhere else.
    """
    inputs = tokenizer(text_data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = inputs["input_ids"]
    labels = input_ids.detach().clone()

    # Draw the candidate positions, then exclude special tokens. The ids are
    # taken from the tokenizer instead of hard-coding one vocabulary's values.
    mask_arr = torch.rand(input_ids.shape) < mlm_probability
    special_ids = (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id)
    for special_id in special_ids:
        if special_id is not None:
            mask_arr &= input_ids != special_id

    # -100 is the index the masked-LM cross-entropy loss ignores, so only the
    # masked positions contribute; without this the loss also covers every
    # unmasked token and all of the padding.
    labels[~mask_arr] = -100
    input_ids[mask_arr] = tokenizer.mask_token_id

    inputs["labels"] = labels
    return inputs

model_name = "bert-base-uncased"
# Tokenize and mask the corpus, then wrap the tensors in a DataLoader.
tokenizer = BertTokenizer.from_pretrained(model_name)
# `df` is the list of unlabeled texts loaded above. Masking is applied once
# here, so every epoch sees the same masked positions.
inputs = preprocess_mlm_data(df, tokenizer, max_length)
# Each dataset item is the tuple (input_ids, attention_mask, labels).
dataset = torch.utils.data.TensorDataset(inputs["input_ids"], inputs["attention_mask"], inputs["labels"])
# RandomSampler reshuffles the examples on every pass over the DataLoader.
train_sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)

# 初始化模型

In [5]:
# Pretrained masked-LM checkpoint ("bert-large-uncased" is the larger alternative).
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Make the whole BERT backbone trainable ...
model.bert.requires_grad_(True)

# ... then freeze the embedding layer again.
for param_name, weight in model.named_parameters():
    if param_name.startswith('bert.embeddings'):
        weight.requires_grad_(False)

device = torch.device("cpu")

# Kept as the last expression so the cell still displays the module tree.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [6]:
num_epochs = 10  # 300 for a full run

lr_base = 5e-5
weight_decay = 0.01
# Gradient accumulation: number of mini-batches whose gradients are summed
# before one optimizer update.
# NOTE(review): with 454 texts and batch_size 16 an epoch has about 29 batches,
# fewer than this window of 32, so a loop that restarts its accumulation
# counter every epoch never reaches an update.
accumulation_steps = batch_size*2

optimizer = AdamW(model.parameters(), lr=lr_base, betas=(0.9, 0.999), weight_decay=weight_decay)

num_examples = len(df)
# Count batches from the DataLoader itself: it keeps the final partial batch,
# which flooring num_examples / batch_size would drop.
batches_per_epoch = len(dataloader)
total_batches = batches_per_epoch * num_epochs
# Ceil division so a trailing, incomplete accumulation window is counted too;
# at least 1 so the schedule never has zero length.
num_training_steps = max(1, -(-total_batches // accumulation_steps))
num_warmup_steps = int(num_training_steps * 0.1)

# Warm-up and decay are measured in scheduler.step() calls, i.e. one per
# optimizer update (not one per mini-batch).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [7]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('tf-logs')

# Training loop.
# `num_epochs` is taken from the optimizer/scheduler cell so the loop length
# cannot drift away from the schedule length.
batches_seen = 0  # mini-batches processed across ALL epochs
j = 1             # optimizer-update counter, x-axis of the "Lr/step" curve

model.train()
optimizer.zero_grad()  # drop gradients left behind by an earlier, interrupted run
try:
    for epoch in range(num_epochs):
        loss_sum = 0.0
        batch_count = 0
        for batch in dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()
            batch_count += 1
            # Scale so the summed gradients of one window equal their mean.
            (outputs.loss / accumulation_steps).backward()

            # The window is counted across epochs: a per-epoch counter never
            # reaches accumulation_steps when an epoch has fewer batches than
            # that (here ~29 batches vs. a window of 32), so no update would
            # ever be applied.
            batches_seen += 1
            if batches_seen % accumulation_steps == 0:
                writer.add_scalar("Lr/step", optimizer.param_groups[0]['lr'], j)
                j += 1
                optimizer.step()        # apply the accumulated gradients
                optimizer.zero_grad()   # reset for the next window
                scheduler.step()
        torch.cuda.empty_cache()

        # Report the mean unscaled loss of the epoch instead of the last
        # batch's loss divided by accumulation_steps.
        epoch_loss = loss_sum / max(batch_count, 1)
        writer.add_scalar("Loss/train", epoch_loss, epoch+1)
        writer.add_scalar("Lr/train", optimizer.param_groups[0]['lr'], epoch+1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

        num_groups = len(optimizer.param_groups)
        print(num_groups)
        for group_idx in range(num_groups):
            layer_lr = optimizer.param_groups[group_idx]['lr']
            print("Layer {} learning rate: {}".format(group_idx, layer_lr))

    # Apply whatever is left in a trailing, incomplete accumulation window so
    # those batches are not silently discarded (their gradient is scaled by
    # the full window size, so this last update is proportionally smaller).
    if batches_seen % accumulation_steps != 0:
        writer.add_scalar("Lr/step", optimizer.param_groups[0]['lr'], j)
        j += 1
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    # Save the fine-tuned model together with the optimizer state.
    state_dict = {"model":model.state_dict(),"optimizer":optimizer.state_dict(),"epoch":epoch+1}
    name = f"withintrain/model_withintrain_{epoch}.pth"  # path of the final checkpoint
    os.makedirs(os.path.dirname(name), exist_ok=True)    # torch.save does not create directories
    torch.save(state_dict,name)
finally:
    # Flush and close the TensorBoard log even when training is interrupted.
    writer.close()

KeyboardInterrupt: 

In [8]:
#  # load data
# dtf_mlm = pd.read_csv('news-adaptive-tuning_dataset.csv')
# #dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})

# # Train/Valid Split
# df_train, df_valid = train_test_split(
#     dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
# )

# len(df_train), len(df_valid)

# # Convert to Dataset object
# train_dataset = Dataset.from_pandas(df_train[['text']].dropna())
# valid_dataset = Dataset.from_pandas(df_valid[['text']].dropna())

In [16]:
 # load data
# NOTE(review): this cell uses `SEED_SPLIT` and expects `Dataset` to be
# `datasets.Dataset`; both are only provided by cells that appear further down
# in the notebook, so it fails on a fresh top-to-bottom run.
dtf_mlm = pd.read_csv('../Toshuoge/test_node_output/test_corpus_data.csv', encoding='utf-8')
#dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})

# The first cell reads this CSV through its 'context' column, while the code
# below selects 'text'. Expose 'context' under the expected name when the file
# has no 'text' column of its own.
if 'text' not in dtf_mlm.columns and 'context' in dtf_mlm.columns:
    dtf_mlm = dtf_mlm.rename(columns={"context": "text"})

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

len(df_train), len(df_valid)

# Convert to Dataset object (rows with a missing text are dropped first)
train_dataset = Dataset.from_pandas(df_train[['text']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['text']].dropna())

In [10]:
# !pip install -q transformers
# !pip install -q datasets

import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer

In [11]:
# HYPERPARAMS
# Seeds: SEED_SPLIT is the random_state of the train/validation split,
# SEED_TRAIN is the seed handed to TrainingArguments.
SEED_SPLIT = 0
SEED_TRAIN = 0

# Token length used when tokenizing (padding/truncation target).
MAX_SEQ_LEN = 128
# Per-device batch sizes passed to TrainingArguments.
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
# Optimizer settings passed to TrainingArguments.
LEARNING_RATE = 2e-5 
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

In [12]:
# load data
# dtf_mlm = pd.read_csv('news-adaptive-tuning_dataset.csv')
#dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})
# Build the frame from `df`, the in-memory list of texts created in the
# data-loading cell, as a single 'text' column.
data_dict = {"text":df}
dtf_mlm = pd.DataFrame(data_dict)


# Train/Valid Split (15% held out, reproducible through SEED_SPLIT)
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

# Not the last expression of the cell, so this tuple is computed but not displayed.
len(df_train), len(df_valid)

# Convert to Dataset object (rows with a missing text are dropped first)
train_dataset = Dataset.from_pandas(df_train[['text']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['text']].dropna())

In [17]:
'''
bert-base-uncased  # 12-layer, 768-hidden, 12-heads, 109M parameters
distilbert-base-uncased  # 6-layer, 768-hidden, 12-heads, 65M parameters
'''

MODEL = 'bert'
bert_type = 'bert-base-uncased'

# Model family -> (tokenizer class, masked-LM model class).
_MLM_CLASSES = {
    'distilbert': (DistilBertTokenizer, DistilBertForMaskedLM),
    'bert': (BertTokenizer, BertForMaskedLM),
    'roberta': (RobertaTokenizer, RobertaForMaskedLM),
    'scibert': (AutoTokenizer, AutoModelForMaskedLM),
}

# Fail with a clear message instead of a NameError further down when MODEL is
# not one of the supported families.
if MODEL not in _MLM_CLASSES:
    raise ValueError(
        "Unsupported MODEL {!r}; expected one of {}".format(MODEL, sorted(_MLM_CLASSES))
    )
TokenizerClass, ModelClass = _MLM_CLASSES[MODEL]

# `do_lower_case` is intentionally not overridden: an uncased checkpoint has a
# lower-cased vocabulary, so forcing do_lower_case=False would turn capitalized
# words into unknown/fragmented tokens. The checkpoint's own setting is used.
# `model_max_length` is the documented name of the legacy `max_len` argument.
tokenizer = TokenizerClass.from_pretrained(
            bert_type, use_fast=True, model_max_length=MAX_SEQ_LEN
            )
# NOTE: this rebinds `model`, replacing any model object created by earlier cells.
model = ModelClass.from_pretrained(bert_type)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
def tokenize_function(row):
    """Tokenize the 'text' field of a row (or batch of rows).

    Uses the notebook-level `tokenizer` and pads/truncates to `MAX_SEQ_LEN`.
    The special-tokens mask is requested as part of the returned encoding.

    Args:
        row: Mapping with a 'text' entry holding the string(s) to encode.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_SEQ_LEN,
        'return_special_tokens_mask': True,
    }
    text = row['text']
    return tokenizer(text, **encode_options)
  
def _tokenize_split(split):
    """Tokenize one dataset split, replacing its original columns.

    A split without a 'text' column is returned unchanged, so re-running this
    cell after the datasets were already tokenized does not fail.
    """
    if 'text' not in split.column_names:
        return split
    return split.map(
        tokenize_function,
        batched=True,
        num_proc=multiprocessing.cpu_count(),
        # Drop this split's own columns rather than assuming it has the same
        # ones as the training split.
        remove_columns=split.column_names,
    )


# Kept for any later use: column names of the training split before mapping.
column_names = train_dataset.column_names

train_dataset = _tokenize_split(train_dataset)
valid_dataset = _tokenize_split(valid_dataset)

Map (num_proc=8):   0%|          | 0/385 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/69 [00:00<?, ? examples/s]

In [19]:
# Dynamic masking: the collator picks 15% of the tokens anew for each batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

NUM_TRAIN_EPOCHS = 2

# Ceil division: the final, smaller batch of an epoch still counts as a step.
# (Assumes a single device and no gradient accumulation - TODO confirm.)
steps_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

# Cap warm-up at 10% of the run. With a small corpus the configured
# LR_WARMUP_STEPS can exceed the total number of steps, in which case the
# learning rate would still be ramping up when training ends and never reach
# LEARNING_RATE.
warmup_steps = min(LR_WARMUP_STEPS, max(1, total_train_steps // 10))

training_args = TrainingArguments(
    output_dir='./bert-news',
    logging_dir='./LMlogs',             
    num_train_epochs=NUM_TRAIN_EPOCHS,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=warmup_steps,
    save_steps=steps_per_epoch,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE, 
    # Evaluate and checkpoint once per epoch; the checkpoint with the lowest
    # evaluation loss is reloaded when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss', 
    greater_is_better=False,
    seed=SEED_TRAIN,
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("your_path/model") #save your custom model

TypeError: forward() got an unexpected keyword argument 'labels'