In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# 读取数据集
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

In [2]:
train_data

Unnamed: 0,0,1
0,还有双鸭山到淮阴的汽车票吗13号的,Travel-Query
1,从这里怎么回家,Travel-Query
2,随便播放一首专辑阁楼里的佛里的歌,Music-Play
3,给看一下墓王之王嘛,FilmTele-Play
4,我想看挑战两把s686打突变团竞的游戏视频,Video-Play
...,...,...
12095,一千六百五十三加三千一百六十五点六五等于几,Calendar-Query
12096,稍小点客厅空调风速,HomeAppliance-Control
12097,黎耀祥陈豪邓萃雯畲诗曼陈法拉敖嘉年杨怡马浚伟等到场出席,Radio-Listen
12098,百事盖世群星星光演唱会有谁,Video-Play


In [5]:
from datasets import Dataset

# pandas dataframe创建datasets.Dataset
train_data.columns = ["document", "summary"]
train_ds = Dataset.from_pandas(train_data.iloc[:-2000])
eval_ds = Dataset.from_pandas(train_data.iloc[-2000:])

train_ds

Dataset({
    features: ['document', 'summary'],
    num_rows: 10100
})

In [6]:
train_ds[0]

{'document': '还有双鸭山到淮阴的汽车票吗13号的', 'summary': 'Travel-Query'}

In [7]:
from transformers import T5Tokenizerk

pretrained_model = "/home/lyz/hf-models/IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese/"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)

ImportError: cannot import name 'T5Tokenizerk' from 'transformers' (D:\Python\envs\torch\lib\site-packages\transformers\__init__.py)

In [11]:
header_prefix = "意图识别任务: "
tail_predix = " " + "/".join(train_data['summary'].unique())

tail_predix

' Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other'

In [12]:
max_input_length = 60
max_target_length = 20

def preprocess_function(examples):
    """数据处理
    """
    # 把porefix和text处理为input
    model_inputs = tokenizer(header_prefix + examples['document'], 
                             max_length=max_input_length,
                             trunvation=True)
    
    # 把label处理为target
    labels = tokenizer(text_target=examples['summary'],
                      max_length=max_target_length,
                      truncation=True)
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [13]:
train_tokenized_id = train_ds.map(preprocess_function, remove_columns=train_ds.column_names)
eval_tokenized_id = eval_ds.map(preprocess_function, remove_columns=eval_ds.column_names)

Map:   0%|          | 0/10100 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments

batch_size = 4
args = Seq2SeqTrainingArguments(
    "t5-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_gpu_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,              # 指定同时保存的模型检查点的最大数量
    gradient_accumulation_steps=10,  # 梯度累积步数，以适应较小内存环境
    do_eval=True,
    evaluation_strategy="steps",    # 评估模型，可以按照epoch或者每n步评估一次
    eval_steps=50,
    num_train_epochs=5,
    save_steps=50,
    save_on_each_node=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True  # True：在训练结束时将加载评价指标最好的模型
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_id,
    eval_dataset=eval_tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

In [None]:
trainer.train()

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# tokenize
text = "意图识别任务: 还有双鸭山到淮阴的汽车票吗13号的"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
  }

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
  # early_stopping=True,
  )

logits=logits[:,1:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
print(predict_label)

In [None]:
from tqdm import tqdm_notebook

model.eval()

pred_label = []
for train_text in tqdm_notebook(test_data[0].values):
    text = f"意图识别任务: {train_text}"
    encode_dict = tokenizer(text, max_length=512, padding='max_length', truncation=True)
    
    inputs = {
      "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
      "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
    }
    
    # generate answer
    logits = model.generate(
        input_ids = inputs['input_ids'],
        max_length=100, 
        do_sample= True
    )
    
    logits = logits[:,:]
    pred_label += [tokenizer.decode(i,skip_special_tokens=True) for i in logits]

In [None]:
pd.DataFrame({
    'ID': range(1, len(pred_label) + 1),
    'Target': pred_label,
}).to_csv('nlp_submit.csv', index=None)

In [None]:
pred_label[:10]